{ "best_global_step": 5500, "best_metric": 5.37880277633667, "best_model_checkpoint": "checkpoints_phoneme_tokenizer/checkpoint-5500", "epoch": 140.53108026554014, "eval_steps": 500, "global_step": 116500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012070006035003017, "grad_norm": 35.80157470703125, "learning_rate": 1.8000000000000001e-06, "loss": 11.4697, "step": 10 }, { "epoch": 0.024140012070006035, "grad_norm": 11.402566909790039, "learning_rate": 3.8000000000000005e-06, "loss": 10.2113, "step": 20 }, { "epoch": 0.036210018105009054, "grad_norm": 12.09052562713623, "learning_rate": 5.8e-06, "loss": 9.5479, "step": 30 }, { "epoch": 0.04828002414001207, "grad_norm": 5.4004807472229, "learning_rate": 7.800000000000002e-06, "loss": 9.3316, "step": 40 }, { "epoch": 0.060350030175015085, "grad_norm": 6.185907363891602, "learning_rate": 9.800000000000001e-06, "loss": 9.1945, "step": 50 }, { "epoch": 0.07242003621001811, "grad_norm": 3.9841644763946533, "learning_rate": 1.18e-05, "loss": 9.0727, "step": 60 }, { "epoch": 0.08449004224502112, "grad_norm": 2.649885892868042, "learning_rate": 1.38e-05, "loss": 8.9948, "step": 70 }, { "epoch": 0.09656004828002414, "grad_norm": 2.9412100315093994, "learning_rate": 1.58e-05, "loss": 8.8836, "step": 80 }, { "epoch": 0.10863005431502716, "grad_norm": 3.387864589691162, "learning_rate": 1.7800000000000002e-05, "loss": 8.8105, "step": 90 }, { "epoch": 0.12070006035003017, "grad_norm": 2.5590574741363525, "learning_rate": 1.98e-05, "loss": 8.7116, "step": 100 }, { "epoch": 0.1327700663850332, "grad_norm": 2.837214469909668, "learning_rate": 1.9999999782870688e-05, "loss": 8.6411, "step": 110 }, { "epoch": 0.14484007242003621, "grad_norm": 3.53305983543396, "learning_rate": 1.9999999541615894e-05, "loss": 8.6194, "step": 120 }, { "epoch": 0.15691007845503924, "grad_norm": 5.364113807678223, "learning_rate": 1.99999993003611e-05, "loss": 8.5655, "step": 
130 }, { "epoch": 0.16898008449004223, "grad_norm": 3.8617589473724365, "learning_rate": 1.9999999059106307e-05, "loss": 8.5766, "step": 140 }, { "epoch": 0.18105009052504525, "grad_norm": 2.729674816131592, "learning_rate": 1.9999998817851513e-05, "loss": 8.5625, "step": 150 }, { "epoch": 0.19312009656004828, "grad_norm": 2.4414279460906982, "learning_rate": 1.9999998576596716e-05, "loss": 8.5483, "step": 160 }, { "epoch": 0.2051901025950513, "grad_norm": 2.1394906044006348, "learning_rate": 1.9999998335341922e-05, "loss": 8.5318, "step": 170 }, { "epoch": 0.21726010863005432, "grad_norm": 2.384667158126831, "learning_rate": 1.999999809408713e-05, "loss": 8.5186, "step": 180 }, { "epoch": 0.22933011466505734, "grad_norm": 2.5800416469573975, "learning_rate": 1.9999997852832335e-05, "loss": 8.5329, "step": 190 }, { "epoch": 0.24140012070006034, "grad_norm": 2.6168289184570312, "learning_rate": 1.999999761157754e-05, "loss": 8.4888, "step": 200 }, { "epoch": 0.25347012673506336, "grad_norm": 2.8624589443206787, "learning_rate": 1.9999997370322747e-05, "loss": 8.4729, "step": 210 }, { "epoch": 0.2655401327700664, "grad_norm": 2.2759859561920166, "learning_rate": 1.9999997129067953e-05, "loss": 8.4918, "step": 220 }, { "epoch": 0.2776101388050694, "grad_norm": 2.3907318115234375, "learning_rate": 1.999999688781316e-05, "loss": 8.4566, "step": 230 }, { "epoch": 0.28968014484007243, "grad_norm": 2.311438798904419, "learning_rate": 1.9999996646558366e-05, "loss": 8.4958, "step": 240 }, { "epoch": 0.30175015087507545, "grad_norm": 3.734734535217285, "learning_rate": 1.9999996405303572e-05, "loss": 8.4011, "step": 250 }, { "epoch": 0.3138201569100785, "grad_norm": 10.149917602539062, "learning_rate": 1.9999996164048778e-05, "loss": 8.3085, "step": 260 }, { "epoch": 0.3258901629450815, "grad_norm": 3.0872750282287598, "learning_rate": 1.9999995922793984e-05, "loss": 8.2189, "step": 270 }, { "epoch": 0.33796016898008446, "grad_norm": 2.7601139545440674, "learning_rate": 
1.999999568153919e-05, "loss": 8.0977, "step": 280 }, { "epoch": 0.3500301750150875, "grad_norm": 3.284580945968628, "learning_rate": 1.9999995440284397e-05, "loss": 7.9374, "step": 290 }, { "epoch": 0.3621001810500905, "grad_norm": 2.9990365505218506, "learning_rate": 1.9999995199029603e-05, "loss": 7.8959, "step": 300 }, { "epoch": 0.37417018708509353, "grad_norm": 3.21844744682312, "learning_rate": 1.999999495777481e-05, "loss": 7.8299, "step": 310 }, { "epoch": 0.38624019312009655, "grad_norm": 3.0272881984710693, "learning_rate": 1.9999994716520015e-05, "loss": 7.7384, "step": 320 }, { "epoch": 0.3983101991550996, "grad_norm": 3.775240182876587, "learning_rate": 1.999999447526522e-05, "loss": 7.6741, "step": 330 }, { "epoch": 0.4103802051901026, "grad_norm": 2.9841678142547607, "learning_rate": 1.9999994234010428e-05, "loss": 7.6424, "step": 340 }, { "epoch": 0.4224502112251056, "grad_norm": 3.6655266284942627, "learning_rate": 1.9999993992755634e-05, "loss": 7.6109, "step": 350 }, { "epoch": 0.43452021726010864, "grad_norm": 3.328448534011841, "learning_rate": 1.999999375150084e-05, "loss": 7.5188, "step": 360 }, { "epoch": 0.44659022329511167, "grad_norm": 3.805652618408203, "learning_rate": 1.9999993510246046e-05, "loss": 7.5503, "step": 370 }, { "epoch": 0.4586602293301147, "grad_norm": 3.0681569576263428, "learning_rate": 1.9999993268991253e-05, "loss": 7.5258, "step": 380 }, { "epoch": 0.47073023536511766, "grad_norm": 3.2476582527160645, "learning_rate": 1.999999302773646e-05, "loss": 7.4939, "step": 390 }, { "epoch": 0.4828002414001207, "grad_norm": 3.2614359855651855, "learning_rate": 1.9999992786481665e-05, "loss": 7.445, "step": 400 }, { "epoch": 0.4948702474351237, "grad_norm": 3.2094168663024902, "learning_rate": 1.999999254522687e-05, "loss": 7.4175, "step": 410 }, { "epoch": 0.5069402534701267, "grad_norm": 3.7619616985321045, "learning_rate": 1.9999992303972078e-05, "loss": 7.4249, "step": 420 }, { "epoch": 0.5190102595051298, "grad_norm": 
3.055659294128418, "learning_rate": 1.9999992062717284e-05, "loss": 7.4069, "step": 430 }, { "epoch": 0.5310802655401328, "grad_norm": 3.232218027114868, "learning_rate": 1.999999182146249e-05, "loss": 7.4012, "step": 440 }, { "epoch": 0.5431502715751357, "grad_norm": 3.4231531620025635, "learning_rate": 1.9999991580207696e-05, "loss": 7.3888, "step": 450 }, { "epoch": 0.5552202776101388, "grad_norm": 3.222292184829712, "learning_rate": 1.9999991338952902e-05, "loss": 7.357, "step": 460 }, { "epoch": 0.5672902836451418, "grad_norm": 3.320664644241333, "learning_rate": 1.999999109769811e-05, "loss": 7.3527, "step": 470 }, { "epoch": 0.5793602896801449, "grad_norm": 3.2404608726501465, "learning_rate": 1.9999990856443315e-05, "loss": 7.3578, "step": 480 }, { "epoch": 0.5914302957151478, "grad_norm": 3.174070119857788, "learning_rate": 1.999999061518852e-05, "loss": 7.3051, "step": 490 }, { "epoch": 0.6035003017501509, "grad_norm": 2.9915647506713867, "learning_rate": 1.9999990373933727e-05, "loss": 7.3263, "step": 500 }, { "epoch": 0.6035003017501509, "eval_loss": 7.311534881591797, "eval_runtime": 8.1916, "eval_samples_per_second": 85.087, "eval_steps_per_second": 10.743, "step": 500 }, { "epoch": 0.6155703077851539, "grad_norm": 3.0733962059020996, "learning_rate": 1.9999990132678933e-05, "loss": 7.3206, "step": 510 }, { "epoch": 0.627640313820157, "grad_norm": 3.124647855758667, "learning_rate": 1.999998989142414e-05, "loss": 7.3149, "step": 520 }, { "epoch": 0.6397103198551599, "grad_norm": 3.9316301345825195, "learning_rate": 1.9999989650169346e-05, "loss": 7.2823, "step": 530 }, { "epoch": 0.651780325890163, "grad_norm": 3.1440505981445312, "learning_rate": 1.9999989408914552e-05, "loss": 7.2933, "step": 540 }, { "epoch": 0.663850331925166, "grad_norm": 2.8404123783111572, "learning_rate": 1.999998916765976e-05, "loss": 7.2385, "step": 550 }, { "epoch": 0.6759203379601689, "grad_norm": 3.221916675567627, "learning_rate": 1.9999988926404965e-05, "loss": 7.2526, 
"step": 560 }, { "epoch": 0.687990343995172, "grad_norm": 2.8018670082092285, "learning_rate": 1.9999988685150167e-05, "loss": 7.2287, "step": 570 }, { "epoch": 0.700060350030175, "grad_norm": 3.086871385574341, "learning_rate": 1.9999988443895374e-05, "loss": 7.2253, "step": 580 }, { "epoch": 0.712130356065178, "grad_norm": 3.4184234142303467, "learning_rate": 1.999998820264058e-05, "loss": 7.2226, "step": 590 }, { "epoch": 0.724200362100181, "grad_norm": 3.6511592864990234, "learning_rate": 1.9999987961385786e-05, "loss": 7.2267, "step": 600 }, { "epoch": 0.7362703681351841, "grad_norm": 2.9182682037353516, "learning_rate": 1.9999987720130992e-05, "loss": 7.2124, "step": 610 }, { "epoch": 0.7483403741701871, "grad_norm": 3.844871759414673, "learning_rate": 1.99999874788762e-05, "loss": 7.2274, "step": 620 }, { "epoch": 0.7604103802051901, "grad_norm": 3.159799337387085, "learning_rate": 1.9999987237621405e-05, "loss": 7.1818, "step": 630 }, { "epoch": 0.7724803862401931, "grad_norm": 3.0129897594451904, "learning_rate": 1.999998699636661e-05, "loss": 7.2092, "step": 640 }, { "epoch": 0.7845503922751962, "grad_norm": 3.0193910598754883, "learning_rate": 1.9999986755111817e-05, "loss": 7.1825, "step": 650 }, { "epoch": 0.7966203983101992, "grad_norm": 3.144216299057007, "learning_rate": 1.9999986513857023e-05, "loss": 7.1564, "step": 660 }, { "epoch": 0.8086904043452021, "grad_norm": 3.0291526317596436, "learning_rate": 1.999998627260223e-05, "loss": 7.156, "step": 670 }, { "epoch": 0.8207604103802052, "grad_norm": 3.6699483394622803, "learning_rate": 1.9999986031347436e-05, "loss": 7.1381, "step": 680 }, { "epoch": 0.8328304164152082, "grad_norm": 3.2029929161071777, "learning_rate": 1.9999985790092642e-05, "loss": 7.0952, "step": 690 }, { "epoch": 0.8449004224502112, "grad_norm": 3.2381982803344727, "learning_rate": 1.9999985548837848e-05, "loss": 7.1413, "step": 700 }, { "epoch": 0.8569704284852142, "grad_norm": 3.3621699810028076, "learning_rate": 
1.9999985307583054e-05, "loss": 7.1052, "step": 710 }, { "epoch": 0.8690404345202173, "grad_norm": 2.9438650608062744, "learning_rate": 1.999998506632826e-05, "loss": 7.1517, "step": 720 }, { "epoch": 0.8811104405552203, "grad_norm": 3.2723660469055176, "learning_rate": 1.9999984825073467e-05, "loss": 7.12, "step": 730 }, { "epoch": 0.8931804465902233, "grad_norm": 2.9166128635406494, "learning_rate": 1.9999984583818673e-05, "loss": 7.1243, "step": 740 }, { "epoch": 0.9052504526252263, "grad_norm": 3.1118640899658203, "learning_rate": 1.999998434256388e-05, "loss": 7.1015, "step": 750 }, { "epoch": 0.9173204586602294, "grad_norm": 3.4652955532073975, "learning_rate": 1.9999984101309085e-05, "loss": 7.1036, "step": 760 }, { "epoch": 0.9293904646952323, "grad_norm": 3.730862617492676, "learning_rate": 1.999998386005429e-05, "loss": 7.0734, "step": 770 }, { "epoch": 0.9414604707302353, "grad_norm": 3.0955240726470947, "learning_rate": 1.9999983618799498e-05, "loss": 7.0827, "step": 780 }, { "epoch": 0.9535304767652384, "grad_norm": 2.899793863296509, "learning_rate": 1.9999983377544704e-05, "loss": 7.0959, "step": 790 }, { "epoch": 0.9656004828002414, "grad_norm": 3.4812543392181396, "learning_rate": 1.999998313628991e-05, "loss": 7.0768, "step": 800 }, { "epoch": 0.9776704888352444, "grad_norm": 3.1788957118988037, "learning_rate": 1.9999982895035117e-05, "loss": 7.093, "step": 810 }, { "epoch": 0.9897404948702474, "grad_norm": 3.4140141010284424, "learning_rate": 1.999998265378032e-05, "loss": 7.0532, "step": 820 }, { "epoch": 1.0012070006035003, "grad_norm": 3.3003287315368652, "learning_rate": 1.9999982412525526e-05, "loss": 7.031, "step": 830 }, { "epoch": 1.0132770066385033, "grad_norm": 3.456433057785034, "learning_rate": 1.9999982171270732e-05, "loss": 7.0419, "step": 840 }, { "epoch": 1.0253470126735063, "grad_norm": 3.56581974029541, "learning_rate": 1.9999981930015938e-05, "loss": 6.9839, "step": 850 }, { "epoch": 1.0374170187085094, "grad_norm": 
3.1069068908691406, "learning_rate": 1.9999981688761144e-05, "loss": 7.0171, "step": 860 }, { "epoch": 1.0494870247435124, "grad_norm": 2.8458690643310547, "learning_rate": 1.999998144750635e-05, "loss": 6.9946, "step": 870 }, { "epoch": 1.0615570307785154, "grad_norm": 3.2202632427215576, "learning_rate": 1.9999981206251557e-05, "loss": 6.9786, "step": 880 }, { "epoch": 1.0736270368135183, "grad_norm": 3.1370668411254883, "learning_rate": 1.9999980964996763e-05, "loss": 6.9825, "step": 890 }, { "epoch": 1.0856970428485213, "grad_norm": 3.274165391921997, "learning_rate": 1.999998072374197e-05, "loss": 6.9791, "step": 900 }, { "epoch": 1.0977670488835245, "grad_norm": 3.463301181793213, "learning_rate": 1.9999980482487175e-05, "loss": 6.959, "step": 910 }, { "epoch": 1.1098370549185275, "grad_norm": 3.930248737335205, "learning_rate": 1.9999980241232385e-05, "loss": 6.9919, "step": 920 }, { "epoch": 1.1219070609535304, "grad_norm": 3.4840762615203857, "learning_rate": 1.999997999997759e-05, "loss": 6.9692, "step": 930 }, { "epoch": 1.1339770669885334, "grad_norm": 3.479872703552246, "learning_rate": 1.9999979758722797e-05, "loss": 6.9327, "step": 940 }, { "epoch": 1.1460470730235366, "grad_norm": 3.5596141815185547, "learning_rate": 1.9999979517468004e-05, "loss": 6.9763, "step": 950 }, { "epoch": 1.1581170790585396, "grad_norm": 3.2090272903442383, "learning_rate": 1.999997927621321e-05, "loss": 6.9373, "step": 960 }, { "epoch": 1.1701870850935425, "grad_norm": 3.351907968521118, "learning_rate": 1.9999979034958416e-05, "loss": 6.9175, "step": 970 }, { "epoch": 1.1822570911285455, "grad_norm": 3.4880125522613525, "learning_rate": 1.999997879370362e-05, "loss": 6.9329, "step": 980 }, { "epoch": 1.1943270971635487, "grad_norm": 3.1454896926879883, "learning_rate": 1.9999978552448825e-05, "loss": 6.9053, "step": 990 }, { "epoch": 1.2063971031985516, "grad_norm": 3.349754810333252, "learning_rate": 1.999997831119403e-05, "loss": 6.9306, "step": 1000 }, { "epoch": 
1.2063971031985516, "eval_loss": 6.933590888977051, "eval_runtime": 8.1382, "eval_samples_per_second": 85.646, "eval_steps_per_second": 10.813, "step": 1000 }, { "epoch": 1.2184671092335546, "grad_norm": 2.9569194316864014, "learning_rate": 1.9999978069939237e-05, "loss": 6.8759, "step": 1010 }, { "epoch": 1.2305371152685576, "grad_norm": 3.491403341293335, "learning_rate": 1.9999977828684444e-05, "loss": 6.9237, "step": 1020 }, { "epoch": 1.2426071213035605, "grad_norm": 3.2862954139709473, "learning_rate": 1.999997758742965e-05, "loss": 6.9, "step": 1030 }, { "epoch": 1.2546771273385637, "grad_norm": 3.6485886573791504, "learning_rate": 1.9999977346174856e-05, "loss": 6.8659, "step": 1040 }, { "epoch": 1.2667471333735667, "grad_norm": 3.2352893352508545, "learning_rate": 1.9999977104920062e-05, "loss": 6.8894, "step": 1050 }, { "epoch": 1.2788171394085697, "grad_norm": 3.328986883163452, "learning_rate": 1.999997686366527e-05, "loss": 6.8718, "step": 1060 }, { "epoch": 1.2908871454435729, "grad_norm": 3.8388311862945557, "learning_rate": 1.9999976622410475e-05, "loss": 6.8475, "step": 1070 }, { "epoch": 1.3029571514785756, "grad_norm": 3.065333604812622, "learning_rate": 1.999997638115568e-05, "loss": 6.8319, "step": 1080 }, { "epoch": 1.3150271575135788, "grad_norm": 3.3170716762542725, "learning_rate": 1.9999976139900887e-05, "loss": 6.7979, "step": 1090 }, { "epoch": 1.3270971635485818, "grad_norm": 3.627504825592041, "learning_rate": 1.9999975898646093e-05, "loss": 6.834, "step": 1100 }, { "epoch": 1.3391671695835847, "grad_norm": 3.3063063621520996, "learning_rate": 1.99999756573913e-05, "loss": 6.812, "step": 1110 }, { "epoch": 1.351237175618588, "grad_norm": 3.4426610469818115, "learning_rate": 1.9999975416136506e-05, "loss": 6.8136, "step": 1120 }, { "epoch": 1.3633071816535909, "grad_norm": 3.105926275253296, "learning_rate": 1.9999975174881712e-05, "loss": 6.8197, "step": 1130 }, { "epoch": 1.3753771876885938, "grad_norm": 3.765151023864746, 
"learning_rate": 1.9999974933626918e-05, "loss": 6.8212, "step": 1140 }, { "epoch": 1.3874471937235968, "grad_norm": 3.159360408782959, "learning_rate": 1.9999974692372124e-05, "loss": 6.7666, "step": 1150 }, { "epoch": 1.3995171997585998, "grad_norm": 3.308159351348877, "learning_rate": 1.999997445111733e-05, "loss": 6.791, "step": 1160 }, { "epoch": 1.411587205793603, "grad_norm": 3.585155487060547, "learning_rate": 1.9999974209862537e-05, "loss": 6.7435, "step": 1170 }, { "epoch": 1.423657211828606, "grad_norm": 3.7532918453216553, "learning_rate": 1.9999973968607743e-05, "loss": 6.7523, "step": 1180 }, { "epoch": 1.435727217863609, "grad_norm": 3.954225540161133, "learning_rate": 1.999997372735295e-05, "loss": 6.7772, "step": 1190 }, { "epoch": 1.4477972238986119, "grad_norm": 3.8626627922058105, "learning_rate": 1.9999973486098156e-05, "loss": 6.7542, "step": 1200 }, { "epoch": 1.4598672299336148, "grad_norm": 3.871762275695801, "learning_rate": 1.9999973244843362e-05, "loss": 6.7276, "step": 1210 }, { "epoch": 1.471937235968618, "grad_norm": 3.64743971824646, "learning_rate": 1.9999973003588568e-05, "loss": 6.7359, "step": 1220 }, { "epoch": 1.484007242003621, "grad_norm": 3.657015562057495, "learning_rate": 1.999997276233377e-05, "loss": 6.7168, "step": 1230 }, { "epoch": 1.496077248038624, "grad_norm": 3.7097036838531494, "learning_rate": 1.9999972521078977e-05, "loss": 6.7306, "step": 1240 }, { "epoch": 1.5081472540736272, "grad_norm": 3.6480331420898438, "learning_rate": 1.9999972279824183e-05, "loss": 6.7126, "step": 1250 }, { "epoch": 1.52021726010863, "grad_norm": 3.506474733352661, "learning_rate": 1.999997203856939e-05, "loss": 6.7175, "step": 1260 }, { "epoch": 1.532287266143633, "grad_norm": 3.7088019847869873, "learning_rate": 1.9999971797314596e-05, "loss": 6.6856, "step": 1270 }, { "epoch": 1.544357272178636, "grad_norm": 3.9099819660186768, "learning_rate": 1.9999971556059802e-05, "loss": 6.7121, "step": 1280 }, { "epoch": 1.556427278213639, 
"grad_norm": 4.337199687957764, "learning_rate": 1.9999971314805008e-05, "loss": 6.7081, "step": 1290 }, { "epoch": 1.5684972842486422, "grad_norm": 3.5252597332000732, "learning_rate": 1.9999971073550214e-05, "loss": 6.6882, "step": 1300 }, { "epoch": 1.5805672902836452, "grad_norm": 3.9732561111450195, "learning_rate": 1.999997083229542e-05, "loss": 6.6706, "step": 1310 }, { "epoch": 1.5926372963186481, "grad_norm": 3.53662371635437, "learning_rate": 1.9999970591040627e-05, "loss": 6.6926, "step": 1320 }, { "epoch": 1.6047073023536513, "grad_norm": 3.0833487510681152, "learning_rate": 1.9999970349785833e-05, "loss": 6.6619, "step": 1330 }, { "epoch": 1.616777308388654, "grad_norm": 3.513148307800293, "learning_rate": 1.999997010853104e-05, "loss": 6.604, "step": 1340 }, { "epoch": 1.6288473144236573, "grad_norm": 3.416132688522339, "learning_rate": 1.9999969867276245e-05, "loss": 6.6334, "step": 1350 }, { "epoch": 1.6409173204586602, "grad_norm": 3.58746337890625, "learning_rate": 1.999996962602145e-05, "loss": 6.6614, "step": 1360 }, { "epoch": 1.6529873264936632, "grad_norm": 3.8537800312042236, "learning_rate": 1.9999969384766658e-05, "loss": 6.637, "step": 1370 }, { "epoch": 1.6650573325286664, "grad_norm": 3.5923471450805664, "learning_rate": 1.9999969143511864e-05, "loss": 6.6423, "step": 1380 }, { "epoch": 1.6771273385636691, "grad_norm": 3.664403200149536, "learning_rate": 1.999996890225707e-05, "loss": 6.6013, "step": 1390 }, { "epoch": 1.6891973445986723, "grad_norm": 4.395875453948975, "learning_rate": 1.9999968661002276e-05, "loss": 6.6222, "step": 1400 }, { "epoch": 1.7012673506336753, "grad_norm": 3.9861371517181396, "learning_rate": 1.9999968419747483e-05, "loss": 6.6111, "step": 1410 }, { "epoch": 1.7133373566686783, "grad_norm": 4.107321739196777, "learning_rate": 1.999996817849269e-05, "loss": 6.6084, "step": 1420 }, { "epoch": 1.7254073627036814, "grad_norm": 3.7860617637634277, "learning_rate": 1.9999967937237895e-05, "loss": 6.628, "step": 
1430 }, { "epoch": 1.7374773687386844, "grad_norm": 3.900850296020508, "learning_rate": 1.99999676959831e-05, "loss": 6.6081, "step": 1440 }, { "epoch": 1.7495473747736874, "grad_norm": 3.4248597621917725, "learning_rate": 1.9999967454728308e-05, "loss": 6.5818, "step": 1450 }, { "epoch": 1.7616173808086906, "grad_norm": 3.699570655822754, "learning_rate": 1.9999967213473514e-05, "loss": 6.5979, "step": 1460 }, { "epoch": 1.7736873868436933, "grad_norm": 3.364863634109497, "learning_rate": 1.999996697221872e-05, "loss": 6.6205, "step": 1470 }, { "epoch": 1.7857573928786965, "grad_norm": 4.40854549407959, "learning_rate": 1.9999966730963926e-05, "loss": 6.5728, "step": 1480 }, { "epoch": 1.7978273989136995, "grad_norm": 4.39726448059082, "learning_rate": 1.9999966489709132e-05, "loss": 6.5884, "step": 1490 }, { "epoch": 1.8098974049487024, "grad_norm": 3.877406358718872, "learning_rate": 1.999996624845434e-05, "loss": 6.5496, "step": 1500 }, { "epoch": 1.8098974049487024, "eval_loss": 6.569983959197998, "eval_runtime": 8.1249, "eval_samples_per_second": 85.786, "eval_steps_per_second": 10.831, "step": 1500 }, { "epoch": 1.8219674109837056, "grad_norm": 3.8496246337890625, "learning_rate": 1.9999966007199545e-05, "loss": 6.5466, "step": 1510 }, { "epoch": 1.8340374170187084, "grad_norm": 3.6774070262908936, "learning_rate": 1.999996576594475e-05, "loss": 6.5551, "step": 1520 }, { "epoch": 1.8461074230537116, "grad_norm": 3.744553565979004, "learning_rate": 1.9999965524689957e-05, "loss": 6.533, "step": 1530 }, { "epoch": 1.8581774290887145, "grad_norm": 4.939999103546143, "learning_rate": 1.9999965283435163e-05, "loss": 6.5544, "step": 1540 }, { "epoch": 1.8702474351237175, "grad_norm": 3.7696049213409424, "learning_rate": 1.999996504218037e-05, "loss": 6.5204, "step": 1550 }, { "epoch": 1.8823174411587207, "grad_norm": 3.793053388595581, "learning_rate": 1.9999964800925576e-05, "loss": 6.5201, "step": 1560 }, { "epoch": 1.8943874471937237, "grad_norm": 
3.905697822570801, "learning_rate": 1.9999964559670782e-05, "loss": 6.5133, "step": 1570 }, { "epoch": 1.9064574532287266, "grad_norm": 3.6795618534088135, "learning_rate": 1.999996431841599e-05, "loss": 6.4885, "step": 1580 }, { "epoch": 1.9185274592637296, "grad_norm": 3.8723955154418945, "learning_rate": 1.9999964077161195e-05, "loss": 6.4691, "step": 1590 }, { "epoch": 1.9305974652987326, "grad_norm": 3.49161958694458, "learning_rate": 1.99999638359064e-05, "loss": 6.5118, "step": 1600 }, { "epoch": 1.9426674713337357, "grad_norm": 4.202633857727051, "learning_rate": 1.9999963594651607e-05, "loss": 6.5041, "step": 1610 }, { "epoch": 1.9547374773687387, "grad_norm": 3.8224329948425293, "learning_rate": 1.9999963353396813e-05, "loss": 6.4886, "step": 1620 }, { "epoch": 1.9668074834037417, "grad_norm": 3.633540630340576, "learning_rate": 1.999996311214202e-05, "loss": 6.4834, "step": 1630 }, { "epoch": 1.9788774894387449, "grad_norm": 3.7334141731262207, "learning_rate": 1.9999962870887226e-05, "loss": 6.4488, "step": 1640 }, { "epoch": 1.9909474954737476, "grad_norm": 4.178130149841309, "learning_rate": 1.999996262963243e-05, "loss": 6.457, "step": 1650 }, { "epoch": 2.0024140012070006, "grad_norm": 4.090626239776611, "learning_rate": 1.9999962388377635e-05, "loss": 6.4446, "step": 1660 }, { "epoch": 2.014484007242004, "grad_norm": 3.7989370822906494, "learning_rate": 1.999996214712284e-05, "loss": 6.3821, "step": 1670 }, { "epoch": 2.0265540132770066, "grad_norm": 3.9468798637390137, "learning_rate": 1.9999961905868047e-05, "loss": 6.3635, "step": 1680 }, { "epoch": 2.0386240193120098, "grad_norm": 4.080833435058594, "learning_rate": 1.9999961664613253e-05, "loss": 6.3716, "step": 1690 }, { "epoch": 2.0506940253470125, "grad_norm": 4.077414035797119, "learning_rate": 1.999996142335846e-05, "loss": 6.3491, "step": 1700 }, { "epoch": 2.0627640313820157, "grad_norm": 4.297895908355713, "learning_rate": 1.9999961182103666e-05, "loss": 6.3503, "step": 1710 }, { 
"epoch": 2.074834037417019, "grad_norm": 3.8533949851989746, "learning_rate": 1.9999960940848872e-05, "loss": 6.3677, "step": 1720 }, { "epoch": 2.0869040434520216, "grad_norm": 3.9991440773010254, "learning_rate": 1.9999960699594078e-05, "loss": 6.3584, "step": 1730 }, { "epoch": 2.098974049487025, "grad_norm": 4.177391529083252, "learning_rate": 1.9999960458339284e-05, "loss": 6.3713, "step": 1740 }, { "epoch": 2.1110440555220276, "grad_norm": 4.062652587890625, "learning_rate": 1.999996021708449e-05, "loss": 6.2994, "step": 1750 }, { "epoch": 2.1231140615570308, "grad_norm": 4.1460280418396, "learning_rate": 1.9999959975829697e-05, "loss": 6.3422, "step": 1760 }, { "epoch": 2.135184067592034, "grad_norm": 3.517653465270996, "learning_rate": 1.9999959734574903e-05, "loss": 6.327, "step": 1770 }, { "epoch": 2.1472540736270367, "grad_norm": 4.079253673553467, "learning_rate": 1.999995949332011e-05, "loss": 6.3185, "step": 1780 }, { "epoch": 2.15932407966204, "grad_norm": 4.037069320678711, "learning_rate": 1.9999959252065315e-05, "loss": 6.307, "step": 1790 }, { "epoch": 2.1713940856970426, "grad_norm": 4.584740161895752, "learning_rate": 1.9999959010810522e-05, "loss": 6.3463, "step": 1800 }, { "epoch": 2.183464091732046, "grad_norm": 4.165858745574951, "learning_rate": 1.9999958769555728e-05, "loss": 6.2954, "step": 1810 }, { "epoch": 2.195534097767049, "grad_norm": 3.7596864700317383, "learning_rate": 1.9999958528300934e-05, "loss": 6.3094, "step": 1820 }, { "epoch": 2.2076041038020517, "grad_norm": 4.200249671936035, "learning_rate": 1.999995828704614e-05, "loss": 6.328, "step": 1830 }, { "epoch": 2.219674109837055, "grad_norm": 4.582587242126465, "learning_rate": 1.9999958045791347e-05, "loss": 6.3109, "step": 1840 }, { "epoch": 2.231744115872058, "grad_norm": 3.7759268283843994, "learning_rate": 1.9999957804536553e-05, "loss": 6.3113, "step": 1850 }, { "epoch": 2.243814121907061, "grad_norm": 3.9443435668945312, "learning_rate": 1.999995756328176e-05, "loss": 
6.291, "step": 1860 }, { "epoch": 2.255884127942064, "grad_norm": 4.428783416748047, "learning_rate": 1.9999957322026965e-05, "loss": 6.2674, "step": 1870 }, { "epoch": 2.267954133977067, "grad_norm": 4.355795860290527, "learning_rate": 1.999995708077217e-05, "loss": 6.2813, "step": 1880 }, { "epoch": 2.28002414001207, "grad_norm": 3.886857032775879, "learning_rate": 1.9999956839517378e-05, "loss": 6.3012, "step": 1890 }, { "epoch": 2.292094146047073, "grad_norm": 4.490399360656738, "learning_rate": 1.999995659826258e-05, "loss": 6.2491, "step": 1900 }, { "epoch": 2.304164152082076, "grad_norm": 4.127467632293701, "learning_rate": 1.9999956357007787e-05, "loss": 6.2499, "step": 1910 }, { "epoch": 2.316234158117079, "grad_norm": 4.235049247741699, "learning_rate": 1.9999956115752993e-05, "loss": 6.2552, "step": 1920 }, { "epoch": 2.3283041641520823, "grad_norm": 4.562445640563965, "learning_rate": 1.99999558744982e-05, "loss": 6.2556, "step": 1930 }, { "epoch": 2.340374170187085, "grad_norm": 4.285699367523193, "learning_rate": 1.9999955633243405e-05, "loss": 6.268, "step": 1940 }, { "epoch": 2.3524441762220882, "grad_norm": 3.6267175674438477, "learning_rate": 1.999995539198861e-05, "loss": 6.234, "step": 1950 }, { "epoch": 2.364514182257091, "grad_norm": 3.908088445663452, "learning_rate": 1.9999955150733818e-05, "loss": 6.2339, "step": 1960 }, { "epoch": 2.376584188292094, "grad_norm": 4.06550931930542, "learning_rate": 1.9999954909479024e-05, "loss": 6.2585, "step": 1970 }, { "epoch": 2.3886541943270974, "grad_norm": 4.259022235870361, "learning_rate": 1.999995466822423e-05, "loss": 6.2736, "step": 1980 }, { "epoch": 2.4007242003621, "grad_norm": 4.167940139770508, "learning_rate": 1.9999954426969436e-05, "loss": 6.2066, "step": 1990 }, { "epoch": 2.4127942063971033, "grad_norm": 4.81652307510376, "learning_rate": 1.9999954185714646e-05, "loss": 6.2016, "step": 2000 }, { "epoch": 2.4127942063971033, "eval_loss": 6.277624130249023, "eval_runtime": 8.1212, 
"eval_samples_per_second": 85.825, "eval_steps_per_second": 10.836, "step": 2000 }, { "epoch": 2.424864212432106, "grad_norm": 4.151612281799316, "learning_rate": 1.9999953944459852e-05, "loss": 6.2081, "step": 2010 }, { "epoch": 2.4369342184671092, "grad_norm": 4.6985883712768555, "learning_rate": 1.999995370320506e-05, "loss": 6.2007, "step": 2020 }, { "epoch": 2.4490042245021124, "grad_norm": 4.6884074211120605, "learning_rate": 1.9999953461950265e-05, "loss": 6.1913, "step": 2030 }, { "epoch": 2.461074230537115, "grad_norm": 4.132936477661133, "learning_rate": 1.999995322069547e-05, "loss": 6.1907, "step": 2040 }, { "epoch": 2.4731442365721183, "grad_norm": 4.300856590270996, "learning_rate": 1.9999952979440677e-05, "loss": 6.1441, "step": 2050 }, { "epoch": 2.485214242607121, "grad_norm": 3.916482925415039, "learning_rate": 1.999995273818588e-05, "loss": 6.1798, "step": 2060 }, { "epoch": 2.4972842486421243, "grad_norm": 4.4001688957214355, "learning_rate": 1.9999952496931086e-05, "loss": 6.1806, "step": 2070 }, { "epoch": 2.5093542546771275, "grad_norm": 4.604231834411621, "learning_rate": 1.9999952255676292e-05, "loss": 6.157, "step": 2080 }, { "epoch": 2.52142426071213, "grad_norm": 4.063570976257324, "learning_rate": 1.99999520144215e-05, "loss": 6.1178, "step": 2090 }, { "epoch": 2.5334942667471334, "grad_norm": 3.85394549369812, "learning_rate": 1.9999951773166705e-05, "loss": 6.1918, "step": 2100 }, { "epoch": 2.545564272782136, "grad_norm": 5.3021674156188965, "learning_rate": 1.999995153191191e-05, "loss": 6.1822, "step": 2110 }, { "epoch": 2.5576342788171393, "grad_norm": 4.325355529785156, "learning_rate": 1.9999951290657117e-05, "loss": 6.1386, "step": 2120 }, { "epoch": 2.5697042848521425, "grad_norm": 4.458052635192871, "learning_rate": 1.9999951049402323e-05, "loss": 6.1447, "step": 2130 }, { "epoch": 2.5817742908871457, "grad_norm": 4.082151412963867, "learning_rate": 1.999995080814753e-05, "loss": 6.1506, "step": 2140 }, { "epoch": 
2.5938442969221485, "grad_norm": 4.054776668548584, "learning_rate": 1.9999950566892736e-05, "loss": 6.1806, "step": 2150 }, { "epoch": 2.605914302957151, "grad_norm": 4.010200023651123, "learning_rate": 1.9999950325637942e-05, "loss": 6.1164, "step": 2160 }, { "epoch": 2.6179843089921544, "grad_norm": 4.434733867645264, "learning_rate": 1.9999950084383148e-05, "loss": 6.1314, "step": 2170 }, { "epoch": 2.6300543150271576, "grad_norm": 5.285394668579102, "learning_rate": 1.9999949843128355e-05, "loss": 6.1671, "step": 2180 }, { "epoch": 2.6421243210621608, "grad_norm": 4.716556549072266, "learning_rate": 1.999994960187356e-05, "loss": 6.1082, "step": 2190 }, { "epoch": 2.6541943270971635, "grad_norm": 5.001852035522461, "learning_rate": 1.9999949360618767e-05, "loss": 6.112, "step": 2200 }, { "epoch": 2.6662643331321667, "grad_norm": 4.333192348480225, "learning_rate": 1.9999949119363973e-05, "loss": 6.1422, "step": 2210 }, { "epoch": 2.6783343391671695, "grad_norm": 3.991992473602295, "learning_rate": 1.999994887810918e-05, "loss": 6.0625, "step": 2220 }, { "epoch": 2.6904043452021726, "grad_norm": 4.0101518630981445, "learning_rate": 1.9999948636854386e-05, "loss": 6.0762, "step": 2230 }, { "epoch": 2.702474351237176, "grad_norm": 4.289837837219238, "learning_rate": 1.9999948395599592e-05, "loss": 6.1197, "step": 2240 }, { "epoch": 2.7145443572721786, "grad_norm": 4.2591447830200195, "learning_rate": 1.9999948154344798e-05, "loss": 6.0765, "step": 2250 }, { "epoch": 2.7266143633071818, "grad_norm": 5.089157581329346, "learning_rate": 1.9999947913090004e-05, "loss": 6.119, "step": 2260 }, { "epoch": 2.7386843693421845, "grad_norm": 3.870039463043213, "learning_rate": 1.999994767183521e-05, "loss": 6.1173, "step": 2270 }, { "epoch": 2.7507543753771877, "grad_norm": 4.682967185974121, "learning_rate": 1.9999947430580417e-05, "loss": 6.0604, "step": 2280 }, { "epoch": 2.762824381412191, "grad_norm": 4.432596683502197, "learning_rate": 1.9999947189325623e-05, "loss": 
6.1031, "step": 2290 }, { "epoch": 2.7748943874471936, "grad_norm": 4.1108269691467285, "learning_rate": 1.999994694807083e-05, "loss": 6.0626, "step": 2300 }, { "epoch": 2.786964393482197, "grad_norm": 4.565855979919434, "learning_rate": 1.9999946706816032e-05, "loss": 6.0518, "step": 2310 }, { "epoch": 2.7990343995171996, "grad_norm": 4.437104225158691, "learning_rate": 1.9999946465561238e-05, "loss": 6.052, "step": 2320 }, { "epoch": 2.8111044055522028, "grad_norm": 4.2852983474731445, "learning_rate": 1.9999946224306444e-05, "loss": 6.0629, "step": 2330 }, { "epoch": 2.823174411587206, "grad_norm": 4.230132102966309, "learning_rate": 1.999994598305165e-05, "loss": 6.0686, "step": 2340 }, { "epoch": 2.8352444176222087, "grad_norm": 4.169485092163086, "learning_rate": 1.9999945741796857e-05, "loss": 5.9987, "step": 2350 }, { "epoch": 2.847314423657212, "grad_norm": 4.118642330169678, "learning_rate": 1.9999945500542063e-05, "loss": 6.0175, "step": 2360 }, { "epoch": 2.8593844296922146, "grad_norm": 4.453094482421875, "learning_rate": 1.999994525928727e-05, "loss": 6.0084, "step": 2370 }, { "epoch": 2.871454435727218, "grad_norm": 4.645263195037842, "learning_rate": 1.9999945018032475e-05, "loss": 6.0518, "step": 2380 }, { "epoch": 2.883524441762221, "grad_norm": 4.563093185424805, "learning_rate": 1.999994477677768e-05, "loss": 5.9886, "step": 2390 }, { "epoch": 2.8955944477972237, "grad_norm": 4.025425434112549, "learning_rate": 1.9999944535522888e-05, "loss": 6.0164, "step": 2400 }, { "epoch": 2.907664453832227, "grad_norm": 4.080743312835693, "learning_rate": 1.9999944294268094e-05, "loss": 6.0218, "step": 2410 }, { "epoch": 2.9197344598672297, "grad_norm": 4.533753871917725, "learning_rate": 1.99999440530133e-05, "loss": 6.0029, "step": 2420 }, { "epoch": 2.931804465902233, "grad_norm": 4.443739891052246, "learning_rate": 1.9999943811758507e-05, "loss": 5.977, "step": 2430 }, { "epoch": 2.943874471937236, "grad_norm": 4.566436767578125, "learning_rate": 
1.9999943570503713e-05, "loss": 5.9862, "step": 2440 }, { "epoch": 2.9559444779722392, "grad_norm": 5.218013286590576, "learning_rate": 1.999994332924892e-05, "loss": 5.9999, "step": 2450 }, { "epoch": 2.968014484007242, "grad_norm": 4.603177547454834, "learning_rate": 1.9999943087994125e-05, "loss": 5.9929, "step": 2460 }, { "epoch": 2.980084490042245, "grad_norm": 4.260584354400635, "learning_rate": 1.999994284673933e-05, "loss": 5.9833, "step": 2470 }, { "epoch": 2.992154496077248, "grad_norm": 4.8346052169799805, "learning_rate": 1.9999942605484538e-05, "loss": 5.9829, "step": 2480 }, { "epoch": 3.003621001810501, "grad_norm": 4.525562763214111, "learning_rate": 1.9999942364229744e-05, "loss": 6.0017, "step": 2490 }, { "epoch": 3.015691007845504, "grad_norm": 4.5755157470703125, "learning_rate": 1.999994212297495e-05, "loss": 5.8435, "step": 2500 }, { "epoch": 3.015691007845504, "eval_loss": 6.0249786376953125, "eval_runtime": 8.1262, "eval_samples_per_second": 85.772, "eval_steps_per_second": 10.829, "step": 2500 }, { "epoch": 3.027761013880507, "grad_norm": 4.270496368408203, "learning_rate": 1.9999941881720156e-05, "loss": 5.8403, "step": 2510 }, { "epoch": 3.03983101991551, "grad_norm": 5.235534191131592, "learning_rate": 1.9999941640465362e-05, "loss": 5.8469, "step": 2520 }, { "epoch": 3.051901025950513, "grad_norm": 4.75909423828125, "learning_rate": 1.999994139921057e-05, "loss": 5.8194, "step": 2530 }, { "epoch": 3.063971031985516, "grad_norm": 4.392838954925537, "learning_rate": 1.9999941157955775e-05, "loss": 5.831, "step": 2540 }, { "epoch": 3.076041038020519, "grad_norm": 4.258215427398682, "learning_rate": 1.999994091670098e-05, "loss": 5.8389, "step": 2550 }, { "epoch": 3.088111044055522, "grad_norm": 4.256363391876221, "learning_rate": 1.9999940675446187e-05, "loss": 5.8493, "step": 2560 }, { "epoch": 3.100181050090525, "grad_norm": 4.110770225524902, "learning_rate": 1.9999940434191394e-05, "loss": 5.8445, "step": 2570 }, { "epoch": 
3.112251056125528, "grad_norm": 5.043090343475342, "learning_rate": 1.99999401929366e-05, "loss": 5.8123, "step": 2580 }, { "epoch": 3.124321062160531, "grad_norm": 5.677689552307129, "learning_rate": 1.9999939951681806e-05, "loss": 5.8296, "step": 2590 }, { "epoch": 3.1363910681955343, "grad_norm": 5.297140121459961, "learning_rate": 1.9999939710427012e-05, "loss": 5.821, "step": 2600 }, { "epoch": 3.148461074230537, "grad_norm": 4.279438018798828, "learning_rate": 1.999993946917222e-05, "loss": 5.8147, "step": 2610 }, { "epoch": 3.16053108026554, "grad_norm": 5.074165344238281, "learning_rate": 1.9999939227917425e-05, "loss": 5.7925, "step": 2620 }, { "epoch": 3.172601086300543, "grad_norm": 4.660333633422852, "learning_rate": 1.999993898666263e-05, "loss": 5.7963, "step": 2630 }, { "epoch": 3.184671092335546, "grad_norm": 5.213122367858887, "learning_rate": 1.9999938745407837e-05, "loss": 5.8135, "step": 2640 }, { "epoch": 3.1967410983705493, "grad_norm": 4.691129207611084, "learning_rate": 1.9999938504153043e-05, "loss": 5.8175, "step": 2650 }, { "epoch": 3.208811104405552, "grad_norm": 4.675544261932373, "learning_rate": 1.999993826289825e-05, "loss": 5.823, "step": 2660 }, { "epoch": 3.2208811104405553, "grad_norm": 4.336679458618164, "learning_rate": 1.9999938021643456e-05, "loss": 5.8368, "step": 2670 }, { "epoch": 3.2329511164755584, "grad_norm": 4.559335708618164, "learning_rate": 1.9999937780388662e-05, "loss": 5.8389, "step": 2680 }, { "epoch": 3.245021122510561, "grad_norm": 4.564547538757324, "learning_rate": 1.9999937539133868e-05, "loss": 5.7995, "step": 2690 }, { "epoch": 3.2570911285455644, "grad_norm": 4.881879806518555, "learning_rate": 1.9999937297879074e-05, "loss": 5.7961, "step": 2700 }, { "epoch": 3.269161134580567, "grad_norm": 4.885284423828125, "learning_rate": 1.999993705662428e-05, "loss": 5.8253, "step": 2710 }, { "epoch": 3.2812311406155703, "grad_norm": 4.964684963226318, "learning_rate": 1.9999936815369487e-05, "loss": 5.784, 
"step": 2720 }, { "epoch": 3.2933011466505735, "grad_norm": 4.644830226898193, "learning_rate": 1.999993657411469e-05, "loss": 5.7946, "step": 2730 }, { "epoch": 3.3053711526855762, "grad_norm": 4.739643096923828, "learning_rate": 1.9999936332859896e-05, "loss": 5.7766, "step": 2740 }, { "epoch": 3.3174411587205794, "grad_norm": 5.476995944976807, "learning_rate": 1.9999936091605102e-05, "loss": 5.7797, "step": 2750 }, { "epoch": 3.3295111647555826, "grad_norm": 4.820225715637207, "learning_rate": 1.9999935850350308e-05, "loss": 5.7786, "step": 2760 }, { "epoch": 3.3415811707905854, "grad_norm": 5.247040748596191, "learning_rate": 1.9999935609095514e-05, "loss": 5.8089, "step": 2770 }, { "epoch": 3.3536511768255886, "grad_norm": 4.964357376098633, "learning_rate": 1.999993536784072e-05, "loss": 5.7659, "step": 2780 }, { "epoch": 3.3657211828605913, "grad_norm": 5.9192328453063965, "learning_rate": 1.9999935126585927e-05, "loss": 5.8011, "step": 2790 }, { "epoch": 3.3777911888955945, "grad_norm": 4.913425922393799, "learning_rate": 1.9999934885331133e-05, "loss": 5.8053, "step": 2800 }, { "epoch": 3.3898611949305977, "grad_norm": 4.491196155548096, "learning_rate": 1.999993464407634e-05, "loss": 5.7498, "step": 2810 }, { "epoch": 3.4019312009656004, "grad_norm": 4.075977802276611, "learning_rate": 1.9999934402821546e-05, "loss": 5.7754, "step": 2820 }, { "epoch": 3.4140012070006036, "grad_norm": 4.701062202453613, "learning_rate": 1.9999934161566752e-05, "loss": 5.7594, "step": 2830 }, { "epoch": 3.4260712130356064, "grad_norm": 4.464338779449463, "learning_rate": 1.9999933920311958e-05, "loss": 5.7487, "step": 2840 }, { "epoch": 3.4381412190706095, "grad_norm": 4.848139762878418, "learning_rate": 1.9999933679057164e-05, "loss": 5.7735, "step": 2850 }, { "epoch": 3.4502112251056127, "grad_norm": 4.883413314819336, "learning_rate": 1.999993343780237e-05, "loss": 5.7571, "step": 2860 }, { "epoch": 3.4622812311406155, "grad_norm": 4.8064703941345215, "learning_rate": 
1.9999933196547577e-05, "loss": 5.7567, "step": 2870 }, { "epoch": 3.4743512371756187, "grad_norm": 4.740821838378906, "learning_rate": 1.9999932955292783e-05, "loss": 5.773, "step": 2880 }, { "epoch": 3.4864212432106214, "grad_norm": 4.760237216949463, "learning_rate": 1.999993271403799e-05, "loss": 5.7786, "step": 2890 }, { "epoch": 3.4984912492456246, "grad_norm": 5.0027756690979, "learning_rate": 1.9999932472783195e-05, "loss": 5.7692, "step": 2900 }, { "epoch": 3.510561255280628, "grad_norm": 4.797269821166992, "learning_rate": 1.99999322315284e-05, "loss": 5.7367, "step": 2910 }, { "epoch": 3.5226312613156305, "grad_norm": 4.526089668273926, "learning_rate": 1.9999931990273608e-05, "loss": 5.7379, "step": 2920 }, { "epoch": 3.5347012673506337, "grad_norm": 4.4259819984436035, "learning_rate": 1.9999931749018814e-05, "loss": 5.7186, "step": 2930 }, { "epoch": 3.5467712733856365, "grad_norm": 4.950536727905273, "learning_rate": 1.999993150776402e-05, "loss": 5.7407, "step": 2940 }, { "epoch": 3.5588412794206397, "grad_norm": 4.783015727996826, "learning_rate": 1.9999931266509226e-05, "loss": 5.7195, "step": 2950 }, { "epoch": 3.570911285455643, "grad_norm": 4.393186092376709, "learning_rate": 1.9999931025254433e-05, "loss": 5.7079, "step": 2960 }, { "epoch": 3.5829812914906456, "grad_norm": 4.648801326751709, "learning_rate": 1.999993078399964e-05, "loss": 5.7336, "step": 2970 }, { "epoch": 3.595051297525649, "grad_norm": 5.3399224281311035, "learning_rate": 1.999993054274484e-05, "loss": 5.7567, "step": 2980 }, { "epoch": 3.6071213035606515, "grad_norm": 5.182999134063721, "learning_rate": 1.9999930301490048e-05, "loss": 5.6882, "step": 2990 }, { "epoch": 3.6191913095956547, "grad_norm": 5.317803859710693, "learning_rate": 1.9999930060235254e-05, "loss": 5.7216, "step": 3000 }, { "epoch": 3.6191913095956547, "eval_loss": 5.855939865112305, "eval_runtime": 8.1317, "eval_samples_per_second": 85.714, "eval_steps_per_second": 10.822, "step": 3000 }, { "epoch": 
3.631261315630658, "grad_norm": 4.965269565582275, "learning_rate": 1.999992981898046e-05, "loss": 5.7086, "step": 3010 }, { "epoch": 3.643331321665661, "grad_norm": 4.8982367515563965, "learning_rate": 1.9999929577725666e-05, "loss": 5.7181, "step": 3020 }, { "epoch": 3.655401327700664, "grad_norm": 5.793337345123291, "learning_rate": 1.9999929336470873e-05, "loss": 5.7124, "step": 3030 }, { "epoch": 3.667471333735667, "grad_norm": 4.828402519226074, "learning_rate": 1.999992909521608e-05, "loss": 5.7216, "step": 3040 }, { "epoch": 3.6795413397706698, "grad_norm": 4.2826762199401855, "learning_rate": 1.9999928853961285e-05, "loss": 5.7061, "step": 3050 }, { "epoch": 3.691611345805673, "grad_norm": 4.731307029724121, "learning_rate": 1.999992861270649e-05, "loss": 5.708, "step": 3060 }, { "epoch": 3.703681351840676, "grad_norm": 4.717021942138672, "learning_rate": 1.9999928371451698e-05, "loss": 5.7148, "step": 3070 }, { "epoch": 3.715751357875679, "grad_norm": 5.286647796630859, "learning_rate": 1.9999928130196907e-05, "loss": 5.6818, "step": 3080 }, { "epoch": 3.727821363910682, "grad_norm": 5.005406856536865, "learning_rate": 1.9999927888942113e-05, "loss": 5.6887, "step": 3090 }, { "epoch": 3.739891369945685, "grad_norm": 4.6989054679870605, "learning_rate": 1.999992764768732e-05, "loss": 5.7221, "step": 3100 }, { "epoch": 3.751961375980688, "grad_norm": 4.697889804840088, "learning_rate": 1.9999927406432526e-05, "loss": 5.7077, "step": 3110 }, { "epoch": 3.764031382015691, "grad_norm": 4.891970634460449, "learning_rate": 1.9999927165177732e-05, "loss": 5.7052, "step": 3120 }, { "epoch": 3.776101388050694, "grad_norm": 4.643106460571289, "learning_rate": 1.9999926923922938e-05, "loss": 5.7036, "step": 3130 }, { "epoch": 3.788171394085697, "grad_norm": 4.476682186126709, "learning_rate": 1.999992668266814e-05, "loss": 5.687, "step": 3140 }, { "epoch": 3.8002414001207, "grad_norm": 5.070344924926758, "learning_rate": 1.9999926441413347e-05, "loss": 5.7214, 
"step": 3150 }, { "epoch": 3.812311406155703, "grad_norm": 4.795352935791016, "learning_rate": 1.9999926200158553e-05, "loss": 5.6953, "step": 3160 }, { "epoch": 3.8243814121907063, "grad_norm": 4.969409942626953, "learning_rate": 1.999992595890376e-05, "loss": 5.6378, "step": 3170 }, { "epoch": 3.836451418225709, "grad_norm": 4.918354034423828, "learning_rate": 1.9999925717648966e-05, "loss": 5.6849, "step": 3180 }, { "epoch": 3.848521424260712, "grad_norm": 5.721909523010254, "learning_rate": 1.9999925476394172e-05, "loss": 5.6544, "step": 3190 }, { "epoch": 3.860591430295715, "grad_norm": 4.857161521911621, "learning_rate": 1.999992523513938e-05, "loss": 5.6254, "step": 3200 }, { "epoch": 3.872661436330718, "grad_norm": 4.972757816314697, "learning_rate": 1.9999924993884585e-05, "loss": 5.6821, "step": 3210 }, { "epoch": 3.8847314423657213, "grad_norm": 5.06093692779541, "learning_rate": 1.999992475262979e-05, "loss": 5.6557, "step": 3220 }, { "epoch": 3.896801448400724, "grad_norm": 4.827012062072754, "learning_rate": 1.9999924511374997e-05, "loss": 5.6769, "step": 3230 }, { "epoch": 3.9088714544357273, "grad_norm": 4.8500471115112305, "learning_rate": 1.9999924270120203e-05, "loss": 5.67, "step": 3240 }, { "epoch": 3.92094146047073, "grad_norm": 4.501436233520508, "learning_rate": 1.999992402886541e-05, "loss": 5.6497, "step": 3250 }, { "epoch": 3.933011466505733, "grad_norm": 5.439869403839111, "learning_rate": 1.9999923787610616e-05, "loss": 5.6488, "step": 3260 }, { "epoch": 3.9450814725407364, "grad_norm": 5.1018476486206055, "learning_rate": 1.9999923546355822e-05, "loss": 5.6676, "step": 3270 }, { "epoch": 3.957151478575739, "grad_norm": 4.843660831451416, "learning_rate": 1.9999923305101028e-05, "loss": 5.6343, "step": 3280 }, { "epoch": 3.9692214846107423, "grad_norm": 5.260195732116699, "learning_rate": 1.9999923063846234e-05, "loss": 5.6352, "step": 3290 }, { "epoch": 3.981291490645745, "grad_norm": 4.9638495445251465, "learning_rate": 
1.999992282259144e-05, "loss": 5.6849, "step": 3300 }, { "epoch": 3.9933614966807482, "grad_norm": 5.397216320037842, "learning_rate": 1.9999922581336647e-05, "loss": 5.686, "step": 3310 }, { "epoch": 4.004828002414001, "grad_norm": 5.55210542678833, "learning_rate": 1.9999922340081853e-05, "loss": 5.6076, "step": 3320 }, { "epoch": 4.0168980084490045, "grad_norm": 4.684352397918701, "learning_rate": 1.999992209882706e-05, "loss": 5.4937, "step": 3330 }, { "epoch": 4.028968014484008, "grad_norm": 4.934830188751221, "learning_rate": 1.9999921857572265e-05, "loss": 5.4646, "step": 3340 }, { "epoch": 4.04103802051901, "grad_norm": 5.447464942932129, "learning_rate": 1.999992161631747e-05, "loss": 5.4809, "step": 3350 }, { "epoch": 4.053108026554013, "grad_norm": 5.638469696044922, "learning_rate": 1.9999921375062678e-05, "loss": 5.4747, "step": 3360 }, { "epoch": 4.065178032589016, "grad_norm": 5.350146293640137, "learning_rate": 1.9999921133807884e-05, "loss": 5.4713, "step": 3370 }, { "epoch": 4.0772480386240195, "grad_norm": 5.620610237121582, "learning_rate": 1.999992089255309e-05, "loss": 5.4668, "step": 3380 }, { "epoch": 4.089318044659023, "grad_norm": 4.960579872131348, "learning_rate": 1.9999920651298293e-05, "loss": 5.4749, "step": 3390 }, { "epoch": 4.101388050694025, "grad_norm": 4.534282207489014, "learning_rate": 1.99999204100435e-05, "loss": 5.4572, "step": 3400 }, { "epoch": 4.113458056729028, "grad_norm": 4.909631252288818, "learning_rate": 1.9999920168788705e-05, "loss": 5.4538, "step": 3410 }, { "epoch": 4.125528062764031, "grad_norm": 5.186295509338379, "learning_rate": 1.999991992753391e-05, "loss": 5.4751, "step": 3420 }, { "epoch": 4.137598068799035, "grad_norm": 5.374114036560059, "learning_rate": 1.9999919686279118e-05, "loss": 5.4843, "step": 3430 }, { "epoch": 4.149668074834038, "grad_norm": 4.647705078125, "learning_rate": 1.9999919445024324e-05, "loss": 5.4634, "step": 3440 }, { "epoch": 4.16173808086904, "grad_norm": 5.488769054412842, 
"learning_rate": 1.999991920376953e-05, "loss": 5.491, "step": 3450 }, { "epoch": 4.173808086904043, "grad_norm": 4.725619316101074, "learning_rate": 1.9999918962514737e-05, "loss": 5.4706, "step": 3460 }, { "epoch": 4.1858780929390464, "grad_norm": 5.108649253845215, "learning_rate": 1.9999918721259943e-05, "loss": 5.46, "step": 3470 }, { "epoch": 4.19794809897405, "grad_norm": 4.569761753082275, "learning_rate": 1.999991848000515e-05, "loss": 5.4604, "step": 3480 }, { "epoch": 4.210018105009053, "grad_norm": 4.539684295654297, "learning_rate": 1.9999918238750355e-05, "loss": 5.4294, "step": 3490 }, { "epoch": 4.222088111044055, "grad_norm": 5.000156402587891, "learning_rate": 1.999991799749556e-05, "loss": 5.4677, "step": 3500 }, { "epoch": 4.222088111044055, "eval_loss": 5.737122058868408, "eval_runtime": 8.1268, "eval_samples_per_second": 85.766, "eval_steps_per_second": 10.828, "step": 3500 }, { "epoch": 4.234158117079058, "grad_norm": 5.580384254455566, "learning_rate": 1.9999917756240768e-05, "loss": 5.4812, "step": 3510 }, { "epoch": 4.2462281231140615, "grad_norm": 5.41287899017334, "learning_rate": 1.9999917514985974e-05, "loss": 5.4697, "step": 3520 }, { "epoch": 4.258298129149065, "grad_norm": 5.57099723815918, "learning_rate": 1.999991727373118e-05, "loss": 5.4418, "step": 3530 }, { "epoch": 4.270368135184068, "grad_norm": 5.519484043121338, "learning_rate": 1.9999917032476386e-05, "loss": 5.4519, "step": 3540 }, { "epoch": 4.282438141219071, "grad_norm": 4.982200622558594, "learning_rate": 1.9999916791221592e-05, "loss": 5.4689, "step": 3550 }, { "epoch": 4.294508147254073, "grad_norm": 5.0454583168029785, "learning_rate": 1.99999165499668e-05, "loss": 5.4518, "step": 3560 }, { "epoch": 4.306578153289077, "grad_norm": 5.679136753082275, "learning_rate": 1.9999916308712005e-05, "loss": 5.4579, "step": 3570 }, { "epoch": 4.31864815932408, "grad_norm": 5.73008918762207, "learning_rate": 1.999991606745721e-05, "loss": 5.4782, "step": 3580 }, { "epoch": 
4.330718165359083, "grad_norm": 5.278133392333984, "learning_rate": 1.9999915826202417e-05, "loss": 5.4593, "step": 3590 }, { "epoch": 4.342788171394085, "grad_norm": 5.404110431671143, "learning_rate": 1.9999915584947624e-05, "loss": 5.4365, "step": 3600 }, { "epoch": 4.354858177429088, "grad_norm": 5.1320271492004395, "learning_rate": 1.999991534369283e-05, "loss": 5.4616, "step": 3610 }, { "epoch": 4.366928183464092, "grad_norm": 5.123870372772217, "learning_rate": 1.9999915102438036e-05, "loss": 5.5095, "step": 3620 }, { "epoch": 4.378998189499095, "grad_norm": 5.091551303863525, "learning_rate": 1.9999914861183242e-05, "loss": 5.4475, "step": 3630 }, { "epoch": 4.391068195534098, "grad_norm": 5.46300745010376, "learning_rate": 1.999991461992845e-05, "loss": 5.4475, "step": 3640 }, { "epoch": 4.403138201569101, "grad_norm": 5.227514743804932, "learning_rate": 1.9999914378673655e-05, "loss": 5.4555, "step": 3650 }, { "epoch": 4.4152082076041035, "grad_norm": 5.30247688293457, "learning_rate": 1.999991413741886e-05, "loss": 5.4418, "step": 3660 }, { "epoch": 4.427278213639107, "grad_norm": 5.6019439697265625, "learning_rate": 1.9999913896164067e-05, "loss": 5.4078, "step": 3670 }, { "epoch": 4.43934821967411, "grad_norm": 5.541022300720215, "learning_rate": 1.9999913654909273e-05, "loss": 5.4545, "step": 3680 }, { "epoch": 4.451418225709113, "grad_norm": 5.505614280700684, "learning_rate": 1.999991341365448e-05, "loss": 5.4529, "step": 3690 }, { "epoch": 4.463488231744116, "grad_norm": 5.392923355102539, "learning_rate": 1.9999913172399686e-05, "loss": 5.4677, "step": 3700 }, { "epoch": 4.4755582377791185, "grad_norm": 5.023168087005615, "learning_rate": 1.9999912931144892e-05, "loss": 5.4413, "step": 3710 }, { "epoch": 4.487628243814122, "grad_norm": 5.0972137451171875, "learning_rate": 1.9999912689890098e-05, "loss": 5.4761, "step": 3720 }, { "epoch": 4.499698249849125, "grad_norm": 6.128725051879883, "learning_rate": 1.9999912448635304e-05, "loss": 5.4189, 
"step": 3730 }, { "epoch": 4.511768255884128, "grad_norm": 6.008657455444336, "learning_rate": 1.999991220738051e-05, "loss": 5.4416, "step": 3740 }, { "epoch": 4.523838261919131, "grad_norm": 5.0652337074279785, "learning_rate": 1.9999911966125717e-05, "loss": 5.4268, "step": 3750 }, { "epoch": 4.535908267954134, "grad_norm": 5.306916236877441, "learning_rate": 1.9999911724870923e-05, "loss": 5.4737, "step": 3760 }, { "epoch": 4.547978273989137, "grad_norm": 5.442537784576416, "learning_rate": 1.999991148361613e-05, "loss": 5.433, "step": 3770 }, { "epoch": 4.56004828002414, "grad_norm": 4.570694923400879, "learning_rate": 1.9999911242361335e-05, "loss": 5.4141, "step": 3780 }, { "epoch": 4.572118286059143, "grad_norm": 5.069536209106445, "learning_rate": 1.999991100110654e-05, "loss": 5.4438, "step": 3790 }, { "epoch": 4.584188292094146, "grad_norm": 5.854760646820068, "learning_rate": 1.9999910759851744e-05, "loss": 5.4521, "step": 3800 }, { "epoch": 4.596258298129149, "grad_norm": 5.2129645347595215, "learning_rate": 1.999991051859695e-05, "loss": 5.4007, "step": 3810 }, { "epoch": 4.608328304164152, "grad_norm": 5.326879978179932, "learning_rate": 1.9999910277342157e-05, "loss": 5.4185, "step": 3820 }, { "epoch": 4.620398310199155, "grad_norm": 4.878740310668945, "learning_rate": 1.9999910036087363e-05, "loss": 5.4784, "step": 3830 }, { "epoch": 4.632468316234158, "grad_norm": 5.480919361114502, "learning_rate": 1.999990979483257e-05, "loss": 5.41, "step": 3840 }, { "epoch": 4.644538322269161, "grad_norm": 5.839482307434082, "learning_rate": 1.9999909553577776e-05, "loss": 5.4203, "step": 3850 }, { "epoch": 4.656608328304165, "grad_norm": 5.166815757751465, "learning_rate": 1.9999909312322982e-05, "loss": 5.3788, "step": 3860 }, { "epoch": 4.668678334339167, "grad_norm": 5.162976264953613, "learning_rate": 1.9999909071068188e-05, "loss": 5.4117, "step": 3870 }, { "epoch": 4.68074834037417, "grad_norm": 4.92236328125, "learning_rate": 1.9999908829813394e-05, 
"loss": 5.359, "step": 3880 }, { "epoch": 4.692818346409173, "grad_norm": 5.796568393707275, "learning_rate": 1.99999085885586e-05, "loss": 5.3669, "step": 3890 }, { "epoch": 4.7048883524441765, "grad_norm": 5.832388401031494, "learning_rate": 1.9999908347303807e-05, "loss": 5.4189, "step": 3900 }, { "epoch": 4.716958358479179, "grad_norm": 5.189361095428467, "learning_rate": 1.9999908106049013e-05, "loss": 5.3836, "step": 3910 }, { "epoch": 4.729028364514182, "grad_norm": 5.938263893127441, "learning_rate": 1.999990786479422e-05, "loss": 5.3583, "step": 3920 }, { "epoch": 4.741098370549185, "grad_norm": 5.712547302246094, "learning_rate": 1.9999907623539425e-05, "loss": 5.4135, "step": 3930 }, { "epoch": 4.753168376584188, "grad_norm": 5.433440685272217, "learning_rate": 1.999990738228463e-05, "loss": 5.3657, "step": 3940 }, { "epoch": 4.7652383826191915, "grad_norm": 6.07150936126709, "learning_rate": 1.9999907141029838e-05, "loss": 5.397, "step": 3950 }, { "epoch": 4.777308388654195, "grad_norm": 5.2976202964782715, "learning_rate": 1.9999906899775044e-05, "loss": 5.3892, "step": 3960 }, { "epoch": 4.789378394689197, "grad_norm": 5.366455554962158, "learning_rate": 1.999990665852025e-05, "loss": 5.3744, "step": 3970 }, { "epoch": 4.8014484007242, "grad_norm": 4.993721961975098, "learning_rate": 1.9999906417265456e-05, "loss": 5.376, "step": 3980 }, { "epoch": 4.813518406759203, "grad_norm": 5.849991321563721, "learning_rate": 1.9999906176010663e-05, "loss": 5.3655, "step": 3990 }, { "epoch": 4.825588412794207, "grad_norm": 5.460090160369873, "learning_rate": 1.999990593475587e-05, "loss": 5.3788, "step": 4000 }, { "epoch": 4.825588412794207, "eval_loss": 5.556894779205322, "eval_runtime": 8.1266, "eval_samples_per_second": 85.768, "eval_steps_per_second": 10.829, "step": 4000 }, { "epoch": 4.83765841882921, "grad_norm": 5.482071399688721, "learning_rate": 1.9999905693501075e-05, "loss": 5.3469, "step": 4010 }, { "epoch": 4.849728424864212, "grad_norm": 
5.425954818725586, "learning_rate": 1.999990545224628e-05, "loss": 5.3571, "step": 4020 }, { "epoch": 4.861798430899215, "grad_norm": 5.759055137634277, "learning_rate": 1.9999905210991487e-05, "loss": 5.3446, "step": 4030 }, { "epoch": 4.8738684369342185, "grad_norm": 5.449053764343262, "learning_rate": 1.9999904969736694e-05, "loss": 5.3651, "step": 4040 }, { "epoch": 4.885938442969222, "grad_norm": 6.302872180938721, "learning_rate": 1.9999904728481896e-05, "loss": 5.34, "step": 4050 }, { "epoch": 4.898008449004225, "grad_norm": 5.65710973739624, "learning_rate": 1.9999904487227103e-05, "loss": 5.3609, "step": 4060 }, { "epoch": 4.910078455039228, "grad_norm": 5.466217994689941, "learning_rate": 1.999990424597231e-05, "loss": 5.3015, "step": 4070 }, { "epoch": 4.92214846107423, "grad_norm": 5.546313762664795, "learning_rate": 1.9999904004717515e-05, "loss": 5.3199, "step": 4080 }, { "epoch": 4.9342184671092335, "grad_norm": 6.006786823272705, "learning_rate": 1.999990376346272e-05, "loss": 5.3006, "step": 4090 }, { "epoch": 4.946288473144237, "grad_norm": 5.544032573699951, "learning_rate": 1.9999903522207928e-05, "loss": 5.3119, "step": 4100 }, { "epoch": 4.95835847917924, "grad_norm": 6.054759502410889, "learning_rate": 1.9999903280953134e-05, "loss": 5.323, "step": 4110 }, { "epoch": 4.970428485214242, "grad_norm": 5.331243515014648, "learning_rate": 1.999990303969834e-05, "loss": 5.3268, "step": 4120 }, { "epoch": 4.982498491249245, "grad_norm": 5.223010063171387, "learning_rate": 1.9999902798443546e-05, "loss": 5.3263, "step": 4130 }, { "epoch": 4.994568497284249, "grad_norm": 5.72308349609375, "learning_rate": 1.9999902557188752e-05, "loss": 5.3345, "step": 4140 }, { "epoch": 5.006035003017502, "grad_norm": 6.678725242614746, "learning_rate": 1.999990231593396e-05, "loss": 5.2413, "step": 4150 }, { "epoch": 5.018105009052505, "grad_norm": 5.920678615570068, "learning_rate": 1.9999902074679168e-05, "loss": 5.1563, "step": 4160 }, { "epoch": 
5.030175015087508, "grad_norm": 5.243211269378662, "learning_rate": 1.9999901833424374e-05, "loss": 5.106, "step": 4170 }, { "epoch": 5.04224502112251, "grad_norm": 5.988300323486328, "learning_rate": 1.999990159216958e-05, "loss": 5.0699, "step": 4180 }, { "epoch": 5.0543150271575135, "grad_norm": 6.261308193206787, "learning_rate": 1.9999901350914787e-05, "loss": 5.1011, "step": 4190 }, { "epoch": 5.066385033192517, "grad_norm": 5.10464334487915, "learning_rate": 1.9999901109659993e-05, "loss": 5.0757, "step": 4200 }, { "epoch": 5.07845503922752, "grad_norm": 5.591700553894043, "learning_rate": 1.99999008684052e-05, "loss": 5.0879, "step": 4210 }, { "epoch": 5.090525045262523, "grad_norm": 5.646613597869873, "learning_rate": 1.9999900627150402e-05, "loss": 5.0888, "step": 4220 }, { "epoch": 5.102595051297525, "grad_norm": 6.026980876922607, "learning_rate": 1.999990038589561e-05, "loss": 5.1, "step": 4230 }, { "epoch": 5.1146650573325285, "grad_norm": 7.228109359741211, "learning_rate": 1.9999900144640815e-05, "loss": 5.1047, "step": 4240 }, { "epoch": 5.126735063367532, "grad_norm": 6.021022319793701, "learning_rate": 1.999989990338602e-05, "loss": 5.0924, "step": 4250 }, { "epoch": 5.138805069402535, "grad_norm": 6.311712741851807, "learning_rate": 1.9999899662131227e-05, "loss": 5.0664, "step": 4260 }, { "epoch": 5.150875075437538, "grad_norm": 5.831450939178467, "learning_rate": 1.9999899420876433e-05, "loss": 5.0517, "step": 4270 }, { "epoch": 5.16294508147254, "grad_norm": 5.826018810272217, "learning_rate": 1.999989917962164e-05, "loss": 5.0946, "step": 4280 }, { "epoch": 5.175015087507544, "grad_norm": 6.47373104095459, "learning_rate": 1.9999898938366846e-05, "loss": 5.0672, "step": 4290 }, { "epoch": 5.187085093542547, "grad_norm": 6.7830657958984375, "learning_rate": 1.9999898697112052e-05, "loss": 5.1114, "step": 4300 }, { "epoch": 5.19915509957755, "grad_norm": 6.044583797454834, "learning_rate": 1.9999898455857258e-05, "loss": 5.0832, "step": 4310 
}, { "epoch": 5.211225105612553, "grad_norm": 6.144287586212158, "learning_rate": 1.9999898214602464e-05, "loss": 5.0833, "step": 4320 }, { "epoch": 5.2232951116475554, "grad_norm": 5.412092208862305, "learning_rate": 1.999989797334767e-05, "loss": 5.1093, "step": 4330 }, { "epoch": 5.235365117682559, "grad_norm": 6.062222957611084, "learning_rate": 1.9999897732092877e-05, "loss": 5.1267, "step": 4340 }, { "epoch": 5.247435123717562, "grad_norm": 5.867584228515625, "learning_rate": 1.9999897490838083e-05, "loss": 5.1027, "step": 4350 }, { "epoch": 5.259505129752565, "grad_norm": 5.6109619140625, "learning_rate": 1.999989724958329e-05, "loss": 5.1078, "step": 4360 }, { "epoch": 5.271575135787568, "grad_norm": 6.168236255645752, "learning_rate": 1.9999897008328495e-05, "loss": 5.1008, "step": 4370 }, { "epoch": 5.2836451418225705, "grad_norm": 6.195356369018555, "learning_rate": 1.99998967670737e-05, "loss": 5.1148, "step": 4380 }, { "epoch": 5.295715147857574, "grad_norm": 6.47273063659668, "learning_rate": 1.9999896525818908e-05, "loss": 5.0979, "step": 4390 }, { "epoch": 5.307785153892577, "grad_norm": 6.748480319976807, "learning_rate": 1.9999896284564114e-05, "loss": 5.0915, "step": 4400 }, { "epoch": 5.31985515992758, "grad_norm": 5.7227783203125, "learning_rate": 1.999989604330932e-05, "loss": 5.0795, "step": 4410 }, { "epoch": 5.331925165962583, "grad_norm": 6.437520503997803, "learning_rate": 1.9999895802054526e-05, "loss": 5.1137, "step": 4420 }, { "epoch": 5.343995171997586, "grad_norm": 6.3151631355285645, "learning_rate": 1.9999895560799733e-05, "loss": 5.0728, "step": 4430 }, { "epoch": 5.356065178032589, "grad_norm": 5.572515487670898, "learning_rate": 1.999989531954494e-05, "loss": 5.107, "step": 4440 }, { "epoch": 5.368135184067592, "grad_norm": 5.284977436065674, "learning_rate": 1.9999895078290145e-05, "loss": 5.1158, "step": 4450 }, { "epoch": 5.380205190102595, "grad_norm": 5.7821574211120605, "learning_rate": 1.999989483703535e-05, "loss": 
5.0454, "step": 4460 }, { "epoch": 5.392275196137598, "grad_norm": 6.195584774017334, "learning_rate": 1.9999894595780554e-05, "loss": 5.1013, "step": 4470 }, { "epoch": 5.4043452021726015, "grad_norm": 6.023886203765869, "learning_rate": 1.999989435452576e-05, "loss": 5.0747, "step": 4480 }, { "epoch": 5.416415208207604, "grad_norm": 5.50214147567749, "learning_rate": 1.9999894113270967e-05, "loss": 5.0931, "step": 4490 }, { "epoch": 5.428485214242607, "grad_norm": 5.871306419372559, "learning_rate": 1.9999893872016173e-05, "loss": 5.0995, "step": 4500 }, { "epoch": 5.428485214242607, "eval_loss": 5.4549407958984375, "eval_runtime": 8.1428, "eval_samples_per_second": 85.597, "eval_steps_per_second": 10.807, "step": 4500 }, { "epoch": 5.44055522027761, "grad_norm": 5.521939754486084, "learning_rate": 1.999989363076138e-05, "loss": 5.0811, "step": 4510 }, { "epoch": 5.452625226312613, "grad_norm": 6.775201797485352, "learning_rate": 1.9999893389506585e-05, "loss": 5.061, "step": 4520 }, { "epoch": 5.464695232347617, "grad_norm": 6.263267993927002, "learning_rate": 1.999989314825179e-05, "loss": 5.1089, "step": 4530 }, { "epoch": 5.476765238382619, "grad_norm": 5.68864631652832, "learning_rate": 1.9999892906996998e-05, "loss": 5.0697, "step": 4540 }, { "epoch": 5.488835244417622, "grad_norm": 5.565762996673584, "learning_rate": 1.9999892665742204e-05, "loss": 5.0501, "step": 4550 }, { "epoch": 5.500905250452625, "grad_norm": 6.179160118103027, "learning_rate": 1.999989242448741e-05, "loss": 5.0556, "step": 4560 }, { "epoch": 5.512975256487628, "grad_norm": 6.172834396362305, "learning_rate": 1.9999892183232616e-05, "loss": 5.0847, "step": 4570 }, { "epoch": 5.525045262522632, "grad_norm": 5.6235127449035645, "learning_rate": 1.9999891941977822e-05, "loss": 5.0946, "step": 4580 }, { "epoch": 5.537115268557634, "grad_norm": 6.206549167633057, "learning_rate": 1.999989170072303e-05, "loss": 5.0792, "step": 4590 }, { "epoch": 5.549185274592637, "grad_norm": 
5.90172004699707, "learning_rate": 1.9999891459468235e-05, "loss": 5.1157, "step": 4600 }, { "epoch": 5.56125528062764, "grad_norm": 5.63142204284668, "learning_rate": 1.999989121821344e-05, "loss": 5.0684, "step": 4610 }, { "epoch": 5.5733252866626435, "grad_norm": 6.137442111968994, "learning_rate": 1.9999890976958647e-05, "loss": 5.0983, "step": 4620 }, { "epoch": 5.585395292697647, "grad_norm": 5.996283531188965, "learning_rate": 1.9999890735703854e-05, "loss": 5.1142, "step": 4630 }, { "epoch": 5.597465298732649, "grad_norm": 5.100437641143799, "learning_rate": 1.999989049444906e-05, "loss": 5.0659, "step": 4640 }, { "epoch": 5.609535304767652, "grad_norm": 6.021582126617432, "learning_rate": 1.9999890253194266e-05, "loss": 5.0931, "step": 4650 }, { "epoch": 5.621605310802655, "grad_norm": 5.52263879776001, "learning_rate": 1.9999890011939472e-05, "loss": 5.0752, "step": 4660 }, { "epoch": 5.6336753168376585, "grad_norm": 5.449699401855469, "learning_rate": 1.999988977068468e-05, "loss": 5.0832, "step": 4670 }, { "epoch": 5.645745322872662, "grad_norm": 5.539669990539551, "learning_rate": 1.9999889529429885e-05, "loss": 5.0512, "step": 4680 }, { "epoch": 5.657815328907665, "grad_norm": 5.864988803863525, "learning_rate": 1.999988928817509e-05, "loss": 5.0727, "step": 4690 }, { "epoch": 5.669885334942667, "grad_norm": 5.793348789215088, "learning_rate": 1.9999889046920297e-05, "loss": 5.0627, "step": 4700 }, { "epoch": 5.68195534097767, "grad_norm": 5.724533557891846, "learning_rate": 1.9999888805665503e-05, "loss": 5.0322, "step": 4710 }, { "epoch": 5.694025347012674, "grad_norm": 5.713397026062012, "learning_rate": 1.999988856441071e-05, "loss": 5.0831, "step": 4720 }, { "epoch": 5.706095353047677, "grad_norm": 6.055660247802734, "learning_rate": 1.9999888323155916e-05, "loss": 5.0823, "step": 4730 }, { "epoch": 5.718165359082679, "grad_norm": 5.7128496170043945, "learning_rate": 1.9999888081901122e-05, "loss": 5.0574, "step": 4740 }, { "epoch": 
5.730235365117682, "grad_norm": 5.553981304168701, "learning_rate": 1.9999887840646328e-05, "loss": 5.0522, "step": 4750 }, { "epoch": 5.7423053711526855, "grad_norm": 5.5248703956604, "learning_rate": 1.9999887599391534e-05, "loss": 5.071, "step": 4760 }, { "epoch": 5.754375377187689, "grad_norm": 5.703786373138428, "learning_rate": 1.999988735813674e-05, "loss": 5.08, "step": 4770 }, { "epoch": 5.766445383222692, "grad_norm": 6.309385776519775, "learning_rate": 1.9999887116881947e-05, "loss": 5.0673, "step": 4780 }, { "epoch": 5.778515389257695, "grad_norm": 5.689455032348633, "learning_rate": 1.9999886875627153e-05, "loss": 5.0658, "step": 4790 }, { "epoch": 5.790585395292697, "grad_norm": 5.735537528991699, "learning_rate": 1.999988663437236e-05, "loss": 5.064, "step": 4800 }, { "epoch": 5.8026554013277005, "grad_norm": 5.698563098907471, "learning_rate": 1.9999886393117565e-05, "loss": 5.0605, "step": 4810 }, { "epoch": 5.814725407362704, "grad_norm": 5.632929801940918, "learning_rate": 1.999988615186277e-05, "loss": 5.045, "step": 4820 }, { "epoch": 5.826795413397707, "grad_norm": 5.763766288757324, "learning_rate": 1.9999885910607978e-05, "loss": 5.0604, "step": 4830 }, { "epoch": 5.83886541943271, "grad_norm": 6.388914108276367, "learning_rate": 1.9999885669353184e-05, "loss": 5.07, "step": 4840 }, { "epoch": 5.850935425467712, "grad_norm": 5.539135456085205, "learning_rate": 1.999988542809839e-05, "loss": 5.0635, "step": 4850 }, { "epoch": 5.863005431502716, "grad_norm": 5.619162559509277, "learning_rate": 1.9999885186843597e-05, "loss": 5.088, "step": 4860 }, { "epoch": 5.875075437537719, "grad_norm": 5.756056785583496, "learning_rate": 1.9999884945588803e-05, "loss": 5.0278, "step": 4870 }, { "epoch": 5.887145443572722, "grad_norm": 6.571120262145996, "learning_rate": 1.9999884704334006e-05, "loss": 5.0567, "step": 4880 }, { "epoch": 5.899215449607725, "grad_norm": 5.549358367919922, "learning_rate": 1.9999884463079212e-05, "loss": 5.071, "step": 4890 }, 
{ "epoch": 5.9112854556427274, "grad_norm": 5.314532279968262, "learning_rate": 1.9999884221824418e-05, "loss": 5.0691, "step": 4900 }, { "epoch": 5.923355461677731, "grad_norm": 5.932446479797363, "learning_rate": 1.9999883980569624e-05, "loss": 5.0435, "step": 4910 }, { "epoch": 5.935425467712734, "grad_norm": 5.742858409881592, "learning_rate": 1.999988373931483e-05, "loss": 5.0401, "step": 4920 }, { "epoch": 5.947495473747737, "grad_norm": 5.2341084480285645, "learning_rate": 1.9999883498060037e-05, "loss": 5.0457, "step": 4930 }, { "epoch": 5.95956547978274, "grad_norm": 5.992002010345459, "learning_rate": 1.9999883256805243e-05, "loss": 5.0151, "step": 4940 }, { "epoch": 5.9716354858177425, "grad_norm": 6.653188705444336, "learning_rate": 1.999988301555045e-05, "loss": 5.0297, "step": 4950 }, { "epoch": 5.983705491852746, "grad_norm": 5.973825454711914, "learning_rate": 1.9999882774295655e-05, "loss": 5.0761, "step": 4960 }, { "epoch": 5.995775497887749, "grad_norm": 5.937385082244873, "learning_rate": 1.999988253304086e-05, "loss": 5.0804, "step": 4970 }, { "epoch": 6.007242003621002, "grad_norm": 6.461676597595215, "learning_rate": 1.9999882291786068e-05, "loss": 4.9093, "step": 4980 }, { "epoch": 6.019312009656005, "grad_norm": 6.657212734222412, "learning_rate": 1.9999882050531274e-05, "loss": 4.7704, "step": 4990 }, { "epoch": 6.031382015691008, "grad_norm": 6.585206985473633, "learning_rate": 1.999988180927648e-05, "loss": 4.7525, "step": 5000 }, { "epoch": 6.031382015691008, "eval_loss": 5.434846878051758, "eval_runtime": 8.1302, "eval_samples_per_second": 85.729, "eval_steps_per_second": 10.824, "step": 5000 }, { "epoch": 6.043452021726011, "grad_norm": 6.622472763061523, "learning_rate": 1.9999881568021686e-05, "loss": 4.7437, "step": 5010 }, { "epoch": 6.055522027761014, "grad_norm": 6.851701736450195, "learning_rate": 1.9999881326766893e-05, "loss": 4.7579, "step": 5020 }, { "epoch": 6.067592033796017, "grad_norm": 6.240424633026123, 
"learning_rate": 1.99998810855121e-05, "loss": 4.7337, "step": 5030 }, { "epoch": 6.07966203983102, "grad_norm": 5.8504767417907715, "learning_rate": 1.9999880844257305e-05, "loss": 4.7156, "step": 5040 }, { "epoch": 6.091732045866023, "grad_norm": 6.922354221343994, "learning_rate": 1.999988060300251e-05, "loss": 4.7577, "step": 5050 }, { "epoch": 6.103802051901026, "grad_norm": 6.277132034301758, "learning_rate": 1.9999880361747717e-05, "loss": 4.7929, "step": 5060 }, { "epoch": 6.115872057936029, "grad_norm": 6.1485595703125, "learning_rate": 1.9999880120492924e-05, "loss": 4.7627, "step": 5070 }, { "epoch": 6.127942063971032, "grad_norm": 6.594834327697754, "learning_rate": 1.999987987923813e-05, "loss": 4.7639, "step": 5080 }, { "epoch": 6.140012070006035, "grad_norm": 6.486940383911133, "learning_rate": 1.9999879637983336e-05, "loss": 4.7897, "step": 5090 }, { "epoch": 6.152082076041038, "grad_norm": 6.114495277404785, "learning_rate": 1.9999879396728542e-05, "loss": 4.785, "step": 5100 }, { "epoch": 6.164152082076041, "grad_norm": 6.1007280349731445, "learning_rate": 1.999987915547375e-05, "loss": 4.7909, "step": 5110 }, { "epoch": 6.176222088111044, "grad_norm": 6.366045951843262, "learning_rate": 1.9999878914218955e-05, "loss": 4.7667, "step": 5120 }, { "epoch": 6.188292094146047, "grad_norm": 6.484925746917725, "learning_rate": 1.9999878672964158e-05, "loss": 4.7609, "step": 5130 }, { "epoch": 6.20036210018105, "grad_norm": 6.360560894012451, "learning_rate": 1.9999878431709364e-05, "loss": 4.7814, "step": 5140 }, { "epoch": 6.2124321062160535, "grad_norm": 5.977457046508789, "learning_rate": 1.999987819045457e-05, "loss": 4.7863, "step": 5150 }, { "epoch": 6.224502112251056, "grad_norm": 5.970700263977051, "learning_rate": 1.9999877949199776e-05, "loss": 4.8042, "step": 5160 }, { "epoch": 6.236572118286059, "grad_norm": 6.769618988037109, "learning_rate": 1.9999877707944982e-05, "loss": 4.7754, "step": 5170 }, { "epoch": 6.248642124321062, "grad_norm": 
6.907576084136963, "learning_rate": 1.999987746669019e-05, "loss": 4.7714, "step": 5180 }, { "epoch": 6.260712130356065, "grad_norm": 6.720979690551758, "learning_rate": 1.9999877225435395e-05, "loss": 4.816, "step": 5190 }, { "epoch": 6.2727821363910685, "grad_norm": 6.272497653961182, "learning_rate": 1.99998769841806e-05, "loss": 4.7859, "step": 5200 }, { "epoch": 6.284852142426071, "grad_norm": 6.0941667556762695, "learning_rate": 1.9999876742925807e-05, "loss": 4.7755, "step": 5210 }, { "epoch": 6.296922148461074, "grad_norm": 6.62696647644043, "learning_rate": 1.9999876501671013e-05, "loss": 4.7793, "step": 5220 }, { "epoch": 6.308992154496077, "grad_norm": 6.401269912719727, "learning_rate": 1.999987626041622e-05, "loss": 4.7912, "step": 5230 }, { "epoch": 6.32106216053108, "grad_norm": 6.364692687988281, "learning_rate": 1.999987601916143e-05, "loss": 4.7761, "step": 5240 }, { "epoch": 6.333132166566084, "grad_norm": 6.186086654663086, "learning_rate": 1.9999875777906636e-05, "loss": 4.825, "step": 5250 }, { "epoch": 6.345202172601086, "grad_norm": 6.358392715454102, "learning_rate": 1.9999875536651842e-05, "loss": 4.8029, "step": 5260 }, { "epoch": 6.357272178636089, "grad_norm": 6.47671365737915, "learning_rate": 1.9999875295397048e-05, "loss": 4.794, "step": 5270 }, { "epoch": 6.369342184671092, "grad_norm": 6.533702850341797, "learning_rate": 1.9999875054142254e-05, "loss": 4.7993, "step": 5280 }, { "epoch": 6.3814121907060954, "grad_norm": 7.3025078773498535, "learning_rate": 1.9999874812887457e-05, "loss": 4.8072, "step": 5290 }, { "epoch": 6.393482196741099, "grad_norm": 6.682429313659668, "learning_rate": 1.9999874571632663e-05, "loss": 4.7953, "step": 5300 }, { "epoch": 6.405552202776102, "grad_norm": 6.97099494934082, "learning_rate": 1.999987433037787e-05, "loss": 4.8019, "step": 5310 }, { "epoch": 6.417622208811104, "grad_norm": 6.037415981292725, "learning_rate": 1.9999874089123076e-05, "loss": 4.7664, "step": 5320 }, { "epoch": 
6.429692214846107, "grad_norm": 6.67172384262085, "learning_rate": 1.9999873847868282e-05, "loss": 4.7796, "step": 5330 }, { "epoch": 6.4417622208811105, "grad_norm": 7.016972064971924, "learning_rate": 1.9999873606613488e-05, "loss": 4.8083, "step": 5340 }, { "epoch": 6.453832226916114, "grad_norm": 6.603565216064453, "learning_rate": 1.9999873365358694e-05, "loss": 4.8074, "step": 5350 }, { "epoch": 6.465902232951117, "grad_norm": 6.538936614990234, "learning_rate": 1.99998731241039e-05, "loss": 4.778, "step": 5360 }, { "epoch": 6.477972238986119, "grad_norm": 6.275637626647949, "learning_rate": 1.9999872882849107e-05, "loss": 4.8008, "step": 5370 }, { "epoch": 6.490042245021122, "grad_norm": 6.417126655578613, "learning_rate": 1.9999872641594313e-05, "loss": 4.7804, "step": 5380 }, { "epoch": 6.502112251056126, "grad_norm": 6.163278102874756, "learning_rate": 1.999987240033952e-05, "loss": 4.8056, "step": 5390 }, { "epoch": 6.514182257091129, "grad_norm": 6.189593315124512, "learning_rate": 1.9999872159084725e-05, "loss": 4.7675, "step": 5400 }, { "epoch": 6.526252263126132, "grad_norm": 6.282558441162109, "learning_rate": 1.999987191782993e-05, "loss": 4.8055, "step": 5410 }, { "epoch": 6.538322269161134, "grad_norm": 6.510317802429199, "learning_rate": 1.9999871676575138e-05, "loss": 4.8219, "step": 5420 }, { "epoch": 6.550392275196137, "grad_norm": 6.463437557220459, "learning_rate": 1.9999871435320344e-05, "loss": 4.8001, "step": 5430 }, { "epoch": 6.562462281231141, "grad_norm": 6.69637393951416, "learning_rate": 1.999987119406555e-05, "loss": 4.8107, "step": 5440 }, { "epoch": 6.574532287266144, "grad_norm": 6.205772399902344, "learning_rate": 1.9999870952810756e-05, "loss": 4.7828, "step": 5450 }, { "epoch": 6.586602293301147, "grad_norm": 6.278364658355713, "learning_rate": 1.9999870711555963e-05, "loss": 4.7805, "step": 5460 }, { "epoch": 6.598672299336149, "grad_norm": 6.581289291381836, "learning_rate": 1.999987047030117e-05, "loss": 4.8038, "step": 
5470 }, { "epoch": 6.6107423053711525, "grad_norm": 6.792966365814209, "learning_rate": 1.9999870229046375e-05, "loss": 4.7719, "step": 5480 }, { "epoch": 6.622812311406156, "grad_norm": 6.969714164733887, "learning_rate": 1.999986998779158e-05, "loss": 4.7662, "step": 5490 }, { "epoch": 6.634882317441159, "grad_norm": 6.415828704833984, "learning_rate": 1.9999869746536788e-05, "loss": 4.8175, "step": 5500 }, { "epoch": 6.634882317441159, "eval_loss": 5.37880277633667, "eval_runtime": 8.1249, "eval_samples_per_second": 85.786, "eval_steps_per_second": 10.831, "step": 5500 }, { "epoch": 6.646952323476162, "grad_norm": 6.136972427368164, "learning_rate": 1.9999869505281994e-05, "loss": 4.8208, "step": 5510 }, { "epoch": 6.659022329511165, "grad_norm": 6.965816020965576, "learning_rate": 1.99998692640272e-05, "loss": 4.8049, "step": 5520 }, { "epoch": 6.6710923355461675, "grad_norm": 6.2410969734191895, "learning_rate": 1.9999869022772406e-05, "loss": 4.8001, "step": 5530 }, { "epoch": 6.683162341581171, "grad_norm": 5.881069660186768, "learning_rate": 1.999986878151761e-05, "loss": 4.8356, "step": 5540 }, { "epoch": 6.695232347616174, "grad_norm": 6.018870830535889, "learning_rate": 1.9999868540262815e-05, "loss": 4.8202, "step": 5550 }, { "epoch": 6.707302353651177, "grad_norm": 6.043688774108887, "learning_rate": 1.999986829900802e-05, "loss": 4.7761, "step": 5560 }, { "epoch": 6.719372359686179, "grad_norm": 6.043087959289551, "learning_rate": 1.9999868057753228e-05, "loss": 4.7984, "step": 5570 }, { "epoch": 6.731442365721183, "grad_norm": 5.706445217132568, "learning_rate": 1.9999867816498434e-05, "loss": 4.7808, "step": 5580 }, { "epoch": 6.743512371756186, "grad_norm": 6.7181596755981445, "learning_rate": 1.999986757524364e-05, "loss": 4.8185, "step": 5590 }, { "epoch": 6.755582377791189, "grad_norm": 6.5804762840271, "learning_rate": 1.9999867333988846e-05, "loss": 4.803, "step": 5600 }, { "epoch": 6.767652383826192, "grad_norm": 6.019338607788086, 
"learning_rate": 1.9999867092734053e-05, "loss": 4.7983, "step": 5610 }, { "epoch": 6.779722389861195, "grad_norm": 6.569772720336914, "learning_rate": 1.999986685147926e-05, "loss": 4.776, "step": 5620 }, { "epoch": 6.791792395896198, "grad_norm": 6.26137638092041, "learning_rate": 1.9999866610224465e-05, "loss": 4.809, "step": 5630 }, { "epoch": 6.803862401931201, "grad_norm": 6.821934700012207, "learning_rate": 1.999986636896967e-05, "loss": 4.7856, "step": 5640 }, { "epoch": 6.815932407966204, "grad_norm": 6.875892639160156, "learning_rate": 1.9999866127714877e-05, "loss": 4.8119, "step": 5650 }, { "epoch": 6.828002414001207, "grad_norm": 6.308215618133545, "learning_rate": 1.9999865886460084e-05, "loss": 4.7967, "step": 5660 }, { "epoch": 6.8400724200362095, "grad_norm": 6.1612067222595215, "learning_rate": 1.999986564520529e-05, "loss": 4.7909, "step": 5670 }, { "epoch": 6.852142426071213, "grad_norm": 6.2947916984558105, "learning_rate": 1.9999865403950496e-05, "loss": 4.8235, "step": 5680 }, { "epoch": 6.864212432106216, "grad_norm": 5.551745414733887, "learning_rate": 1.9999865162695702e-05, "loss": 4.7944, "step": 5690 }, { "epoch": 6.876282438141219, "grad_norm": 6.090208053588867, "learning_rate": 1.999986492144091e-05, "loss": 4.8116, "step": 5700 }, { "epoch": 6.888352444176222, "grad_norm": 6.37030029296875, "learning_rate": 1.9999864680186115e-05, "loss": 4.8157, "step": 5710 }, { "epoch": 6.9004224502112255, "grad_norm": 6.531923770904541, "learning_rate": 1.999986443893132e-05, "loss": 4.8058, "step": 5720 }, { "epoch": 6.912492456246228, "grad_norm": 6.241397380828857, "learning_rate": 1.9999864197676527e-05, "loss": 4.7813, "step": 5730 }, { "epoch": 6.924562462281231, "grad_norm": 6.201876640319824, "learning_rate": 1.9999863956421733e-05, "loss": 4.7912, "step": 5740 }, { "epoch": 6.936632468316234, "grad_norm": 6.793189525604248, "learning_rate": 1.999986371516694e-05, "loss": 4.8203, "step": 5750 }, { "epoch": 6.948702474351237, "grad_norm": 
5.84963321685791, "learning_rate": 1.9999863473912146e-05, "loss": 4.8267, "step": 5760 }, { "epoch": 6.9607724803862405, "grad_norm": 5.46693754196167, "learning_rate": 1.9999863232657352e-05, "loss": 4.8139, "step": 5770 }, { "epoch": 6.972842486421243, "grad_norm": 6.119303226470947, "learning_rate": 1.9999862991402558e-05, "loss": 4.8032, "step": 5780 }, { "epoch": 6.984912492456246, "grad_norm": 6.194338798522949, "learning_rate": 1.9999862750147764e-05, "loss": 4.7988, "step": 5790 }, { "epoch": 6.996982498491249, "grad_norm": 6.114941120147705, "learning_rate": 1.999986250889297e-05, "loss": 4.8074, "step": 5800 }, { "epoch": 7.008449004224502, "grad_norm": 7.3534770011901855, "learning_rate": 1.9999862267638177e-05, "loss": 4.5662, "step": 5810 }, { "epoch": 7.020519010259505, "grad_norm": 6.404786586761475, "learning_rate": 1.9999862026383383e-05, "loss": 4.4726, "step": 5820 }, { "epoch": 7.032589016294508, "grad_norm": 7.582947254180908, "learning_rate": 1.999986178512859e-05, "loss": 4.4507, "step": 5830 }, { "epoch": 7.044659022329511, "grad_norm": 6.869821071624756, "learning_rate": 1.9999861543873795e-05, "loss": 4.4303, "step": 5840 }, { "epoch": 7.056729028364514, "grad_norm": 6.565378189086914, "learning_rate": 1.9999861302619e-05, "loss": 4.4113, "step": 5850 }, { "epoch": 7.068799034399517, "grad_norm": 7.595108509063721, "learning_rate": 1.9999861061364208e-05, "loss": 4.4502, "step": 5860 }, { "epoch": 7.0808690404345205, "grad_norm": 7.4440484046936035, "learning_rate": 1.9999860820109414e-05, "loss": 4.4178, "step": 5870 }, { "epoch": 7.092939046469524, "grad_norm": 6.920553207397461, "learning_rate": 1.999986057885462e-05, "loss": 4.4623, "step": 5880 }, { "epoch": 7.105009052504526, "grad_norm": 7.676276683807373, "learning_rate": 1.9999860337599827e-05, "loss": 4.4171, "step": 5890 }, { "epoch": 7.117079058539529, "grad_norm": 7.271197319030762, "learning_rate": 1.9999860096345033e-05, "loss": 4.4244, "step": 5900 }, { "epoch": 
7.129149064574532, "grad_norm": 6.891458034515381, "learning_rate": 1.999985985509024e-05, "loss": 4.4418, "step": 5910 }, { "epoch": 7.1412190706095355, "grad_norm": 7.107789516448975, "learning_rate": 1.9999859613835445e-05, "loss": 4.4663, "step": 5920 }, { "epoch": 7.153289076644539, "grad_norm": 6.229667663574219, "learning_rate": 1.999985937258065e-05, "loss": 4.4304, "step": 5930 }, { "epoch": 7.165359082679541, "grad_norm": 7.5302958488464355, "learning_rate": 1.9999859131325858e-05, "loss": 4.4496, "step": 5940 }, { "epoch": 7.177429088714544, "grad_norm": 6.849782466888428, "learning_rate": 1.9999858890071064e-05, "loss": 4.4211, "step": 5950 }, { "epoch": 7.189499094749547, "grad_norm": 7.080521106719971, "learning_rate": 1.9999858648816267e-05, "loss": 4.4432, "step": 5960 }, { "epoch": 7.201569100784551, "grad_norm": 7.322294235229492, "learning_rate": 1.9999858407561473e-05, "loss": 4.4544, "step": 5970 }, { "epoch": 7.213639106819554, "grad_norm": 6.8238749504089355, "learning_rate": 1.999985816630668e-05, "loss": 4.446, "step": 5980 }, { "epoch": 7.225709112854556, "grad_norm": 7.294742107391357, "learning_rate": 1.9999857925051885e-05, "loss": 4.4555, "step": 5990 }, { "epoch": 7.237779118889559, "grad_norm": 7.219674110412598, "learning_rate": 1.999985768379709e-05, "loss": 4.4573, "step": 6000 }, { "epoch": 7.237779118889559, "eval_loss": 5.471388339996338, "eval_runtime": 8.1333, "eval_samples_per_second": 85.697, "eval_steps_per_second": 10.82, "step": 6000 }, { "epoch": 7.2498491249245625, "grad_norm": 7.080127239227295, "learning_rate": 1.9999857442542298e-05, "loss": 4.4403, "step": 6010 }, { "epoch": 7.261919130959566, "grad_norm": 7.432303428649902, "learning_rate": 1.9999857201287504e-05, "loss": 4.4847, "step": 6020 }, { "epoch": 7.273989136994569, "grad_norm": 7.265688896179199, "learning_rate": 1.999985696003271e-05, "loss": 4.4692, "step": 6030 }, { "epoch": 7.286059143029571, "grad_norm": 7.351378440856934, "learning_rate": 
1.9999856718777916e-05, "loss": 4.4657, "step": 6040 }, { "epoch": 7.298129149064574, "grad_norm": 7.817484378814697, "learning_rate": 1.9999856477523123e-05, "loss": 4.4842, "step": 6050 }, { "epoch": 7.3101991550995775, "grad_norm": 7.24118709564209, "learning_rate": 1.999985623626833e-05, "loss": 4.4514, "step": 6060 }, { "epoch": 7.322269161134581, "grad_norm": 7.859687328338623, "learning_rate": 1.9999855995013535e-05, "loss": 4.4438, "step": 6070 }, { "epoch": 7.334339167169584, "grad_norm": 7.1962785720825195, "learning_rate": 1.999985575375874e-05, "loss": 4.5222, "step": 6080 }, { "epoch": 7.346409173204586, "grad_norm": 6.848715782165527, "learning_rate": 1.9999855512503947e-05, "loss": 4.4636, "step": 6090 }, { "epoch": 7.358479179239589, "grad_norm": 7.374521732330322, "learning_rate": 1.9999855271249154e-05, "loss": 4.5085, "step": 6100 }, { "epoch": 7.370549185274593, "grad_norm": 6.664852619171143, "learning_rate": 1.999985502999436e-05, "loss": 4.4772, "step": 6110 }, { "epoch": 7.382619191309596, "grad_norm": 6.824117660522461, "learning_rate": 1.9999854788739566e-05, "loss": 4.4801, "step": 6120 }, { "epoch": 7.394689197344599, "grad_norm": 7.4021100997924805, "learning_rate": 1.9999854547484772e-05, "loss": 4.4559, "step": 6130 }, { "epoch": 7.406759203379602, "grad_norm": 6.477844715118408, "learning_rate": 1.999985430622998e-05, "loss": 4.5006, "step": 6140 }, { "epoch": 7.418829209414604, "grad_norm": 7.234144687652588, "learning_rate": 1.9999854064975185e-05, "loss": 4.5131, "step": 6150 }, { "epoch": 7.430899215449608, "grad_norm": 7.267096042633057, "learning_rate": 1.999985382372039e-05, "loss": 4.4766, "step": 6160 }, { "epoch": 7.442969221484611, "grad_norm": 7.071993827819824, "learning_rate": 1.9999853582465597e-05, "loss": 4.4765, "step": 6170 }, { "epoch": 7.455039227519614, "grad_norm": 7.265273094177246, "learning_rate": 1.9999853341210803e-05, "loss": 4.4548, "step": 6180 }, { "epoch": 7.467109233554616, "grad_norm": 
7.5716352462768555, "learning_rate": 1.999985309995601e-05, "loss": 4.5014, "step": 6190 }, { "epoch": 7.4791792395896195, "grad_norm": 7.54452657699585, "learning_rate": 1.9999852858701216e-05, "loss": 4.5074, "step": 6200 }, { "epoch": 7.491249245624623, "grad_norm": 6.9370036125183105, "learning_rate": 1.999985261744642e-05, "loss": 4.497, "step": 6210 }, { "epoch": 7.503319251659626, "grad_norm": 7.526813983917236, "learning_rate": 1.9999852376191625e-05, "loss": 4.4912, "step": 6220 }, { "epoch": 7.515389257694629, "grad_norm": 7.471532344818115, "learning_rate": 1.999985213493683e-05, "loss": 4.5127, "step": 6230 }, { "epoch": 7.527459263729632, "grad_norm": 7.126790523529053, "learning_rate": 1.9999851893682037e-05, "loss": 4.4963, "step": 6240 }, { "epoch": 7.5395292697646346, "grad_norm": 6.9977827072143555, "learning_rate": 1.9999851652427244e-05, "loss": 4.52, "step": 6250 }, { "epoch": 7.551599275799638, "grad_norm": 6.805571556091309, "learning_rate": 1.999985141117245e-05, "loss": 4.4887, "step": 6260 }, { "epoch": 7.563669281834641, "grad_norm": 7.2792582511901855, "learning_rate": 1.9999851169917656e-05, "loss": 4.5198, "step": 6270 }, { "epoch": 7.575739287869644, "grad_norm": 7.13316535949707, "learning_rate": 1.9999850928662862e-05, "loss": 4.4778, "step": 6280 }, { "epoch": 7.587809293904647, "grad_norm": 7.960423469543457, "learning_rate": 1.999985068740807e-05, "loss": 4.532, "step": 6290 }, { "epoch": 7.59987929993965, "grad_norm": 7.563502788543701, "learning_rate": 1.9999850446153275e-05, "loss": 4.4954, "step": 6300 }, { "epoch": 7.611949305974653, "grad_norm": 7.183853626251221, "learning_rate": 1.999985020489848e-05, "loss": 4.5449, "step": 6310 }, { "epoch": 7.624019312009656, "grad_norm": 7.091987133026123, "learning_rate": 1.999984996364369e-05, "loss": 4.5306, "step": 6320 }, { "epoch": 7.636089318044659, "grad_norm": 6.971532821655273, "learning_rate": 1.9999849722388897e-05, "loss": 4.5162, "step": 6330 }, { "epoch": 
7.648159324079662, "grad_norm": 7.442251682281494, "learning_rate": 1.9999849481134103e-05, "loss": 4.5016, "step": 6340 }, { "epoch": 7.660229330114665, "grad_norm": 6.948423385620117, "learning_rate": 1.999984923987931e-05, "loss": 4.5112, "step": 6350 }, { "epoch": 7.672299336149668, "grad_norm": 7.3144917488098145, "learning_rate": 1.9999848998624515e-05, "loss": 4.4714, "step": 6360 }, { "epoch": 7.684369342184671, "grad_norm": 6.420163631439209, "learning_rate": 1.9999848757369718e-05, "loss": 4.5078, "step": 6370 }, { "epoch": 7.696439348219674, "grad_norm": 7.801774501800537, "learning_rate": 1.9999848516114924e-05, "loss": 4.5131, "step": 6380 }, { "epoch": 7.708509354254677, "grad_norm": 6.850042819976807, "learning_rate": 1.999984827486013e-05, "loss": 4.489, "step": 6390 }, { "epoch": 7.72057936028968, "grad_norm": 6.392319202423096, "learning_rate": 1.9999848033605337e-05, "loss": 4.5147, "step": 6400 }, { "epoch": 7.732649366324683, "grad_norm": 7.636480331420898, "learning_rate": 1.9999847792350543e-05, "loss": 4.5281, "step": 6410 }, { "epoch": 7.744719372359686, "grad_norm": 7.048392295837402, "learning_rate": 1.999984755109575e-05, "loss": 4.5428, "step": 6420 }, { "epoch": 7.756789378394689, "grad_norm": 6.470360279083252, "learning_rate": 1.9999847309840955e-05, "loss": 4.5235, "step": 6430 }, { "epoch": 7.7688593844296925, "grad_norm": 6.857886791229248, "learning_rate": 1.999984706858616e-05, "loss": 4.5254, "step": 6440 }, { "epoch": 7.780929390464696, "grad_norm": 7.3516669273376465, "learning_rate": 1.9999846827331368e-05, "loss": 4.497, "step": 6450 }, { "epoch": 7.792999396499698, "grad_norm": 6.490270614624023, "learning_rate": 1.9999846586076574e-05, "loss": 4.5155, "step": 6460 }, { "epoch": 7.805069402534701, "grad_norm": 6.8781890869140625, "learning_rate": 1.999984634482178e-05, "loss": 4.5454, "step": 6470 }, { "epoch": 7.817139408569704, "grad_norm": 6.428595542907715, "learning_rate": 1.9999846103566986e-05, "loss": 4.5416, 
"step": 6480 }, { "epoch": 7.8292094146047075, "grad_norm": 7.235943794250488, "learning_rate": 1.9999845862312193e-05, "loss": 4.5168, "step": 6490 }, { "epoch": 7.84127942063971, "grad_norm": 7.114769458770752, "learning_rate": 1.99998456210574e-05, "loss": 4.5458, "step": 6500 }, { "epoch": 7.84127942063971, "eval_loss": 5.4180145263671875, "eval_runtime": 8.1288, "eval_samples_per_second": 85.744, "eval_steps_per_second": 10.826, "step": 6500 }, { "epoch": 7.853349426674713, "grad_norm": 7.500901699066162, "learning_rate": 1.9999845379802605e-05, "loss": 4.4916, "step": 6510 }, { "epoch": 7.865419432709716, "grad_norm": 6.895925998687744, "learning_rate": 1.999984513854781e-05, "loss": 4.5342, "step": 6520 }, { "epoch": 7.877489438744719, "grad_norm": 7.424803733825684, "learning_rate": 1.9999844897293018e-05, "loss": 4.5086, "step": 6530 }, { "epoch": 7.889559444779723, "grad_norm": 7.11887264251709, "learning_rate": 1.9999844656038224e-05, "loss": 4.562, "step": 6540 }, { "epoch": 7.901629450814726, "grad_norm": 6.911796569824219, "learning_rate": 1.999984441478343e-05, "loss": 4.5379, "step": 6550 }, { "epoch": 7.913699456849728, "grad_norm": 7.480235576629639, "learning_rate": 1.9999844173528636e-05, "loss": 4.5253, "step": 6560 }, { "epoch": 7.925769462884731, "grad_norm": 6.785703659057617, "learning_rate": 1.9999843932273842e-05, "loss": 4.5469, "step": 6570 }, { "epoch": 7.9378394689197345, "grad_norm": 6.6393232345581055, "learning_rate": 1.999984369101905e-05, "loss": 4.5343, "step": 6580 }, { "epoch": 7.949909474954738, "grad_norm": 6.663791656494141, "learning_rate": 1.9999843449764255e-05, "loss": 4.5198, "step": 6590 }, { "epoch": 7.961979480989741, "grad_norm": 6.691219329833984, "learning_rate": 1.999984320850946e-05, "loss": 4.4987, "step": 6600 }, { "epoch": 7.974049487024743, "grad_norm": 6.992611885070801, "learning_rate": 1.9999842967254667e-05, "loss": 4.5402, "step": 6610 }, { "epoch": 7.986119493059746, "grad_norm": 7.632913589477539, 
"learning_rate": 1.999984272599987e-05, "loss": 4.4853, "step": 6620 }, { "epoch": 7.9981894990947495, "grad_norm": 6.551946640014648, "learning_rate": 1.9999842484745076e-05, "loss": 4.5392, "step": 6630 }, { "epoch": 8.009656004828003, "grad_norm": 8.822531700134277, "learning_rate": 1.9999842243490283e-05, "loss": 4.1846, "step": 6640 }, { "epoch": 8.021726010863006, "grad_norm": 8.355668067932129, "learning_rate": 1.999984200223549e-05, "loss": 4.0742, "step": 6650 }, { "epoch": 8.033796016898009, "grad_norm": 8.030898094177246, "learning_rate": 1.9999841760980695e-05, "loss": 4.0718, "step": 6660 }, { "epoch": 8.045866022933012, "grad_norm": 8.529818534851074, "learning_rate": 1.99998415197259e-05, "loss": 4.0383, "step": 6670 }, { "epoch": 8.057936028968015, "grad_norm": 9.210603713989258, "learning_rate": 1.9999841278471107e-05, "loss": 4.055, "step": 6680 }, { "epoch": 8.070006035003017, "grad_norm": 8.015311241149902, "learning_rate": 1.9999841037216314e-05, "loss": 4.049, "step": 6690 }, { "epoch": 8.08207604103802, "grad_norm": 8.165966987609863, "learning_rate": 1.999984079596152e-05, "loss": 4.0361, "step": 6700 }, { "epoch": 8.094146047073023, "grad_norm": 7.878964900970459, "learning_rate": 1.9999840554706726e-05, "loss": 4.0947, "step": 6710 }, { "epoch": 8.106216053108026, "grad_norm": 7.835506439208984, "learning_rate": 1.9999840313451932e-05, "loss": 4.0466, "step": 6720 }, { "epoch": 8.11828605914303, "grad_norm": 8.19532299041748, "learning_rate": 1.999984007219714e-05, "loss": 4.0783, "step": 6730 }, { "epoch": 8.130356065178033, "grad_norm": 9.27872371673584, "learning_rate": 1.9999839830942345e-05, "loss": 4.0902, "step": 6740 }, { "epoch": 8.142426071213036, "grad_norm": 8.069944381713867, "learning_rate": 1.999983958968755e-05, "loss": 4.11, "step": 6750 }, { "epoch": 8.154496077248039, "grad_norm": 7.868702411651611, "learning_rate": 1.9999839348432757e-05, "loss": 4.0763, "step": 6760 }, { "epoch": 8.166566083283042, "grad_norm": 
8.390815734863281, "learning_rate": 1.9999839107177963e-05, "loss": 4.0823, "step": 6770 }, { "epoch": 8.178636089318045, "grad_norm": 8.007402420043945, "learning_rate": 1.999983886592317e-05, "loss": 4.0678, "step": 6780 }, { "epoch": 8.190706095353047, "grad_norm": 8.887139320373535, "learning_rate": 1.9999838624668376e-05, "loss": 4.0297, "step": 6790 }, { "epoch": 8.20277610138805, "grad_norm": 7.79543924331665, "learning_rate": 1.9999838383413582e-05, "loss": 4.0901, "step": 6800 }, { "epoch": 8.214846107423053, "grad_norm": 9.026177406311035, "learning_rate": 1.9999838142158788e-05, "loss": 4.1151, "step": 6810 }, { "epoch": 8.226916113458056, "grad_norm": 8.395758628845215, "learning_rate": 1.9999837900903994e-05, "loss": 4.0999, "step": 6820 }, { "epoch": 8.23898611949306, "grad_norm": 8.978431701660156, "learning_rate": 1.99998376596492e-05, "loss": 4.099, "step": 6830 }, { "epoch": 8.251056125528063, "grad_norm": 8.005154609680176, "learning_rate": 1.9999837418394407e-05, "loss": 4.113, "step": 6840 }, { "epoch": 8.263126131563066, "grad_norm": 8.157316207885742, "learning_rate": 1.9999837177139613e-05, "loss": 4.081, "step": 6850 }, { "epoch": 8.27519613759807, "grad_norm": 8.12595272064209, "learning_rate": 1.999983693588482e-05, "loss": 4.0985, "step": 6860 }, { "epoch": 8.287266143633072, "grad_norm": 8.025388717651367, "learning_rate": 1.9999836694630025e-05, "loss": 4.1013, "step": 6870 }, { "epoch": 8.299336149668076, "grad_norm": 8.586197853088379, "learning_rate": 1.9999836453375232e-05, "loss": 4.0781, "step": 6880 }, { "epoch": 8.311406155703079, "grad_norm": 8.635668754577637, "learning_rate": 1.9999836212120438e-05, "loss": 4.1093, "step": 6890 }, { "epoch": 8.32347616173808, "grad_norm": 8.68919563293457, "learning_rate": 1.9999835970865644e-05, "loss": 4.1471, "step": 6900 }, { "epoch": 8.335546167773083, "grad_norm": 8.279667854309082, "learning_rate": 1.999983572961085e-05, "loss": 4.1086, "step": 6910 }, { "epoch": 8.347616173808087, 
"grad_norm": 7.815024375915527, "learning_rate": 1.9999835488356057e-05, "loss": 4.1579, "step": 6920 }, { "epoch": 8.35968617984309, "grad_norm": 7.521381378173828, "learning_rate": 1.9999835247101263e-05, "loss": 4.1393, "step": 6930 }, { "epoch": 8.371756185878093, "grad_norm": 9.081541061401367, "learning_rate": 1.999983500584647e-05, "loss": 4.1213, "step": 6940 }, { "epoch": 8.383826191913096, "grad_norm": 7.966409683227539, "learning_rate": 1.9999834764591675e-05, "loss": 4.1643, "step": 6950 }, { "epoch": 8.3958961979481, "grad_norm": 9.301593780517578, "learning_rate": 1.999983452333688e-05, "loss": 4.1278, "step": 6960 }, { "epoch": 8.407966203983102, "grad_norm": 8.071037292480469, "learning_rate": 1.9999834282082088e-05, "loss": 4.1249, "step": 6970 }, { "epoch": 8.420036210018106, "grad_norm": 8.3408203125, "learning_rate": 1.9999834040827294e-05, "loss": 4.1497, "step": 6980 }, { "epoch": 8.432106216053109, "grad_norm": 8.578961372375488, "learning_rate": 1.99998337995725e-05, "loss": 4.1434, "step": 6990 }, { "epoch": 8.44417622208811, "grad_norm": 8.839587211608887, "learning_rate": 1.9999833558317706e-05, "loss": 4.1526, "step": 7000 }, { "epoch": 8.44417622208811, "eval_loss": 5.625586032867432, "eval_runtime": 8.1263, "eval_samples_per_second": 85.771, "eval_steps_per_second": 10.829, "step": 7000 }, { "epoch": 8.456246228123113, "grad_norm": 8.679526329040527, "learning_rate": 1.9999833317062913e-05, "loss": 4.1292, "step": 7010 }, { "epoch": 8.468316234158117, "grad_norm": 8.20960807800293, "learning_rate": 1.999983307580812e-05, "loss": 4.1474, "step": 7020 }, { "epoch": 8.48038624019312, "grad_norm": 9.15270709991455, "learning_rate": 1.9999832834553325e-05, "loss": 4.1149, "step": 7030 }, { "epoch": 8.492456246228123, "grad_norm": 9.720219612121582, "learning_rate": 1.9999832593298528e-05, "loss": 4.1384, "step": 7040 }, { "epoch": 8.504526252263126, "grad_norm": 8.699888229370117, "learning_rate": 1.9999832352043734e-05, "loss": 4.1451, 
"step": 7050 }, { "epoch": 8.51659625829813, "grad_norm": 8.31874942779541, "learning_rate": 1.999983211078894e-05, "loss": 4.1202, "step": 7060 }, { "epoch": 8.528666264333133, "grad_norm": 8.672872543334961, "learning_rate": 1.9999831869534146e-05, "loss": 4.173, "step": 7070 }, { "epoch": 8.540736270368136, "grad_norm": 7.3553924560546875, "learning_rate": 1.9999831628279353e-05, "loss": 4.1523, "step": 7080 }, { "epoch": 8.552806276403139, "grad_norm": 8.860321998596191, "learning_rate": 1.999983138702456e-05, "loss": 4.1676, "step": 7090 }, { "epoch": 8.564876282438142, "grad_norm": 9.277664184570312, "learning_rate": 1.9999831145769765e-05, "loss": 4.1346, "step": 7100 }, { "epoch": 8.576946288473144, "grad_norm": 8.325603485107422, "learning_rate": 1.999983090451497e-05, "loss": 4.1579, "step": 7110 }, { "epoch": 8.589016294508147, "grad_norm": 8.735273361206055, "learning_rate": 1.9999830663260177e-05, "loss": 4.122, "step": 7120 }, { "epoch": 8.60108630054315, "grad_norm": 8.757452011108398, "learning_rate": 1.9999830422005384e-05, "loss": 4.1319, "step": 7130 }, { "epoch": 8.613156306578153, "grad_norm": 8.506098747253418, "learning_rate": 1.999983018075059e-05, "loss": 4.1673, "step": 7140 }, { "epoch": 8.625226312613156, "grad_norm": 8.723730087280273, "learning_rate": 1.9999829939495796e-05, "loss": 4.1531, "step": 7150 }, { "epoch": 8.63729631864816, "grad_norm": 8.179156303405762, "learning_rate": 1.9999829698241002e-05, "loss": 4.2071, "step": 7160 }, { "epoch": 8.649366324683163, "grad_norm": 7.696639060974121, "learning_rate": 1.999982945698621e-05, "loss": 4.1403, "step": 7170 }, { "epoch": 8.661436330718166, "grad_norm": 8.247722625732422, "learning_rate": 1.9999829215731415e-05, "loss": 4.1741, "step": 7180 }, { "epoch": 8.673506336753169, "grad_norm": 8.349963188171387, "learning_rate": 1.999982897447662e-05, "loss": 4.1344, "step": 7190 }, { "epoch": 8.68557634278817, "grad_norm": 9.192859649658203, "learning_rate": 1.9999828733221827e-05, 
"loss": 4.1953, "step": 7200 }, { "epoch": 8.697646348823174, "grad_norm": 8.10523509979248, "learning_rate": 1.9999828491967033e-05, "loss": 4.1711, "step": 7210 }, { "epoch": 8.709716354858177, "grad_norm": 8.819990158081055, "learning_rate": 1.999982825071224e-05, "loss": 4.183, "step": 7220 }, { "epoch": 8.72178636089318, "grad_norm": 7.96160364151001, "learning_rate": 1.9999828009457446e-05, "loss": 4.1819, "step": 7230 }, { "epoch": 8.733856366928183, "grad_norm": 8.150829315185547, "learning_rate": 1.9999827768202652e-05, "loss": 4.1739, "step": 7240 }, { "epoch": 8.745926372963186, "grad_norm": 8.422220230102539, "learning_rate": 1.9999827526947858e-05, "loss": 4.1969, "step": 7250 }, { "epoch": 8.75799637899819, "grad_norm": 8.429340362548828, "learning_rate": 1.9999827285693064e-05, "loss": 4.1807, "step": 7260 }, { "epoch": 8.770066385033193, "grad_norm": 8.459136962890625, "learning_rate": 1.999982704443827e-05, "loss": 4.1929, "step": 7270 }, { "epoch": 8.782136391068196, "grad_norm": 8.170445442199707, "learning_rate": 1.9999826803183477e-05, "loss": 4.2035, "step": 7280 }, { "epoch": 8.7942063971032, "grad_norm": 8.005834579467773, "learning_rate": 1.999982656192868e-05, "loss": 4.2064, "step": 7290 }, { "epoch": 8.806276403138202, "grad_norm": 8.548860549926758, "learning_rate": 1.9999826320673886e-05, "loss": 4.1578, "step": 7300 }, { "epoch": 8.818346409173204, "grad_norm": 7.619138240814209, "learning_rate": 1.9999826079419092e-05, "loss": 4.1589, "step": 7310 }, { "epoch": 8.830416415208207, "grad_norm": 8.375731468200684, "learning_rate": 1.99998258381643e-05, "loss": 4.1878, "step": 7320 }, { "epoch": 8.84248642124321, "grad_norm": 8.157581329345703, "learning_rate": 1.9999825596909505e-05, "loss": 4.1905, "step": 7330 }, { "epoch": 8.854556427278213, "grad_norm": 7.637290000915527, "learning_rate": 1.999982535565471e-05, "loss": 4.1844, "step": 7340 }, { "epoch": 8.866626433313217, "grad_norm": 8.014698028564453, "learning_rate": 
1.9999825114399917e-05, "loss": 4.2003, "step": 7350 }, { "epoch": 8.87869643934822, "grad_norm": 9.105371475219727, "learning_rate": 1.9999824873145123e-05, "loss": 4.1794, "step": 7360 }, { "epoch": 8.890766445383223, "grad_norm": 8.731332778930664, "learning_rate": 1.999982463189033e-05, "loss": 4.1676, "step": 7370 }, { "epoch": 8.902836451418226, "grad_norm": 8.488369941711426, "learning_rate": 1.9999824390635536e-05, "loss": 4.1902, "step": 7380 }, { "epoch": 8.91490645745323, "grad_norm": 8.5191650390625, "learning_rate": 1.9999824149380742e-05, "loss": 4.204, "step": 7390 }, { "epoch": 8.926976463488232, "grad_norm": 7.886916637420654, "learning_rate": 1.999982390812595e-05, "loss": 4.2067, "step": 7400 }, { "epoch": 8.939046469523234, "grad_norm": 8.538840293884277, "learning_rate": 1.9999823666871158e-05, "loss": 4.21, "step": 7410 }, { "epoch": 8.951116475558237, "grad_norm": 8.265067100524902, "learning_rate": 1.9999823425616364e-05, "loss": 4.1931, "step": 7420 }, { "epoch": 8.96318648159324, "grad_norm": 7.829092979431152, "learning_rate": 1.999982318436157e-05, "loss": 4.2256, "step": 7430 }, { "epoch": 8.975256487628243, "grad_norm": 8.517146110534668, "learning_rate": 1.9999822943106776e-05, "loss": 4.2043, "step": 7440 }, { "epoch": 8.987326493663247, "grad_norm": 7.979662895202637, "learning_rate": 1.999982270185198e-05, "loss": 4.2368, "step": 7450 }, { "epoch": 8.99939649969825, "grad_norm": 8.437711715698242, "learning_rate": 1.9999822460597185e-05, "loss": 4.2287, "step": 7460 }, { "epoch": 9.010863005431503, "grad_norm": 10.64576530456543, "learning_rate": 1.999982221934239e-05, "loss": 3.7118, "step": 7470 }, { "epoch": 9.022933011466506, "grad_norm": 9.551922798156738, "learning_rate": 1.9999821978087598e-05, "loss": 3.6884, "step": 7480 }, { "epoch": 9.03500301750151, "grad_norm": 9.469926834106445, "learning_rate": 1.9999821736832804e-05, "loss": 3.6102, "step": 7490 }, { "epoch": 9.047073023536512, "grad_norm": 10.00442123413086, 
"learning_rate": 1.999982149557801e-05, "loss": 3.634, "step": 7500 }, { "epoch": 9.047073023536512, "eval_loss": 5.875026702880859, "eval_runtime": 8.138, "eval_samples_per_second": 85.648, "eval_steps_per_second": 10.814, "step": 7500 }, { "epoch": 9.059143029571516, "grad_norm": 9.12352466583252, "learning_rate": 1.9999821254323216e-05, "loss": 3.611, "step": 7510 }, { "epoch": 9.071213035606517, "grad_norm": 9.455389022827148, "learning_rate": 1.9999821013068423e-05, "loss": 3.6552, "step": 7520 }, { "epoch": 9.08328304164152, "grad_norm": 9.927096366882324, "learning_rate": 1.999982077181363e-05, "loss": 3.6419, "step": 7530 }, { "epoch": 9.095353047676523, "grad_norm": 10.488615989685059, "learning_rate": 1.9999820530558835e-05, "loss": 3.6312, "step": 7540 }, { "epoch": 9.107423053711527, "grad_norm": 11.213622093200684, "learning_rate": 1.999982028930404e-05, "loss": 3.6515, "step": 7550 }, { "epoch": 9.11949305974653, "grad_norm": 9.77283000946045, "learning_rate": 1.9999820048049248e-05, "loss": 3.6831, "step": 7560 }, { "epoch": 9.131563065781533, "grad_norm": 9.813368797302246, "learning_rate": 1.9999819806794454e-05, "loss": 3.6371, "step": 7570 }, { "epoch": 9.143633071816536, "grad_norm": 9.863834381103516, "learning_rate": 1.999981956553966e-05, "loss": 3.6551, "step": 7580 }, { "epoch": 9.15570307785154, "grad_norm": 10.13169002532959, "learning_rate": 1.9999819324284866e-05, "loss": 3.6474, "step": 7590 }, { "epoch": 9.167773083886543, "grad_norm": 9.484007835388184, "learning_rate": 1.9999819083030072e-05, "loss": 3.6792, "step": 7600 }, { "epoch": 9.179843089921546, "grad_norm": 9.543051719665527, "learning_rate": 1.999981884177528e-05, "loss": 3.6388, "step": 7610 }, { "epoch": 9.191913095956547, "grad_norm": 10.169478416442871, "learning_rate": 1.9999818600520485e-05, "loss": 3.6634, "step": 7620 }, { "epoch": 9.20398310199155, "grad_norm": 10.52901554107666, "learning_rate": 1.999981835926569e-05, "loss": 3.6521, "step": 7630 }, { "epoch": 
9.216053108026554, "grad_norm": 9.047080039978027, "learning_rate": 1.9999818118010897e-05, "loss": 3.6837, "step": 7640 }, { "epoch": 9.228123114061557, "grad_norm": 10.44336986541748, "learning_rate": 1.9999817876756104e-05, "loss": 3.6413, "step": 7650 }, { "epoch": 9.24019312009656, "grad_norm": 10.250588417053223, "learning_rate": 1.999981763550131e-05, "loss": 3.6855, "step": 7660 }, { "epoch": 9.252263126131563, "grad_norm": 9.687636375427246, "learning_rate": 1.9999817394246516e-05, "loss": 3.7202, "step": 7670 }, { "epoch": 9.264333132166566, "grad_norm": 10.04291820526123, "learning_rate": 1.9999817152991722e-05, "loss": 3.7016, "step": 7680 }, { "epoch": 9.27640313820157, "grad_norm": 10.635927200317383, "learning_rate": 1.999981691173693e-05, "loss": 3.6815, "step": 7690 }, { "epoch": 9.288473144236573, "grad_norm": 9.766491889953613, "learning_rate": 1.999981667048213e-05, "loss": 3.6982, "step": 7700 }, { "epoch": 9.300543150271576, "grad_norm": 10.2587251663208, "learning_rate": 1.9999816429227337e-05, "loss": 3.7188, "step": 7710 }, { "epoch": 9.312613156306579, "grad_norm": 10.085198402404785, "learning_rate": 1.9999816187972544e-05, "loss": 3.7165, "step": 7720 }, { "epoch": 9.32468316234158, "grad_norm": 9.487380027770996, "learning_rate": 1.999981594671775e-05, "loss": 3.6848, "step": 7730 }, { "epoch": 9.336753168376584, "grad_norm": 10.451848983764648, "learning_rate": 1.9999815705462956e-05, "loss": 3.7351, "step": 7740 }, { "epoch": 9.348823174411587, "grad_norm": 10.552987098693848, "learning_rate": 1.9999815464208162e-05, "loss": 3.7304, "step": 7750 }, { "epoch": 9.36089318044659, "grad_norm": 9.322263717651367, "learning_rate": 1.999981522295337e-05, "loss": 3.7513, "step": 7760 }, { "epoch": 9.372963186481593, "grad_norm": 10.182112693786621, "learning_rate": 1.9999814981698575e-05, "loss": 3.7187, "step": 7770 }, { "epoch": 9.385033192516596, "grad_norm": 9.715757369995117, "learning_rate": 1.999981474044378e-05, "loss": 3.7178, 
"step": 7780 }, { "epoch": 9.3971031985516, "grad_norm": 9.357425689697266, "learning_rate": 1.9999814499188987e-05, "loss": 3.7094, "step": 7790 }, { "epoch": 9.409173204586603, "grad_norm": 10.089354515075684, "learning_rate": 1.9999814257934193e-05, "loss": 3.7467, "step": 7800 }, { "epoch": 9.421243210621606, "grad_norm": 9.571434020996094, "learning_rate": 1.99998140166794e-05, "loss": 3.7438, "step": 7810 }, { "epoch": 9.43331321665661, "grad_norm": 11.000099182128906, "learning_rate": 1.9999813775424606e-05, "loss": 3.7591, "step": 7820 }, { "epoch": 9.44538322269161, "grad_norm": 9.883132934570312, "learning_rate": 1.9999813534169812e-05, "loss": 3.7413, "step": 7830 }, { "epoch": 9.457453228726614, "grad_norm": 9.695323944091797, "learning_rate": 1.9999813292915018e-05, "loss": 3.7219, "step": 7840 }, { "epoch": 9.469523234761617, "grad_norm": 9.839624404907227, "learning_rate": 1.9999813051660224e-05, "loss": 3.7208, "step": 7850 }, { "epoch": 9.48159324079662, "grad_norm": 9.405845642089844, "learning_rate": 1.999981281040543e-05, "loss": 3.7554, "step": 7860 }, { "epoch": 9.493663246831623, "grad_norm": 10.221710205078125, "learning_rate": 1.9999812569150637e-05, "loss": 3.7596, "step": 7870 }, { "epoch": 9.505733252866627, "grad_norm": 9.232605934143066, "learning_rate": 1.9999812327895843e-05, "loss": 3.7628, "step": 7880 }, { "epoch": 9.51780325890163, "grad_norm": 9.905844688415527, "learning_rate": 1.999981208664105e-05, "loss": 3.7372, "step": 7890 }, { "epoch": 9.529873264936633, "grad_norm": 9.840839385986328, "learning_rate": 1.9999811845386256e-05, "loss": 3.7551, "step": 7900 }, { "epoch": 9.541943270971636, "grad_norm": 9.857927322387695, "learning_rate": 1.9999811604131462e-05, "loss": 3.7804, "step": 7910 }, { "epoch": 9.55401327700664, "grad_norm": 10.727806091308594, "learning_rate": 1.9999811362876668e-05, "loss": 3.7839, "step": 7920 }, { "epoch": 9.566083283041642, "grad_norm": 9.876007080078125, "learning_rate": 
1.9999811121621874e-05, "loss": 3.7386, "step": 7930 }, { "epoch": 9.578153289076644, "grad_norm": 9.813557624816895, "learning_rate": 1.999981088036708e-05, "loss": 3.7519, "step": 7940 }, { "epoch": 9.590223295111647, "grad_norm": 10.094962120056152, "learning_rate": 1.9999810639112287e-05, "loss": 3.7416, "step": 7950 }, { "epoch": 9.60229330114665, "grad_norm": 10.115303993225098, "learning_rate": 1.9999810397857493e-05, "loss": 3.7958, "step": 7960 }, { "epoch": 9.614363307181653, "grad_norm": 8.648655891418457, "learning_rate": 1.99998101566027e-05, "loss": 3.7856, "step": 7970 }, { "epoch": 9.626433313216657, "grad_norm": 9.45441722869873, "learning_rate": 1.9999809915347905e-05, "loss": 3.775, "step": 7980 }, { "epoch": 9.63850331925166, "grad_norm": 10.677288055419922, "learning_rate": 1.999980967409311e-05, "loss": 3.7665, "step": 7990 }, { "epoch": 9.650573325286663, "grad_norm": 10.962929725646973, "learning_rate": 1.9999809432838318e-05, "loss": 3.7604, "step": 8000 }, { "epoch": 9.650573325286663, "eval_loss": 5.890118598937988, "eval_runtime": 8.1495, "eval_samples_per_second": 85.526, "eval_steps_per_second": 10.798, "step": 8000 }, { "epoch": 9.662643331321666, "grad_norm": 10.70326042175293, "learning_rate": 1.9999809191583524e-05, "loss": 3.7412, "step": 8010 }, { "epoch": 9.67471333735667, "grad_norm": 10.233471870422363, "learning_rate": 1.999980895032873e-05, "loss": 3.7907, "step": 8020 }, { "epoch": 9.68678334339167, "grad_norm": 9.891739845275879, "learning_rate": 1.9999808709073936e-05, "loss": 3.796, "step": 8030 }, { "epoch": 9.698853349426674, "grad_norm": 8.8497896194458, "learning_rate": 1.9999808467819143e-05, "loss": 3.7663, "step": 8040 }, { "epoch": 9.710923355461677, "grad_norm": 9.907861709594727, "learning_rate": 1.999980822656435e-05, "loss": 3.7873, "step": 8050 }, { "epoch": 9.72299336149668, "grad_norm": 9.72838306427002, "learning_rate": 1.9999807985309555e-05, "loss": 3.768, "step": 8060 }, { "epoch": 9.735063367531684, 
"grad_norm": 10.87629222869873, "learning_rate": 1.999980774405476e-05, "loss": 3.8059, "step": 8070 }, { "epoch": 9.747133373566687, "grad_norm": 9.857516288757324, "learning_rate": 1.9999807502799967e-05, "loss": 3.8055, "step": 8080 }, { "epoch": 9.75920337960169, "grad_norm": 10.452301979064941, "learning_rate": 1.9999807261545174e-05, "loss": 3.7572, "step": 8090 }, { "epoch": 9.771273385636693, "grad_norm": 10.06588363647461, "learning_rate": 1.999980702029038e-05, "loss": 3.8013, "step": 8100 }, { "epoch": 9.783343391671696, "grad_norm": 9.845911979675293, "learning_rate": 1.9999806779035583e-05, "loss": 3.7982, "step": 8110 }, { "epoch": 9.7954133977067, "grad_norm": 10.691495895385742, "learning_rate": 1.999980653778079e-05, "loss": 3.7968, "step": 8120 }, { "epoch": 9.807483403741703, "grad_norm": 9.792536735534668, "learning_rate": 1.9999806296525995e-05, "loss": 3.7896, "step": 8130 }, { "epoch": 9.819553409776704, "grad_norm": 10.069684028625488, "learning_rate": 1.99998060552712e-05, "loss": 3.795, "step": 8140 }, { "epoch": 9.831623415811707, "grad_norm": 9.246320724487305, "learning_rate": 1.9999805814016408e-05, "loss": 3.8366, "step": 8150 }, { "epoch": 9.84369342184671, "grad_norm": 10.151102066040039, "learning_rate": 1.9999805572761614e-05, "loss": 3.8022, "step": 8160 }, { "epoch": 9.855763427881714, "grad_norm": 9.494010925292969, "learning_rate": 1.999980533150682e-05, "loss": 3.8186, "step": 8170 }, { "epoch": 9.867833433916717, "grad_norm": 10.388167381286621, "learning_rate": 1.9999805090252026e-05, "loss": 3.7937, "step": 8180 }, { "epoch": 9.87990343995172, "grad_norm": 9.596305847167969, "learning_rate": 1.9999804848997232e-05, "loss": 3.777, "step": 8190 }, { "epoch": 9.891973445986723, "grad_norm": 9.457122802734375, "learning_rate": 1.999980460774244e-05, "loss": 3.8301, "step": 8200 }, { "epoch": 9.904043452021726, "grad_norm": 9.826644897460938, "learning_rate": 1.9999804366487645e-05, "loss": 3.7968, "step": 8210 }, { "epoch": 
9.91611345805673, "grad_norm": 9.758737564086914, "learning_rate": 1.999980412523285e-05, "loss": 3.7917, "step": 8220 }, { "epoch": 9.928183464091733, "grad_norm": 9.707268714904785, "learning_rate": 1.9999803883978057e-05, "loss": 3.7922, "step": 8230 }, { "epoch": 9.940253470126734, "grad_norm": 9.858986854553223, "learning_rate": 1.9999803642723263e-05, "loss": 3.8184, "step": 8240 }, { "epoch": 9.952323476161737, "grad_norm": 10.84862232208252, "learning_rate": 1.999980340146847e-05, "loss": 3.7943, "step": 8250 }, { "epoch": 9.96439348219674, "grad_norm": 9.707345962524414, "learning_rate": 1.9999803160213676e-05, "loss": 3.8337, "step": 8260 }, { "epoch": 9.976463488231744, "grad_norm": 10.693458557128906, "learning_rate": 1.9999802918958882e-05, "loss": 3.8453, "step": 8270 }, { "epoch": 9.988533494266747, "grad_norm": 10.093518257141113, "learning_rate": 1.9999802677704088e-05, "loss": 3.8006, "step": 8280 }, { "epoch": 10.0, "grad_norm": 13.59939956665039, "learning_rate": 1.9999802436449295e-05, "loss": 3.8498, "step": 8290 }, { "epoch": 10.012070006035003, "grad_norm": 11.643899917602539, "learning_rate": 1.99998021951945e-05, "loss": 3.2151, "step": 8300 }, { "epoch": 10.024140012070006, "grad_norm": 11.863248825073242, "learning_rate": 1.9999801953939707e-05, "loss": 3.2066, "step": 8310 }, { "epoch": 10.03621001810501, "grad_norm": 11.347163200378418, "learning_rate": 1.9999801712684913e-05, "loss": 3.155, "step": 8320 }, { "epoch": 10.048280024140013, "grad_norm": 11.457657814025879, "learning_rate": 1.999980147143012e-05, "loss": 3.1508, "step": 8330 }, { "epoch": 10.060350030175016, "grad_norm": 11.252174377441406, "learning_rate": 1.9999801230175326e-05, "loss": 3.1903, "step": 8340 }, { "epoch": 10.072420036210017, "grad_norm": 11.139976501464844, "learning_rate": 1.9999800988920532e-05, "loss": 3.1568, "step": 8350 }, { "epoch": 10.08449004224502, "grad_norm": 11.103819847106934, "learning_rate": 1.9999800747665735e-05, "loss": 3.1615, "step": 
8360 }, { "epoch": 10.096560048280024, "grad_norm": 11.155998229980469, "learning_rate": 1.999980050641094e-05, "loss": 3.2127, "step": 8370 }, { "epoch": 10.108630054315027, "grad_norm": 11.283234596252441, "learning_rate": 1.9999800265156147e-05, "loss": 3.1676, "step": 8380 }, { "epoch": 10.12070006035003, "grad_norm": 10.953408241271973, "learning_rate": 1.9999800023901353e-05, "loss": 3.1907, "step": 8390 }, { "epoch": 10.132770066385033, "grad_norm": 10.849139213562012, "learning_rate": 1.999979978264656e-05, "loss": 3.1828, "step": 8400 }, { "epoch": 10.144840072420036, "grad_norm": 11.947012901306152, "learning_rate": 1.9999799541391766e-05, "loss": 3.23, "step": 8410 }, { "epoch": 10.15691007845504, "grad_norm": 11.871581077575684, "learning_rate": 1.9999799300136972e-05, "loss": 3.1866, "step": 8420 }, { "epoch": 10.168980084490043, "grad_norm": 12.005480766296387, "learning_rate": 1.9999799058882178e-05, "loss": 3.2005, "step": 8430 }, { "epoch": 10.181050090525046, "grad_norm": 11.375064849853516, "learning_rate": 1.9999798817627384e-05, "loss": 3.2063, "step": 8440 }, { "epoch": 10.193120096560047, "grad_norm": 11.376068115234375, "learning_rate": 1.999979857637259e-05, "loss": 3.1945, "step": 8450 }, { "epoch": 10.20519010259505, "grad_norm": 11.965143203735352, "learning_rate": 1.9999798335117797e-05, "loss": 3.2265, "step": 8460 }, { "epoch": 10.217260108630054, "grad_norm": 11.070598602294922, "learning_rate": 1.9999798093863003e-05, "loss": 3.1936, "step": 8470 }, { "epoch": 10.229330114665057, "grad_norm": 12.143216133117676, "learning_rate": 1.9999797852608213e-05, "loss": 3.2273, "step": 8480 }, { "epoch": 10.24140012070006, "grad_norm": 12.296282768249512, "learning_rate": 1.999979761135342e-05, "loss": 3.2457, "step": 8490 }, { "epoch": 10.253470126735063, "grad_norm": 10.93585205078125, "learning_rate": 1.9999797370098625e-05, "loss": 3.2567, "step": 8500 }, { "epoch": 10.253470126735063, "eval_loss": 6.273431777954102, "eval_runtime": 
8.1267, "eval_samples_per_second": 85.766, "eval_steps_per_second": 10.828, "step": 8500 }, { "epoch": 10.265540132770067, "grad_norm": 11.555421829223633, "learning_rate": 1.999979712884383e-05, "loss": 3.2777, "step": 8510 }, { "epoch": 10.27761013880507, "grad_norm": 10.941624641418457, "learning_rate": 1.9999796887589037e-05, "loss": 3.244, "step": 8520 }, { "epoch": 10.289680144840073, "grad_norm": 10.940681457519531, "learning_rate": 1.999979664633424e-05, "loss": 3.2558, "step": 8530 }, { "epoch": 10.301750150875076, "grad_norm": 11.643831253051758, "learning_rate": 1.9999796405079447e-05, "loss": 3.2275, "step": 8540 }, { "epoch": 10.31382015691008, "grad_norm": 12.701855659484863, "learning_rate": 1.9999796163824653e-05, "loss": 3.2731, "step": 8550 }, { "epoch": 10.32589016294508, "grad_norm": 11.714604377746582, "learning_rate": 1.999979592256986e-05, "loss": 3.3001, "step": 8560 }, { "epoch": 10.337960168980084, "grad_norm": 12.141691207885742, "learning_rate": 1.9999795681315065e-05, "loss": 3.2684, "step": 8570 }, { "epoch": 10.350030175015087, "grad_norm": 12.15954875946045, "learning_rate": 1.999979544006027e-05, "loss": 3.2987, "step": 8580 }, { "epoch": 10.36210018105009, "grad_norm": 11.77973747253418, "learning_rate": 1.9999795198805478e-05, "loss": 3.2717, "step": 8590 }, { "epoch": 10.374170187085094, "grad_norm": 12.286015510559082, "learning_rate": 1.9999794957550684e-05, "loss": 3.3125, "step": 8600 }, { "epoch": 10.386240193120097, "grad_norm": 11.290699005126953, "learning_rate": 1.999979471629589e-05, "loss": 3.2586, "step": 8610 }, { "epoch": 10.3983101991551, "grad_norm": 11.563714027404785, "learning_rate": 1.9999794475041096e-05, "loss": 3.2631, "step": 8620 }, { "epoch": 10.410380205190103, "grad_norm": 10.571041107177734, "learning_rate": 1.9999794233786302e-05, "loss": 3.2778, "step": 8630 }, { "epoch": 10.422450211225106, "grad_norm": 11.867912292480469, "learning_rate": 1.999979399253151e-05, "loss": 3.2808, "step": 8640 }, { 
"epoch": 10.43452021726011, "grad_norm": 10.994965553283691, "learning_rate": 1.9999793751276715e-05, "loss": 3.2927, "step": 8650 }, { "epoch": 10.446590223295111, "grad_norm": 12.00162410736084, "learning_rate": 1.999979351002192e-05, "loss": 3.3022, "step": 8660 }, { "epoch": 10.458660229330114, "grad_norm": 11.726410865783691, "learning_rate": 1.9999793268767127e-05, "loss": 3.2814, "step": 8670 }, { "epoch": 10.470730235365117, "grad_norm": 11.88224983215332, "learning_rate": 1.9999793027512334e-05, "loss": 3.2851, "step": 8680 }, { "epoch": 10.48280024140012, "grad_norm": 11.749469757080078, "learning_rate": 1.999979278625754e-05, "loss": 3.3305, "step": 8690 }, { "epoch": 10.494870247435124, "grad_norm": 11.05395221710205, "learning_rate": 1.9999792545002746e-05, "loss": 3.3242, "step": 8700 }, { "epoch": 10.506940253470127, "grad_norm": 10.41895580291748, "learning_rate": 1.9999792303747952e-05, "loss": 3.2934, "step": 8710 }, { "epoch": 10.51901025950513, "grad_norm": 11.80549144744873, "learning_rate": 1.999979206249316e-05, "loss": 3.3272, "step": 8720 }, { "epoch": 10.531080265540133, "grad_norm": 12.256775856018066, "learning_rate": 1.9999791821238365e-05, "loss": 3.3194, "step": 8730 }, { "epoch": 10.543150271575136, "grad_norm": 11.953177452087402, "learning_rate": 1.999979157998357e-05, "loss": 3.3421, "step": 8740 }, { "epoch": 10.55522027761014, "grad_norm": 11.770605087280273, "learning_rate": 1.9999791338728777e-05, "loss": 3.2966, "step": 8750 }, { "epoch": 10.567290283645141, "grad_norm": 12.233461380004883, "learning_rate": 1.9999791097473983e-05, "loss": 3.3346, "step": 8760 }, { "epoch": 10.579360289680144, "grad_norm": 11.780975341796875, "learning_rate": 1.999979085621919e-05, "loss": 3.3, "step": 8770 }, { "epoch": 10.591430295715147, "grad_norm": 11.847624778747559, "learning_rate": 1.9999790614964392e-05, "loss": 3.3384, "step": 8780 }, { "epoch": 10.60350030175015, "grad_norm": 12.042078018188477, "learning_rate": 
1.99997903737096e-05, "loss": 3.3216, "step": 8790 }, { "epoch": 10.615570307785154, "grad_norm": 10.747603416442871, "learning_rate": 1.9999790132454805e-05, "loss": 3.3273, "step": 8800 }, { "epoch": 10.627640313820157, "grad_norm": 12.961419105529785, "learning_rate": 1.999978989120001e-05, "loss": 3.2983, "step": 8810 }, { "epoch": 10.63971031985516, "grad_norm": 11.36508846282959, "learning_rate": 1.9999789649945217e-05, "loss": 3.3475, "step": 8820 }, { "epoch": 10.651780325890163, "grad_norm": 12.358994483947754, "learning_rate": 1.9999789408690423e-05, "loss": 3.3456, "step": 8830 }, { "epoch": 10.663850331925167, "grad_norm": 13.851675987243652, "learning_rate": 1.999978916743563e-05, "loss": 3.3133, "step": 8840 }, { "epoch": 10.67592033796017, "grad_norm": 12.147165298461914, "learning_rate": 1.9999788926180836e-05, "loss": 3.3593, "step": 8850 }, { "epoch": 10.687990343995171, "grad_norm": 11.162426948547363, "learning_rate": 1.9999788684926042e-05, "loss": 3.3527, "step": 8860 }, { "epoch": 10.700060350030174, "grad_norm": 11.23224925994873, "learning_rate": 1.9999788443671248e-05, "loss": 3.3442, "step": 8870 }, { "epoch": 10.712130356065177, "grad_norm": 12.253539085388184, "learning_rate": 1.9999788202416454e-05, "loss": 3.3621, "step": 8880 }, { "epoch": 10.72420036210018, "grad_norm": 10.807220458984375, "learning_rate": 1.999978796116166e-05, "loss": 3.3596, "step": 8890 }, { "epoch": 10.736270368135184, "grad_norm": 11.863422393798828, "learning_rate": 1.9999787719906867e-05, "loss": 3.3685, "step": 8900 }, { "epoch": 10.748340374170187, "grad_norm": 11.473676681518555, "learning_rate": 1.9999787478652073e-05, "loss": 3.3492, "step": 8910 }, { "epoch": 10.76041038020519, "grad_norm": 10.860541343688965, "learning_rate": 1.999978723739728e-05, "loss": 3.3502, "step": 8920 }, { "epoch": 10.772480386240193, "grad_norm": 12.084607124328613, "learning_rate": 1.9999786996142486e-05, "loss": 3.3639, "step": 8930 }, { "epoch": 10.784550392275197, 
"grad_norm": 11.52517032623291, "learning_rate": 1.9999786754887692e-05, "loss": 3.3757, "step": 8940 }, { "epoch": 10.7966203983102, "grad_norm": 11.954156875610352, "learning_rate": 1.9999786513632898e-05, "loss": 3.3042, "step": 8950 }, { "epoch": 10.808690404345203, "grad_norm": 13.039593696594238, "learning_rate": 1.9999786272378104e-05, "loss": 3.3508, "step": 8960 }, { "epoch": 10.820760410380204, "grad_norm": 10.8887357711792, "learning_rate": 1.999978603112331e-05, "loss": 3.3994, "step": 8970 }, { "epoch": 10.832830416415208, "grad_norm": 12.414474487304688, "learning_rate": 1.9999785789868517e-05, "loss": 3.4065, "step": 8980 }, { "epoch": 10.84490042245021, "grad_norm": 11.61947250366211, "learning_rate": 1.9999785548613723e-05, "loss": 3.3588, "step": 8990 }, { "epoch": 10.856970428485214, "grad_norm": 11.75595760345459, "learning_rate": 1.999978530735893e-05, "loss": 3.4268, "step": 9000 }, { "epoch": 10.856970428485214, "eval_loss": 6.2633376121521, "eval_runtime": 8.128, "eval_samples_per_second": 85.753, "eval_steps_per_second": 10.827, "step": 9000 }, { "epoch": 10.869040434520217, "grad_norm": 11.542745590209961, "learning_rate": 1.9999785066104135e-05, "loss": 3.3891, "step": 9010 }, { "epoch": 10.88111044055522, "grad_norm": 11.749319076538086, "learning_rate": 1.999978482484934e-05, "loss": 3.3586, "step": 9020 }, { "epoch": 10.893180446590224, "grad_norm": 11.818533897399902, "learning_rate": 1.9999784583594548e-05, "loss": 3.3701, "step": 9030 }, { "epoch": 10.905250452625227, "grad_norm": 11.785455703735352, "learning_rate": 1.9999784342339754e-05, "loss": 3.4, "step": 9040 }, { "epoch": 10.91732045866023, "grad_norm": 12.270605087280273, "learning_rate": 1.999978410108496e-05, "loss": 3.414, "step": 9050 }, { "epoch": 10.929390464695233, "grad_norm": 12.616728782653809, "learning_rate": 1.9999783859830166e-05, "loss": 3.3897, "step": 9060 }, { "epoch": 10.941460470730235, "grad_norm": 11.751734733581543, "learning_rate": 
1.9999783618575373e-05, "loss": 3.4014, "step": 9070 }, { "epoch": 10.953530476765238, "grad_norm": 12.290227890014648, "learning_rate": 1.999978337732058e-05, "loss": 3.409, "step": 9080 }, { "epoch": 10.965600482800241, "grad_norm": 11.48461627960205, "learning_rate": 1.9999783136065785e-05, "loss": 3.3987, "step": 9090 }, { "epoch": 10.977670488835244, "grad_norm": 12.059453964233398, "learning_rate": 1.999978289481099e-05, "loss": 3.3584, "step": 9100 }, { "epoch": 10.989740494870247, "grad_norm": 12.150031089782715, "learning_rate": 1.9999782653556197e-05, "loss": 3.3927, "step": 9110 }, { "epoch": 11.0012070006035, "grad_norm": 12.127062797546387, "learning_rate": 1.9999782412301404e-05, "loss": 3.3798, "step": 9120 }, { "epoch": 11.013277006638504, "grad_norm": 13.119755744934082, "learning_rate": 1.999978217104661e-05, "loss": 2.7458, "step": 9130 }, { "epoch": 11.025347012673507, "grad_norm": 12.264181137084961, "learning_rate": 1.9999781929791816e-05, "loss": 2.716, "step": 9140 }, { "epoch": 11.03741701870851, "grad_norm": 13.471586227416992, "learning_rate": 1.9999781688537022e-05, "loss": 2.7222, "step": 9150 }, { "epoch": 11.049487024743513, "grad_norm": 13.345455169677734, "learning_rate": 1.999978144728223e-05, "loss": 2.7382, "step": 9160 }, { "epoch": 11.061557030778516, "grad_norm": 12.739056587219238, "learning_rate": 1.9999781206027435e-05, "loss": 2.6859, "step": 9170 }, { "epoch": 11.073627036813518, "grad_norm": 12.41855239868164, "learning_rate": 1.999978096477264e-05, "loss": 2.7093, "step": 9180 }, { "epoch": 11.08569704284852, "grad_norm": 13.19820499420166, "learning_rate": 1.9999780723517844e-05, "loss": 2.7, "step": 9190 }, { "epoch": 11.097767048883524, "grad_norm": 13.438478469848633, "learning_rate": 1.999978048226305e-05, "loss": 2.7311, "step": 9200 }, { "epoch": 11.109837054918527, "grad_norm": 12.082213401794434, "learning_rate": 1.9999780241008256e-05, "loss": 2.7067, "step": 9210 }, { "epoch": 11.12190706095353, "grad_norm": 
12.831727027893066, "learning_rate": 1.9999779999753462e-05, "loss": 2.6945, "step": 9220 }, { "epoch": 11.133977066988534, "grad_norm": 12.826452255249023, "learning_rate": 1.999977975849867e-05, "loss": 2.7329, "step": 9230 }, { "epoch": 11.146047073023537, "grad_norm": 12.873446464538574, "learning_rate": 1.9999779517243875e-05, "loss": 2.7377, "step": 9240 }, { "epoch": 11.15811707905854, "grad_norm": 12.943716049194336, "learning_rate": 1.999977927598908e-05, "loss": 2.7646, "step": 9250 }, { "epoch": 11.170187085093543, "grad_norm": 13.479171752929688, "learning_rate": 1.9999779034734287e-05, "loss": 2.7651, "step": 9260 }, { "epoch": 11.182257091128546, "grad_norm": 12.60240364074707, "learning_rate": 1.9999778793479493e-05, "loss": 2.7292, "step": 9270 }, { "epoch": 11.194327097163548, "grad_norm": 12.826404571533203, "learning_rate": 1.99997785522247e-05, "loss": 2.7529, "step": 9280 }, { "epoch": 11.206397103198551, "grad_norm": 13.501012802124023, "learning_rate": 1.9999778310969906e-05, "loss": 2.7287, "step": 9290 }, { "epoch": 11.218467109233554, "grad_norm": 12.68431282043457, "learning_rate": 1.9999778069715112e-05, "loss": 2.7579, "step": 9300 }, { "epoch": 11.230537115268557, "grad_norm": 12.452210426330566, "learning_rate": 1.999977782846032e-05, "loss": 2.7858, "step": 9310 }, { "epoch": 11.24260712130356, "grad_norm": 12.473185539245605, "learning_rate": 1.9999777587205525e-05, "loss": 2.782, "step": 9320 }, { "epoch": 11.254677127338564, "grad_norm": 12.507654190063477, "learning_rate": 1.999977734595073e-05, "loss": 2.7835, "step": 9330 }, { "epoch": 11.266747133373567, "grad_norm": 13.456732749938965, "learning_rate": 1.9999777104695937e-05, "loss": 2.7938, "step": 9340 }, { "epoch": 11.27881713940857, "grad_norm": 13.586787223815918, "learning_rate": 1.9999776863441143e-05, "loss": 2.7458, "step": 9350 }, { "epoch": 11.290887145443573, "grad_norm": 13.415595054626465, "learning_rate": 1.999977662218635e-05, "loss": 2.8159, "step": 9360 }, { 
"epoch": 11.302957151478576, "grad_norm": 13.052474975585938, "learning_rate": 1.9999776380931556e-05, "loss": 2.8002, "step": 9370 }, { "epoch": 11.315027157513578, "grad_norm": 12.816454887390137, "learning_rate": 1.9999776139676762e-05, "loss": 2.7786, "step": 9380 }, { "epoch": 11.327097163548581, "grad_norm": 13.476407051086426, "learning_rate": 1.9999775898421968e-05, "loss": 2.7613, "step": 9390 }, { "epoch": 11.339167169583584, "grad_norm": 12.925561904907227, "learning_rate": 1.9999775657167174e-05, "loss": 2.8131, "step": 9400 }, { "epoch": 11.351237175618587, "grad_norm": 13.738038063049316, "learning_rate": 1.999977541591238e-05, "loss": 2.8059, "step": 9410 }, { "epoch": 11.36330718165359, "grad_norm": 13.657283782958984, "learning_rate": 1.9999775174657587e-05, "loss": 2.8066, "step": 9420 }, { "epoch": 11.375377187688594, "grad_norm": 13.725228309631348, "learning_rate": 1.9999774933402793e-05, "loss": 2.8053, "step": 9430 }, { "epoch": 11.387447193723597, "grad_norm": 13.571160316467285, "learning_rate": 1.9999774692147996e-05, "loss": 2.7903, "step": 9440 }, { "epoch": 11.3995171997586, "grad_norm": 13.783685684204102, "learning_rate": 1.9999774450893202e-05, "loss": 2.8403, "step": 9450 }, { "epoch": 11.411587205793603, "grad_norm": 13.160444259643555, "learning_rate": 1.9999774209638408e-05, "loss": 2.8079, "step": 9460 }, { "epoch": 11.423657211828607, "grad_norm": 12.63089370727539, "learning_rate": 1.9999773968383614e-05, "loss": 2.8547, "step": 9470 }, { "epoch": 11.435727217863608, "grad_norm": 11.86548137664795, "learning_rate": 1.999977372712882e-05, "loss": 2.8444, "step": 9480 }, { "epoch": 11.447797223898611, "grad_norm": 13.068056106567383, "learning_rate": 1.9999773485874027e-05, "loss": 2.8483, "step": 9490 }, { "epoch": 11.459867229933614, "grad_norm": 12.76604175567627, "learning_rate": 1.9999773244619233e-05, "loss": 2.8482, "step": 9500 }, { "epoch": 11.459867229933614, "eval_loss": 6.707380294799805, "eval_runtime": 8.1381, 
"eval_samples_per_second": 85.647, "eval_steps_per_second": 10.813, "step": 9500 }, { "epoch": 11.471937235968618, "grad_norm": 12.606622695922852, "learning_rate": 1.999977300336444e-05, "loss": 2.8572, "step": 9510 }, { "epoch": 11.48400724200362, "grad_norm": 13.232547760009766, "learning_rate": 1.9999772762109645e-05, "loss": 2.8112, "step": 9520 }, { "epoch": 11.496077248038624, "grad_norm": 13.66240119934082, "learning_rate": 1.999977252085485e-05, "loss": 2.8738, "step": 9530 }, { "epoch": 11.508147254073627, "grad_norm": 13.104026794433594, "learning_rate": 1.9999772279600058e-05, "loss": 2.8736, "step": 9540 }, { "epoch": 11.52021726010863, "grad_norm": 14.501734733581543, "learning_rate": 1.9999772038345264e-05, "loss": 2.789, "step": 9550 }, { "epoch": 11.532287266143634, "grad_norm": 13.140196800231934, "learning_rate": 1.9999771797090474e-05, "loss": 2.8149, "step": 9560 }, { "epoch": 11.544357272178637, "grad_norm": 13.333620071411133, "learning_rate": 1.999977155583568e-05, "loss": 2.8389, "step": 9570 }, { "epoch": 11.55642727821364, "grad_norm": 13.59193229675293, "learning_rate": 1.9999771314580886e-05, "loss": 2.8588, "step": 9580 }, { "epoch": 11.568497284248641, "grad_norm": 12.544027328491211, "learning_rate": 1.9999771073326092e-05, "loss": 2.8866, "step": 9590 }, { "epoch": 11.580567290283645, "grad_norm": 15.13714599609375, "learning_rate": 1.99997708320713e-05, "loss": 2.8352, "step": 9600 }, { "epoch": 11.592637296318648, "grad_norm": 14.421067237854004, "learning_rate": 1.99997705908165e-05, "loss": 2.8661, "step": 9610 }, { "epoch": 11.604707302353651, "grad_norm": 14.043804168701172, "learning_rate": 1.9999770349561708e-05, "loss": 2.8546, "step": 9620 }, { "epoch": 11.616777308388654, "grad_norm": 13.313983917236328, "learning_rate": 1.9999770108306914e-05, "loss": 2.8715, "step": 9630 }, { "epoch": 11.628847314423657, "grad_norm": 13.805657386779785, "learning_rate": 1.999976986705212e-05, "loss": 2.8838, "step": 9640 }, { "epoch": 
11.64091732045866, "grad_norm": 14.53101921081543, "learning_rate": 1.9999769625797326e-05, "loss": 2.9059, "step": 9650 }, { "epoch": 11.652987326493664, "grad_norm": 14.205739974975586, "learning_rate": 1.9999769384542532e-05, "loss": 2.8767, "step": 9660 }, { "epoch": 11.665057332528667, "grad_norm": 13.693346977233887, "learning_rate": 1.999976914328774e-05, "loss": 2.8706, "step": 9670 }, { "epoch": 11.67712733856367, "grad_norm": 14.214479446411133, "learning_rate": 1.9999768902032945e-05, "loss": 2.8793, "step": 9680 }, { "epoch": 11.689197344598671, "grad_norm": 14.173368453979492, "learning_rate": 1.999976866077815e-05, "loss": 2.8794, "step": 9690 }, { "epoch": 11.701267350633675, "grad_norm": 13.424755096435547, "learning_rate": 1.9999768419523357e-05, "loss": 2.9076, "step": 9700 }, { "epoch": 11.713337356668678, "grad_norm": 12.977946281433105, "learning_rate": 1.9999768178268564e-05, "loss": 2.875, "step": 9710 }, { "epoch": 11.725407362703681, "grad_norm": 14.041643142700195, "learning_rate": 1.999976793701377e-05, "loss": 2.8617, "step": 9720 }, { "epoch": 11.737477368738684, "grad_norm": 14.758345603942871, "learning_rate": 1.9999767695758976e-05, "loss": 2.9014, "step": 9730 }, { "epoch": 11.749547374773687, "grad_norm": 14.082446098327637, "learning_rate": 1.9999767454504182e-05, "loss": 2.941, "step": 9740 }, { "epoch": 11.76161738080869, "grad_norm": 13.141706466674805, "learning_rate": 1.999976721324939e-05, "loss": 2.9206, "step": 9750 }, { "epoch": 11.773687386843694, "grad_norm": 13.65825366973877, "learning_rate": 1.9999766971994595e-05, "loss": 2.9014, "step": 9760 }, { "epoch": 11.785757392878697, "grad_norm": 12.988273620605469, "learning_rate": 1.99997667307398e-05, "loss": 2.922, "step": 9770 }, { "epoch": 11.7978273989137, "grad_norm": 13.447694778442383, "learning_rate": 1.9999766489485007e-05, "loss": 2.8903, "step": 9780 }, { "epoch": 11.809897404948703, "grad_norm": 12.773164749145508, "learning_rate": 1.9999766248230213e-05, 
"loss": 2.9563, "step": 9790 }, { "epoch": 11.821967410983705, "grad_norm": 14.712950706481934, "learning_rate": 1.999976600697542e-05, "loss": 2.9123, "step": 9800 }, { "epoch": 11.834037417018708, "grad_norm": 13.043378829956055, "learning_rate": 1.9999765765720626e-05, "loss": 2.9429, "step": 9810 }, { "epoch": 11.846107423053711, "grad_norm": 13.708863258361816, "learning_rate": 1.9999765524465832e-05, "loss": 2.9383, "step": 9820 }, { "epoch": 11.858177429088714, "grad_norm": 13.457423210144043, "learning_rate": 1.9999765283211038e-05, "loss": 2.9501, "step": 9830 }, { "epoch": 11.870247435123717, "grad_norm": 13.911826133728027, "learning_rate": 1.9999765041956244e-05, "loss": 2.9091, "step": 9840 }, { "epoch": 11.88231744115872, "grad_norm": 14.331925392150879, "learning_rate": 1.999976480070145e-05, "loss": 2.9114, "step": 9850 }, { "epoch": 11.894387447193724, "grad_norm": 13.072750091552734, "learning_rate": 1.9999764559446653e-05, "loss": 2.88, "step": 9860 }, { "epoch": 11.906457453228727, "grad_norm": 12.810304641723633, "learning_rate": 1.999976431819186e-05, "loss": 2.9148, "step": 9870 }, { "epoch": 11.91852745926373, "grad_norm": 13.450763702392578, "learning_rate": 1.9999764076937066e-05, "loss": 2.9117, "step": 9880 }, { "epoch": 11.930597465298733, "grad_norm": 13.162555694580078, "learning_rate": 1.9999763835682272e-05, "loss": 2.9665, "step": 9890 }, { "epoch": 11.942667471333735, "grad_norm": 13.839241027832031, "learning_rate": 1.9999763594427478e-05, "loss": 2.9528, "step": 9900 }, { "epoch": 11.954737477368738, "grad_norm": 13.290181159973145, "learning_rate": 1.9999763353172684e-05, "loss": 2.9265, "step": 9910 }, { "epoch": 11.966807483403741, "grad_norm": 14.570168495178223, "learning_rate": 1.999976311191789e-05, "loss": 2.9651, "step": 9920 }, { "epoch": 11.978877489438744, "grad_norm": 13.293632507324219, "learning_rate": 1.9999762870663097e-05, "loss": 2.934, "step": 9930 }, { "epoch": 11.990947495473748, "grad_norm": 
13.754737854003906, "learning_rate": 1.9999762629408303e-05, "loss": 2.9437, "step": 9940 }, { "epoch": 12.002414001207, "grad_norm": 13.731986999511719, "learning_rate": 1.999976238815351e-05, "loss": 2.8324, "step": 9950 }, { "epoch": 12.014484007242004, "grad_norm": 14.44949722290039, "learning_rate": 1.9999762146898716e-05, "loss": 2.2218, "step": 9960 }, { "epoch": 12.026554013277007, "grad_norm": 13.968744277954102, "learning_rate": 1.9999761905643922e-05, "loss": 2.288, "step": 9970 }, { "epoch": 12.03862401931201, "grad_norm": 14.75540542602539, "learning_rate": 1.9999761664389128e-05, "loss": 2.2257, "step": 9980 }, { "epoch": 12.050694025347013, "grad_norm": 13.408964157104492, "learning_rate": 1.9999761423134334e-05, "loss": 2.2581, "step": 9990 }, { "epoch": 12.062764031382017, "grad_norm": 13.833430290222168, "learning_rate": 1.999976118187954e-05, "loss": 2.2496, "step": 10000 }, { "epoch": 12.062764031382017, "eval_loss": 7.070396423339844, "eval_runtime": 8.128, "eval_samples_per_second": 85.753, "eval_steps_per_second": 10.827, "step": 10000 }, { "epoch": 12.074834037417018, "grad_norm": 14.620023727416992, "learning_rate": 1.9999760940624747e-05, "loss": 2.2468, "step": 10010 }, { "epoch": 12.086904043452021, "grad_norm": 14.136520385742188, "learning_rate": 1.9999760699369953e-05, "loss": 2.2322, "step": 10020 }, { "epoch": 12.098974049487024, "grad_norm": 13.745083808898926, "learning_rate": 1.999976045811516e-05, "loss": 2.245, "step": 10030 }, { "epoch": 12.111044055522028, "grad_norm": 14.203587532043457, "learning_rate": 1.9999760216860365e-05, "loss": 2.2152, "step": 10040 }, { "epoch": 12.12311406155703, "grad_norm": 14.011592864990234, "learning_rate": 1.999975997560557e-05, "loss": 2.2494, "step": 10050 }, { "epoch": 12.135184067592034, "grad_norm": 13.65028190612793, "learning_rate": 1.9999759734350778e-05, "loss": 2.256, "step": 10060 }, { "epoch": 12.147254073627037, "grad_norm": 14.435672760009766, "learning_rate": 
1.9999759493095984e-05, "loss": 2.2953, "step": 10070 }, { "epoch": 12.15932407966204, "grad_norm": 15.0378999710083, "learning_rate": 1.999975925184119e-05, "loss": 2.2801, "step": 10080 }, { "epoch": 12.171394085697044, "grad_norm": 14.480123519897461, "learning_rate": 1.9999759010586396e-05, "loss": 2.2744, "step": 10090 }, { "epoch": 12.183464091732047, "grad_norm": 14.479362487792969, "learning_rate": 1.9999758769331603e-05, "loss": 2.2783, "step": 10100 }, { "epoch": 12.195534097767048, "grad_norm": 14.21965217590332, "learning_rate": 1.999975852807681e-05, "loss": 2.2876, "step": 10110 }, { "epoch": 12.207604103802051, "grad_norm": 14.555644989013672, "learning_rate": 1.9999758286822015e-05, "loss": 2.2948, "step": 10120 }, { "epoch": 12.219674109837054, "grad_norm": 14.55551815032959, "learning_rate": 1.999975804556722e-05, "loss": 2.2706, "step": 10130 }, { "epoch": 12.231744115872058, "grad_norm": 14.51903247833252, "learning_rate": 1.9999757804312427e-05, "loss": 2.2717, "step": 10140 }, { "epoch": 12.24381412190706, "grad_norm": 14.147241592407227, "learning_rate": 1.9999757563057634e-05, "loss": 2.3047, "step": 10150 }, { "epoch": 12.255884127942064, "grad_norm": 14.57189655303955, "learning_rate": 1.999975732180284e-05, "loss": 2.3344, "step": 10160 }, { "epoch": 12.267954133977067, "grad_norm": 13.406577110290527, "learning_rate": 1.9999757080548046e-05, "loss": 2.3455, "step": 10170 }, { "epoch": 12.28002414001207, "grad_norm": 15.312357902526855, "learning_rate": 1.9999756839293252e-05, "loss": 2.3297, "step": 10180 }, { "epoch": 12.292094146047074, "grad_norm": 14.80012035369873, "learning_rate": 1.999975659803846e-05, "loss": 2.3245, "step": 10190 }, { "epoch": 12.304164152082077, "grad_norm": 14.374696731567383, "learning_rate": 1.9999756356783665e-05, "loss": 2.3037, "step": 10200 }, { "epoch": 12.316234158117078, "grad_norm": 14.404014587402344, "learning_rate": 1.999975611552887e-05, "loss": 2.3276, "step": 10210 }, { "epoch": 
12.328304164152081, "grad_norm": 14.939358711242676, "learning_rate": 1.9999755874274077e-05, "loss": 2.3118, "step": 10220 }, { "epoch": 12.340374170187085, "grad_norm": 14.552000045776367, "learning_rate": 1.9999755633019283e-05, "loss": 2.3317, "step": 10230 }, { "epoch": 12.352444176222088, "grad_norm": 14.912358283996582, "learning_rate": 1.999975539176449e-05, "loss": 2.3419, "step": 10240 }, { "epoch": 12.364514182257091, "grad_norm": 15.936635971069336, "learning_rate": 1.9999755150509696e-05, "loss": 2.3249, "step": 10250 }, { "epoch": 12.376584188292094, "grad_norm": 14.966479301452637, "learning_rate": 1.9999754909254902e-05, "loss": 2.3328, "step": 10260 }, { "epoch": 12.388654194327097, "grad_norm": 14.620243072509766, "learning_rate": 1.9999754668000105e-05, "loss": 2.3778, "step": 10270 }, { "epoch": 12.4007242003621, "grad_norm": 15.040887832641602, "learning_rate": 1.999975442674531e-05, "loss": 2.4017, "step": 10280 }, { "epoch": 12.412794206397104, "grad_norm": 15.184438705444336, "learning_rate": 1.9999754185490517e-05, "loss": 2.3366, "step": 10290 }, { "epoch": 12.424864212432107, "grad_norm": 15.494771003723145, "learning_rate": 1.9999753944235723e-05, "loss": 2.3632, "step": 10300 }, { "epoch": 12.436934218467108, "grad_norm": 15.530921936035156, "learning_rate": 1.999975370298093e-05, "loss": 2.4035, "step": 10310 }, { "epoch": 12.449004224502112, "grad_norm": 14.397787094116211, "learning_rate": 1.9999753461726136e-05, "loss": 2.3608, "step": 10320 }, { "epoch": 12.461074230537115, "grad_norm": 16.231103897094727, "learning_rate": 1.9999753220471342e-05, "loss": 2.3879, "step": 10330 }, { "epoch": 12.473144236572118, "grad_norm": 15.516502380371094, "learning_rate": 1.999975297921655e-05, "loss": 2.3655, "step": 10340 }, { "epoch": 12.485214242607121, "grad_norm": 16.045303344726562, "learning_rate": 1.9999752737961755e-05, "loss": 2.384, "step": 10350 }, { "epoch": 12.497284248642124, "grad_norm": 14.71277141571045, "learning_rate": 
1.999975249670696e-05, "loss": 2.4084, "step": 10360 }, { "epoch": 12.509354254677127, "grad_norm": 15.028225898742676, "learning_rate": 1.9999752255452167e-05, "loss": 2.3748, "step": 10370 }, { "epoch": 12.52142426071213, "grad_norm": 16.379222869873047, "learning_rate": 1.9999752014197373e-05, "loss": 2.3791, "step": 10380 }, { "epoch": 12.533494266747134, "grad_norm": 15.06067943572998, "learning_rate": 1.999975177294258e-05, "loss": 2.3937, "step": 10390 }, { "epoch": 12.545564272782137, "grad_norm": 15.327324867248535, "learning_rate": 1.9999751531687786e-05, "loss": 2.4336, "step": 10400 }, { "epoch": 12.55763427881714, "grad_norm": 14.280142784118652, "learning_rate": 1.9999751290432992e-05, "loss": 2.3709, "step": 10410 }, { "epoch": 12.569704284852142, "grad_norm": 14.600737571716309, "learning_rate": 1.9999751049178198e-05, "loss": 2.4205, "step": 10420 }, { "epoch": 12.581774290887145, "grad_norm": 14.660449028015137, "learning_rate": 1.9999750807923404e-05, "loss": 2.4118, "step": 10430 }, { "epoch": 12.593844296922148, "grad_norm": 15.633573532104492, "learning_rate": 1.999975056666861e-05, "loss": 2.3909, "step": 10440 }, { "epoch": 12.605914302957151, "grad_norm": 16.438446044921875, "learning_rate": 1.9999750325413817e-05, "loss": 2.4185, "step": 10450 }, { "epoch": 12.617984308992154, "grad_norm": 14.78546142578125, "learning_rate": 1.9999750084159023e-05, "loss": 2.4205, "step": 10460 }, { "epoch": 12.630054315027158, "grad_norm": 15.683408737182617, "learning_rate": 1.999974984290423e-05, "loss": 2.3931, "step": 10470 }, { "epoch": 12.64212432106216, "grad_norm": 17.437767028808594, "learning_rate": 1.9999749601649435e-05, "loss": 2.3981, "step": 10480 }, { "epoch": 12.654194327097164, "grad_norm": 14.800107955932617, "learning_rate": 1.999974936039464e-05, "loss": 2.4724, "step": 10490 }, { "epoch": 12.666264333132167, "grad_norm": 14.86663818359375, "learning_rate": 1.9999749119139848e-05, "loss": 2.4237, "step": 10500 }, { "epoch": 
12.666264333132167, "eval_loss": 7.15718936920166, "eval_runtime": 8.1403, "eval_samples_per_second": 85.623, "eval_steps_per_second": 10.81, "step": 10500 }, { "epoch": 12.67833433916717, "grad_norm": 15.876859664916992, "learning_rate": 1.9999748877885054e-05, "loss": 2.4155, "step": 10510 }, { "epoch": 12.690404345202172, "grad_norm": 14.831653594970703, "learning_rate": 1.9999748636630257e-05, "loss": 2.433, "step": 10520 }, { "epoch": 12.702474351237175, "grad_norm": 14.912435531616211, "learning_rate": 1.9999748395375463e-05, "loss": 2.4318, "step": 10530 }, { "epoch": 12.714544357272178, "grad_norm": 15.06423568725586, "learning_rate": 1.999974815412067e-05, "loss": 2.4439, "step": 10540 }, { "epoch": 12.726614363307181, "grad_norm": 15.416595458984375, "learning_rate": 1.9999747912865875e-05, "loss": 2.4388, "step": 10550 }, { "epoch": 12.738684369342185, "grad_norm": 15.364585876464844, "learning_rate": 1.9999747671611082e-05, "loss": 2.4528, "step": 10560 }, { "epoch": 12.750754375377188, "grad_norm": 15.255334854125977, "learning_rate": 1.9999747430356288e-05, "loss": 2.4291, "step": 10570 }, { "epoch": 12.762824381412191, "grad_norm": 15.204522132873535, "learning_rate": 1.9999747189101494e-05, "loss": 2.4742, "step": 10580 }, { "epoch": 12.774894387447194, "grad_norm": 16.568750381469727, "learning_rate": 1.99997469478467e-05, "loss": 2.4503, "step": 10590 }, { "epoch": 12.786964393482197, "grad_norm": 15.302580833435059, "learning_rate": 1.9999746706591907e-05, "loss": 2.4747, "step": 10600 }, { "epoch": 12.7990343995172, "grad_norm": 15.317748069763184, "learning_rate": 1.9999746465337113e-05, "loss": 2.4531, "step": 10610 }, { "epoch": 12.811104405552204, "grad_norm": 14.133782386779785, "learning_rate": 1.999974622408232e-05, "loss": 2.4916, "step": 10620 }, { "epoch": 12.823174411587205, "grad_norm": 14.86981201171875, "learning_rate": 1.9999745982827525e-05, "loss": 2.4855, "step": 10630 }, { "epoch": 12.835244417622208, "grad_norm": 
16.153261184692383, "learning_rate": 1.9999745741572735e-05, "loss": 2.4611, "step": 10640 }, { "epoch": 12.847314423657211, "grad_norm": 14.713861465454102, "learning_rate": 1.999974550031794e-05, "loss": 2.49, "step": 10650 }, { "epoch": 12.859384429692215, "grad_norm": 14.610615730285645, "learning_rate": 1.9999745259063147e-05, "loss": 2.4765, "step": 10660 }, { "epoch": 12.871454435727218, "grad_norm": 14.341958045959473, "learning_rate": 1.9999745017808353e-05, "loss": 2.441, "step": 10670 }, { "epoch": 12.883524441762221, "grad_norm": 15.33028793334961, "learning_rate": 1.9999744776553556e-05, "loss": 2.4722, "step": 10680 }, { "epoch": 12.895594447797224, "grad_norm": 14.319414138793945, "learning_rate": 1.9999744535298763e-05, "loss": 2.4515, "step": 10690 }, { "epoch": 12.907664453832227, "grad_norm": 16.192440032958984, "learning_rate": 1.999974429404397e-05, "loss": 2.4436, "step": 10700 }, { "epoch": 12.91973445986723, "grad_norm": 15.409472465515137, "learning_rate": 1.9999744052789175e-05, "loss": 2.4591, "step": 10710 }, { "epoch": 12.931804465902234, "grad_norm": 15.254172325134277, "learning_rate": 1.999974381153438e-05, "loss": 2.4377, "step": 10720 }, { "epoch": 12.943874471937235, "grad_norm": 14.913445472717285, "learning_rate": 1.9999743570279587e-05, "loss": 2.4585, "step": 10730 }, { "epoch": 12.955944477972238, "grad_norm": 15.904867172241211, "learning_rate": 1.9999743329024794e-05, "loss": 2.4744, "step": 10740 }, { "epoch": 12.968014484007242, "grad_norm": 15.781237602233887, "learning_rate": 1.999974308777e-05, "loss": 2.485, "step": 10750 }, { "epoch": 12.980084490042245, "grad_norm": 14.72400188446045, "learning_rate": 1.9999742846515206e-05, "loss": 2.4929, "step": 10760 }, { "epoch": 12.992154496077248, "grad_norm": 15.126233100891113, "learning_rate": 1.9999742605260412e-05, "loss": 2.4959, "step": 10770 }, { "epoch": 13.003621001810501, "grad_norm": 15.033082008361816, "learning_rate": 1.999974236400562e-05, "loss": 2.3048, 
"step": 10780 }, { "epoch": 13.015691007845504, "grad_norm": 15.747492790222168, "learning_rate": 1.9999742122750825e-05, "loss": 1.8656, "step": 10790 }, { "epoch": 13.027761013880507, "grad_norm": 16.210296630859375, "learning_rate": 1.999974188149603e-05, "loss": 1.8286, "step": 10800 }, { "epoch": 13.03983101991551, "grad_norm": 15.480259895324707, "learning_rate": 1.9999741640241237e-05, "loss": 1.8213, "step": 10810 }, { "epoch": 13.051901025950514, "grad_norm": 15.455214500427246, "learning_rate": 1.9999741398986443e-05, "loss": 1.7892, "step": 10820 }, { "epoch": 13.063971031985515, "grad_norm": 15.24752140045166, "learning_rate": 1.999974115773165e-05, "loss": 1.8079, "step": 10830 }, { "epoch": 13.076041038020518, "grad_norm": 15.680542945861816, "learning_rate": 1.9999740916476856e-05, "loss": 1.7922, "step": 10840 }, { "epoch": 13.088111044055522, "grad_norm": 16.8596134185791, "learning_rate": 1.9999740675222062e-05, "loss": 1.808, "step": 10850 }, { "epoch": 13.100181050090525, "grad_norm": 16.3262939453125, "learning_rate": 1.9999740433967268e-05, "loss": 1.7686, "step": 10860 }, { "epoch": 13.112251056125528, "grad_norm": 15.057125091552734, "learning_rate": 1.9999740192712474e-05, "loss": 1.8345, "step": 10870 }, { "epoch": 13.124321062160531, "grad_norm": 15.981499671936035, "learning_rate": 1.999973995145768e-05, "loss": 1.8233, "step": 10880 }, { "epoch": 13.136391068195534, "grad_norm": 15.53369140625, "learning_rate": 1.9999739710202887e-05, "loss": 1.8257, "step": 10890 }, { "epoch": 13.148461074230537, "grad_norm": 15.23995304107666, "learning_rate": 1.9999739468948093e-05, "loss": 1.8712, "step": 10900 }, { "epoch": 13.16053108026554, "grad_norm": 14.8240385055542, "learning_rate": 1.99997392276933e-05, "loss": 1.8558, "step": 10910 }, { "epoch": 13.172601086300544, "grad_norm": 15.710564613342285, "learning_rate": 1.9999738986438505e-05, "loss": 1.8476, "step": 10920 }, { "epoch": 13.184671092335547, "grad_norm": 16.097015380859375, 
"learning_rate": 1.9999738745183708e-05, "loss": 1.8366, "step": 10930 }, { "epoch": 13.196741098370548, "grad_norm": 16.354156494140625, "learning_rate": 1.9999738503928914e-05, "loss": 1.871, "step": 10940 }, { "epoch": 13.208811104405552, "grad_norm": 16.74485206604004, "learning_rate": 1.999973826267412e-05, "loss": 1.8345, "step": 10950 }, { "epoch": 13.220881110440555, "grad_norm": 16.149246215820312, "learning_rate": 1.9999738021419327e-05, "loss": 1.8548, "step": 10960 }, { "epoch": 13.232951116475558, "grad_norm": 15.810958862304688, "learning_rate": 1.9999737780164533e-05, "loss": 1.8884, "step": 10970 }, { "epoch": 13.245021122510561, "grad_norm": 16.035091400146484, "learning_rate": 1.999973753890974e-05, "loss": 1.8574, "step": 10980 }, { "epoch": 13.257091128545564, "grad_norm": 15.991711616516113, "learning_rate": 1.9999737297654946e-05, "loss": 1.8612, "step": 10990 }, { "epoch": 13.269161134580568, "grad_norm": 15.935582160949707, "learning_rate": 1.9999737056400152e-05, "loss": 1.9125, "step": 11000 }, { "epoch": 13.269161134580568, "eval_loss": 7.617883682250977, "eval_runtime": 8.1209, "eval_samples_per_second": 85.828, "eval_steps_per_second": 10.836, "step": 11000 }, { "epoch": 13.28123114061557, "grad_norm": 15.919540405273438, "learning_rate": 1.9999736815145358e-05, "loss": 1.8861, "step": 11010 }, { "epoch": 13.293301146650574, "grad_norm": 14.883785247802734, "learning_rate": 1.9999736573890564e-05, "loss": 1.8596, "step": 11020 }, { "epoch": 13.305371152685577, "grad_norm": 15.804030418395996, "learning_rate": 1.999973633263577e-05, "loss": 1.8792, "step": 11030 }, { "epoch": 13.317441158720579, "grad_norm": 15.31171989440918, "learning_rate": 1.9999736091380977e-05, "loss": 1.8981, "step": 11040 }, { "epoch": 13.329511164755582, "grad_norm": 15.528714179992676, "learning_rate": 1.9999735850126183e-05, "loss": 1.8917, "step": 11050 }, { "epoch": 13.341581170790585, "grad_norm": 17.17511558532715, "learning_rate": 1.999973560887139e-05, 
"loss": 1.9326, "step": 11060 }, { "epoch": 13.353651176825588, "grad_norm": 16.0021915435791, "learning_rate": 1.9999735367616595e-05, "loss": 1.9211, "step": 11070 }, { "epoch": 13.365721182860591, "grad_norm": 16.842329025268555, "learning_rate": 1.99997351263618e-05, "loss": 1.8831, "step": 11080 }, { "epoch": 13.377791188895594, "grad_norm": 16.557817459106445, "learning_rate": 1.9999734885107008e-05, "loss": 1.918, "step": 11090 }, { "epoch": 13.389861194930598, "grad_norm": 15.853300094604492, "learning_rate": 1.9999734643852214e-05, "loss": 1.9068, "step": 11100 }, { "epoch": 13.4019312009656, "grad_norm": 16.029430389404297, "learning_rate": 1.999973440259742e-05, "loss": 1.9244, "step": 11110 }, { "epoch": 13.414001207000604, "grad_norm": 16.00061798095703, "learning_rate": 1.9999734161342626e-05, "loss": 1.9391, "step": 11120 }, { "epoch": 13.426071213035607, "grad_norm": 15.09553337097168, "learning_rate": 1.9999733920087833e-05, "loss": 1.9577, "step": 11130 }, { "epoch": 13.438141219070609, "grad_norm": 17.39439582824707, "learning_rate": 1.999973367883304e-05, "loss": 1.9574, "step": 11140 }, { "epoch": 13.450211225105612, "grad_norm": 16.496402740478516, "learning_rate": 1.9999733437578245e-05, "loss": 1.9763, "step": 11150 }, { "epoch": 13.462281231140615, "grad_norm": 16.737417221069336, "learning_rate": 1.999973319632345e-05, "loss": 1.9371, "step": 11160 }, { "epoch": 13.474351237175618, "grad_norm": 16.88422203063965, "learning_rate": 1.9999732955068657e-05, "loss": 1.929, "step": 11170 }, { "epoch": 13.486421243210621, "grad_norm": 17.808181762695312, "learning_rate": 1.9999732713813864e-05, "loss": 1.9483, "step": 11180 }, { "epoch": 13.498491249245625, "grad_norm": 15.598803520202637, "learning_rate": 1.999973247255907e-05, "loss": 1.9569, "step": 11190 }, { "epoch": 13.510561255280628, "grad_norm": 16.709213256835938, "learning_rate": 1.9999732231304276e-05, "loss": 1.9686, "step": 11200 }, { "epoch": 13.522631261315631, "grad_norm": 
15.824934005737305, "learning_rate": 1.9999731990049482e-05, "loss": 1.9628, "step": 11210 }, { "epoch": 13.534701267350634, "grad_norm": 16.316328048706055, "learning_rate": 1.999973174879469e-05, "loss": 1.944, "step": 11220 }, { "epoch": 13.546771273385637, "grad_norm": 15.47887897491455, "learning_rate": 1.9999731507539895e-05, "loss": 1.935, "step": 11230 }, { "epoch": 13.55884127942064, "grad_norm": 17.162750244140625, "learning_rate": 1.99997312662851e-05, "loss": 1.9746, "step": 11240 }, { "epoch": 13.570911285455642, "grad_norm": 15.898387908935547, "learning_rate": 1.9999731025030307e-05, "loss": 1.9942, "step": 11250 }, { "epoch": 13.582981291490645, "grad_norm": 16.808244705200195, "learning_rate": 1.9999730783775513e-05, "loss": 1.9526, "step": 11260 }, { "epoch": 13.595051297525648, "grad_norm": 16.55752944946289, "learning_rate": 1.999973054252072e-05, "loss": 1.9493, "step": 11270 }, { "epoch": 13.607121303560652, "grad_norm": 17.20215606689453, "learning_rate": 1.9999730301265926e-05, "loss": 1.9364, "step": 11280 }, { "epoch": 13.619191309595655, "grad_norm": 16.283418655395508, "learning_rate": 1.9999730060011132e-05, "loss": 1.9731, "step": 11290 }, { "epoch": 13.631261315630658, "grad_norm": 16.654403686523438, "learning_rate": 1.9999729818756338e-05, "loss": 2.0234, "step": 11300 }, { "epoch": 13.643331321665661, "grad_norm": 15.415102005004883, "learning_rate": 1.9999729577501544e-05, "loss": 1.9742, "step": 11310 }, { "epoch": 13.655401327700664, "grad_norm": 17.62749671936035, "learning_rate": 1.999972933624675e-05, "loss": 1.9644, "step": 11320 }, { "epoch": 13.667471333735667, "grad_norm": 16.106740951538086, "learning_rate": 1.9999729094991957e-05, "loss": 1.9741, "step": 11330 }, { "epoch": 13.67954133977067, "grad_norm": 16.393247604370117, "learning_rate": 1.9999728853737163e-05, "loss": 1.9952, "step": 11340 }, { "epoch": 13.691611345805672, "grad_norm": 16.46515464782715, "learning_rate": 1.9999728612482366e-05, "loss": 2.0107, 
"step": 11350 }, { "epoch": 13.703681351840675, "grad_norm": 15.554428100585938, "learning_rate": 1.9999728371227572e-05, "loss": 1.9955, "step": 11360 }, { "epoch": 13.715751357875678, "grad_norm": 17.870649337768555, "learning_rate": 1.999972812997278e-05, "loss": 1.9945, "step": 11370 }, { "epoch": 13.727821363910682, "grad_norm": 15.550113677978516, "learning_rate": 1.9999727888717985e-05, "loss": 1.9881, "step": 11380 }, { "epoch": 13.739891369945685, "grad_norm": 16.11086654663086, "learning_rate": 1.999972764746319e-05, "loss": 1.9863, "step": 11390 }, { "epoch": 13.751961375980688, "grad_norm": 17.91222381591797, "learning_rate": 1.9999727406208397e-05, "loss": 2.0076, "step": 11400 }, { "epoch": 13.764031382015691, "grad_norm": 15.91933536529541, "learning_rate": 1.9999727164953603e-05, "loss": 2.013, "step": 11410 }, { "epoch": 13.776101388050694, "grad_norm": 17.72159767150879, "learning_rate": 1.999972692369881e-05, "loss": 2.0105, "step": 11420 }, { "epoch": 13.788171394085698, "grad_norm": 16.60991859436035, "learning_rate": 1.9999726682444016e-05, "loss": 2.0242, "step": 11430 }, { "epoch": 13.8002414001207, "grad_norm": 15.579922676086426, "learning_rate": 1.9999726441189222e-05, "loss": 2.0243, "step": 11440 }, { "epoch": 13.812311406155704, "grad_norm": 17.300100326538086, "learning_rate": 1.9999726199934428e-05, "loss": 1.9987, "step": 11450 }, { "epoch": 13.824381412190705, "grad_norm": 17.29494857788086, "learning_rate": 1.9999725958679634e-05, "loss": 2.0399, "step": 11460 }, { "epoch": 13.836451418225709, "grad_norm": 16.836076736450195, "learning_rate": 1.999972571742484e-05, "loss": 2.0379, "step": 11470 }, { "epoch": 13.848521424260712, "grad_norm": 16.82439422607422, "learning_rate": 1.9999725476170047e-05, "loss": 2.0385, "step": 11480 }, { "epoch": 13.860591430295715, "grad_norm": 15.771368026733398, "learning_rate": 1.9999725234915253e-05, "loss": 2.0323, "step": 11490 }, { "epoch": 13.872661436330718, "grad_norm": 17.49107551574707, 
"learning_rate": 1.999972499366046e-05, "loss": 2.0207, "step": 11500 }, { "epoch": 13.872661436330718, "eval_loss": 7.678491115570068, "eval_runtime": 8.121, "eval_samples_per_second": 85.827, "eval_steps_per_second": 10.836, "step": 11500 }, { "epoch": 13.884731442365721, "grad_norm": 16.305810928344727, "learning_rate": 1.9999724752405665e-05, "loss": 2.0493, "step": 11510 }, { "epoch": 13.896801448400725, "grad_norm": 16.558809280395508, "learning_rate": 1.999972451115087e-05, "loss": 1.9992, "step": 11520 }, { "epoch": 13.908871454435728, "grad_norm": 17.749996185302734, "learning_rate": 1.9999724269896078e-05, "loss": 2.0602, "step": 11530 }, { "epoch": 13.920941460470731, "grad_norm": 17.308937072753906, "learning_rate": 1.9999724028641284e-05, "loss": 2.0225, "step": 11540 }, { "epoch": 13.933011466505734, "grad_norm": 15.73917007446289, "learning_rate": 1.999972378738649e-05, "loss": 2.0387, "step": 11550 }, { "epoch": 13.945081472540735, "grad_norm": 16.290281295776367, "learning_rate": 1.9999723546131696e-05, "loss": 2.0619, "step": 11560 }, { "epoch": 13.957151478575739, "grad_norm": 16.166641235351562, "learning_rate": 1.9999723304876903e-05, "loss": 2.0891, "step": 11570 }, { "epoch": 13.969221484610742, "grad_norm": 16.62574577331543, "learning_rate": 1.999972306362211e-05, "loss": 2.0555, "step": 11580 }, { "epoch": 13.981291490645745, "grad_norm": 17.02587127685547, "learning_rate": 1.9999722822367315e-05, "loss": 2.0532, "step": 11590 }, { "epoch": 13.993361496680748, "grad_norm": 17.755285263061523, "learning_rate": 1.9999722581112518e-05, "loss": 2.0617, "step": 11600 }, { "epoch": 14.004828002414001, "grad_norm": 17.396944046020508, "learning_rate": 1.9999722339857724e-05, "loss": 1.7985, "step": 11610 }, { "epoch": 14.016898008449004, "grad_norm": 16.021543502807617, "learning_rate": 1.999972209860293e-05, "loss": 1.4259, "step": 11620 }, { "epoch": 14.028968014484008, "grad_norm": 17.53722381591797, "learning_rate": 1.9999721857348137e-05, 
"loss": 1.4107, "step": 11630 }, { "epoch": 14.04103802051901, "grad_norm": 14.551675796508789, "learning_rate": 1.9999721616093343e-05, "loss": 1.419, "step": 11640 }, { "epoch": 14.053108026554014, "grad_norm": 15.665108680725098, "learning_rate": 1.999972137483855e-05, "loss": 1.3877, "step": 11650 }, { "epoch": 14.065178032589015, "grad_norm": 15.826916694641113, "learning_rate": 1.9999721133583755e-05, "loss": 1.4506, "step": 11660 }, { "epoch": 14.077248038624019, "grad_norm": 16.541418075561523, "learning_rate": 1.999972089232896e-05, "loss": 1.4253, "step": 11670 }, { "epoch": 14.089318044659022, "grad_norm": 16.262401580810547, "learning_rate": 1.9999720651074168e-05, "loss": 1.4157, "step": 11680 }, { "epoch": 14.101388050694025, "grad_norm": 16.64508056640625, "learning_rate": 1.9999720409819374e-05, "loss": 1.4389, "step": 11690 }, { "epoch": 14.113458056729028, "grad_norm": 15.957758903503418, "learning_rate": 1.999972016856458e-05, "loss": 1.4577, "step": 11700 }, { "epoch": 14.125528062764031, "grad_norm": 15.707578659057617, "learning_rate": 1.9999719927309786e-05, "loss": 1.4257, "step": 11710 }, { "epoch": 14.137598068799035, "grad_norm": 16.44556999206543, "learning_rate": 1.9999719686054996e-05, "loss": 1.4299, "step": 11720 }, { "epoch": 14.149668074834038, "grad_norm": 16.840421676635742, "learning_rate": 1.9999719444800202e-05, "loss": 1.4533, "step": 11730 }, { "epoch": 14.161738080869041, "grad_norm": 16.180770874023438, "learning_rate": 1.999971920354541e-05, "loss": 1.4556, "step": 11740 }, { "epoch": 14.173808086904044, "grad_norm": 16.037639617919922, "learning_rate": 1.9999718962290615e-05, "loss": 1.4469, "step": 11750 }, { "epoch": 14.185878092939047, "grad_norm": 16.625478744506836, "learning_rate": 1.9999718721035817e-05, "loss": 1.4709, "step": 11760 }, { "epoch": 14.197948098974049, "grad_norm": 15.981351852416992, "learning_rate": 1.9999718479781024e-05, "loss": 1.4885, "step": 11770 }, { "epoch": 14.210018105009052, 
"grad_norm": 16.195390701293945, "learning_rate": 1.999971823852623e-05, "loss": 1.4903, "step": 11780 }, { "epoch": 14.222088111044055, "grad_norm": 16.312498092651367, "learning_rate": 1.9999717997271436e-05, "loss": 1.4682, "step": 11790 }, { "epoch": 14.234158117079058, "grad_norm": 17.315149307250977, "learning_rate": 1.9999717756016642e-05, "loss": 1.4899, "step": 11800 }, { "epoch": 14.246228123114062, "grad_norm": 16.429122924804688, "learning_rate": 1.999971751476185e-05, "loss": 1.4552, "step": 11810 }, { "epoch": 14.258298129149065, "grad_norm": 16.816797256469727, "learning_rate": 1.9999717273507055e-05, "loss": 1.4906, "step": 11820 }, { "epoch": 14.270368135184068, "grad_norm": 16.962148666381836, "learning_rate": 1.999971703225226e-05, "loss": 1.5063, "step": 11830 }, { "epoch": 14.282438141219071, "grad_norm": 16.246761322021484, "learning_rate": 1.9999716790997467e-05, "loss": 1.4975, "step": 11840 }, { "epoch": 14.294508147254074, "grad_norm": 16.60820770263672, "learning_rate": 1.9999716549742673e-05, "loss": 1.5108, "step": 11850 }, { "epoch": 14.306578153289077, "grad_norm": 17.24009895324707, "learning_rate": 1.999971630848788e-05, "loss": 1.5108, "step": 11860 }, { "epoch": 14.318648159324079, "grad_norm": 16.703231811523438, "learning_rate": 1.9999716067233086e-05, "loss": 1.5096, "step": 11870 }, { "epoch": 14.330718165359082, "grad_norm": 16.29560089111328, "learning_rate": 1.9999715825978292e-05, "loss": 1.5043, "step": 11880 }, { "epoch": 14.342788171394085, "grad_norm": 16.379514694213867, "learning_rate": 1.9999715584723498e-05, "loss": 1.518, "step": 11890 }, { "epoch": 14.354858177429088, "grad_norm": 17.365110397338867, "learning_rate": 1.9999715343468704e-05, "loss": 1.4902, "step": 11900 }, { "epoch": 14.366928183464092, "grad_norm": 18.30777359008789, "learning_rate": 1.999971510221391e-05, "loss": 1.5077, "step": 11910 }, { "epoch": 14.378998189499095, "grad_norm": 15.96866512298584, "learning_rate": 1.9999714860959117e-05, 
"loss": 1.5383, "step": 11920 }, { "epoch": 14.391068195534098, "grad_norm": 18.27117347717285, "learning_rate": 1.9999714619704323e-05, "loss": 1.5385, "step": 11930 }, { "epoch": 14.403138201569101, "grad_norm": 18.704011917114258, "learning_rate": 1.999971437844953e-05, "loss": 1.5425, "step": 11940 }, { "epoch": 14.415208207604104, "grad_norm": 16.288936614990234, "learning_rate": 1.9999714137194735e-05, "loss": 1.5437, "step": 11950 }, { "epoch": 14.427278213639108, "grad_norm": 17.134252548217773, "learning_rate": 1.999971389593994e-05, "loss": 1.5346, "step": 11960 }, { "epoch": 14.439348219674109, "grad_norm": 18.592660903930664, "learning_rate": 1.9999713654685148e-05, "loss": 1.5275, "step": 11970 }, { "epoch": 14.451418225709112, "grad_norm": 18.199220657348633, "learning_rate": 1.9999713413430354e-05, "loss": 1.5153, "step": 11980 }, { "epoch": 14.463488231744115, "grad_norm": 16.33137321472168, "learning_rate": 1.999971317217556e-05, "loss": 1.5432, "step": 11990 }, { "epoch": 14.475558237779119, "grad_norm": 16.70119285583496, "learning_rate": 1.9999712930920767e-05, "loss": 1.5742, "step": 12000 }, { "epoch": 14.475558237779119, "eval_loss": 8.117794036865234, "eval_runtime": 8.1373, "eval_samples_per_second": 85.655, "eval_steps_per_second": 10.814, "step": 12000 }, { "epoch": 14.487628243814122, "grad_norm": 18.232229232788086, "learning_rate": 1.999971268966597e-05, "loss": 1.5283, "step": 12010 }, { "epoch": 14.499698249849125, "grad_norm": 17.983922958374023, "learning_rate": 1.9999712448411176e-05, "loss": 1.5552, "step": 12020 }, { "epoch": 14.511768255884128, "grad_norm": 16.774240493774414, "learning_rate": 1.9999712207156382e-05, "loss": 1.5609, "step": 12030 }, { "epoch": 14.523838261919131, "grad_norm": 16.642826080322266, "learning_rate": 1.9999711965901588e-05, "loss": 1.5709, "step": 12040 }, { "epoch": 14.535908267954134, "grad_norm": 16.849374771118164, "learning_rate": 1.9999711724646794e-05, "loss": 1.5919, "step": 12050 }, { 
"epoch": 14.547978273989138, "grad_norm": 16.860076904296875, "learning_rate": 1.9999711483392e-05, "loss": 1.5572, "step": 12060 }, { "epoch": 14.56004828002414, "grad_norm": 16.99187660217285, "learning_rate": 1.9999711242137207e-05, "loss": 1.5695, "step": 12070 }, { "epoch": 14.572118286059142, "grad_norm": 17.406206130981445, "learning_rate": 1.9999711000882413e-05, "loss": 1.5652, "step": 12080 }, { "epoch": 14.584188292094145, "grad_norm": 16.936796188354492, "learning_rate": 1.999971075962762e-05, "loss": 1.601, "step": 12090 }, { "epoch": 14.596258298129149, "grad_norm": 15.940117835998535, "learning_rate": 1.9999710518372825e-05, "loss": 1.5891, "step": 12100 }, { "epoch": 14.608328304164152, "grad_norm": 17.645469665527344, "learning_rate": 1.999971027711803e-05, "loss": 1.575, "step": 12110 }, { "epoch": 14.620398310199155, "grad_norm": 17.046573638916016, "learning_rate": 1.9999710035863238e-05, "loss": 1.5872, "step": 12120 }, { "epoch": 14.632468316234158, "grad_norm": 17.821149826049805, "learning_rate": 1.9999709794608444e-05, "loss": 1.5496, "step": 12130 }, { "epoch": 14.644538322269161, "grad_norm": 17.931583404541016, "learning_rate": 1.999970955335365e-05, "loss": 1.5904, "step": 12140 }, { "epoch": 14.656608328304165, "grad_norm": 18.01386260986328, "learning_rate": 1.9999709312098856e-05, "loss": 1.5999, "step": 12150 }, { "epoch": 14.668678334339168, "grad_norm": 16.688825607299805, "learning_rate": 1.9999709070844063e-05, "loss": 1.6016, "step": 12160 }, { "epoch": 14.680748340374171, "grad_norm": 16.43915367126465, "learning_rate": 1.999970882958927e-05, "loss": 1.6306, "step": 12170 }, { "epoch": 14.692818346409172, "grad_norm": 17.72555160522461, "learning_rate": 1.9999708588334475e-05, "loss": 1.5963, "step": 12180 }, { "epoch": 14.704888352444176, "grad_norm": 17.55921173095703, "learning_rate": 1.999970834707968e-05, "loss": 1.5981, "step": 12190 }, { "epoch": 14.716958358479179, "grad_norm": 17.681358337402344, "learning_rate": 
1.9999708105824887e-05, "loss": 1.6047, "step": 12200 }, { "epoch": 14.729028364514182, "grad_norm": 17.854795455932617, "learning_rate": 1.9999707864570094e-05, "loss": 1.5998, "step": 12210 }, { "epoch": 14.741098370549185, "grad_norm": 17.674997329711914, "learning_rate": 1.99997076233153e-05, "loss": 1.618, "step": 12220 }, { "epoch": 14.753168376584188, "grad_norm": 17.344860076904297, "learning_rate": 1.9999707382060506e-05, "loss": 1.6433, "step": 12230 }, { "epoch": 14.765238382619192, "grad_norm": 17.467735290527344, "learning_rate": 1.9999707140805712e-05, "loss": 1.6407, "step": 12240 }, { "epoch": 14.777308388654195, "grad_norm": 16.921274185180664, "learning_rate": 1.999970689955092e-05, "loss": 1.6063, "step": 12250 }, { "epoch": 14.789378394689198, "grad_norm": 17.38481903076172, "learning_rate": 1.9999706658296125e-05, "loss": 1.6259, "step": 12260 }, { "epoch": 14.801448400724201, "grad_norm": 17.520204544067383, "learning_rate": 1.999970641704133e-05, "loss": 1.6223, "step": 12270 }, { "epoch": 14.813518406759204, "grad_norm": 18.734834671020508, "learning_rate": 1.9999706175786537e-05, "loss": 1.6512, "step": 12280 }, { "epoch": 14.825588412794206, "grad_norm": 17.4389591217041, "learning_rate": 1.9999705934531743e-05, "loss": 1.6338, "step": 12290 }, { "epoch": 14.837658418829209, "grad_norm": 16.951780319213867, "learning_rate": 1.999970569327695e-05, "loss": 1.6406, "step": 12300 }, { "epoch": 14.849728424864212, "grad_norm": 17.943315505981445, "learning_rate": 1.9999705452022156e-05, "loss": 1.6027, "step": 12310 }, { "epoch": 14.861798430899215, "grad_norm": 16.546611785888672, "learning_rate": 1.9999705210767362e-05, "loss": 1.6207, "step": 12320 }, { "epoch": 14.873868436934218, "grad_norm": 17.49220085144043, "learning_rate": 1.9999704969512568e-05, "loss": 1.6854, "step": 12330 }, { "epoch": 14.885938442969222, "grad_norm": 17.348094940185547, "learning_rate": 1.9999704728257774e-05, "loss": 1.6278, "step": 12340 }, { "epoch": 
14.898008449004225, "grad_norm": 17.209396362304688, "learning_rate": 1.999970448700298e-05, "loss": 1.6746, "step": 12350 }, { "epoch": 14.910078455039228, "grad_norm": 18.653799057006836, "learning_rate": 1.9999704245748187e-05, "loss": 1.6361, "step": 12360 }, { "epoch": 14.922148461074231, "grad_norm": 17.806501388549805, "learning_rate": 1.9999704004493393e-05, "loss": 1.6501, "step": 12370 }, { "epoch": 14.934218467109233, "grad_norm": 19.08165168762207, "learning_rate": 1.99997037632386e-05, "loss": 1.6591, "step": 12380 }, { "epoch": 14.946288473144236, "grad_norm": 16.992876052856445, "learning_rate": 1.9999703521983806e-05, "loss": 1.6594, "step": 12390 }, { "epoch": 14.958358479179239, "grad_norm": 16.570180892944336, "learning_rate": 1.9999703280729012e-05, "loss": 1.6666, "step": 12400 }, { "epoch": 14.970428485214242, "grad_norm": 17.942462921142578, "learning_rate": 1.9999703039474218e-05, "loss": 1.6463, "step": 12410 }, { "epoch": 14.982498491249245, "grad_norm": 17.94098472595215, "learning_rate": 1.999970279821942e-05, "loss": 1.6631, "step": 12420 }, { "epoch": 14.994568497284249, "grad_norm": 18.62997817993164, "learning_rate": 1.9999702556964627e-05, "loss": 1.6636, "step": 12430 }, { "epoch": 15.006035003017502, "grad_norm": 16.949800491333008, "learning_rate": 1.9999702315709833e-05, "loss": 1.424, "step": 12440 }, { "epoch": 15.018105009052505, "grad_norm": 16.295940399169922, "learning_rate": 1.999970207445504e-05, "loss": 1.1172, "step": 12450 }, { "epoch": 15.030175015087508, "grad_norm": 15.727396011352539, "learning_rate": 1.9999701833200246e-05, "loss": 1.1098, "step": 12460 }, { "epoch": 15.042245021122511, "grad_norm": 15.481760025024414, "learning_rate": 1.9999701591945452e-05, "loss": 1.0729, "step": 12470 }, { "epoch": 15.054315027157514, "grad_norm": 16.393665313720703, "learning_rate": 1.9999701350690658e-05, "loss": 1.0984, "step": 12480 }, { "epoch": 15.066385033192516, "grad_norm": 16.506723403930664, "learning_rate": 
1.9999701109435864e-05, "loss": 1.0966, "step": 12490 }, { "epoch": 15.078455039227519, "grad_norm": 16.393360137939453, "learning_rate": 1.999970086818107e-05, "loss": 1.0982, "step": 12500 }, { "epoch": 15.078455039227519, "eval_loss": 8.45795726776123, "eval_runtime": 8.1289, "eval_samples_per_second": 85.743, "eval_steps_per_second": 10.826, "step": 12500 }, { "epoch": 15.090525045262522, "grad_norm": 16.30900001525879, "learning_rate": 1.9999700626926277e-05, "loss": 1.1125, "step": 12510 }, { "epoch": 15.102595051297525, "grad_norm": 16.28884506225586, "learning_rate": 1.9999700385671483e-05, "loss": 1.1293, "step": 12520 }, { "epoch": 15.114665057332529, "grad_norm": 16.470090866088867, "learning_rate": 1.999970014441669e-05, "loss": 1.0881, "step": 12530 }, { "epoch": 15.126735063367532, "grad_norm": 15.561416625976562, "learning_rate": 1.9999699903161895e-05, "loss": 1.1223, "step": 12540 }, { "epoch": 15.138805069402535, "grad_norm": 16.327594757080078, "learning_rate": 1.99996996619071e-05, "loss": 1.1297, "step": 12550 }, { "epoch": 15.150875075437538, "grad_norm": 15.99807357788086, "learning_rate": 1.9999699420652308e-05, "loss": 1.1377, "step": 12560 }, { "epoch": 15.162945081472541, "grad_norm": 16.139680862426758, "learning_rate": 1.9999699179397514e-05, "loss": 1.1246, "step": 12570 }, { "epoch": 15.175015087507544, "grad_norm": 16.089101791381836, "learning_rate": 1.999969893814272e-05, "loss": 1.1378, "step": 12580 }, { "epoch": 15.187085093542546, "grad_norm": 15.106809616088867, "learning_rate": 1.9999698696887926e-05, "loss": 1.123, "step": 12590 }, { "epoch": 15.199155099577549, "grad_norm": 15.911718368530273, "learning_rate": 1.9999698455633133e-05, "loss": 1.1394, "step": 12600 }, { "epoch": 15.211225105612552, "grad_norm": 17.226573944091797, "learning_rate": 1.999969821437834e-05, "loss": 1.1531, "step": 12610 }, { "epoch": 15.223295111647555, "grad_norm": 16.68464469909668, "learning_rate": 1.9999697973123545e-05, "loss": 1.1649, 
"step": 12620 }, { "epoch": 15.235365117682559, "grad_norm": 17.034076690673828, "learning_rate": 1.999969773186875e-05, "loss": 1.1393, "step": 12630 }, { "epoch": 15.247435123717562, "grad_norm": 17.908100128173828, "learning_rate": 1.9999697490613958e-05, "loss": 1.1425, "step": 12640 }, { "epoch": 15.259505129752565, "grad_norm": 16.893447875976562, "learning_rate": 1.9999697249359164e-05, "loss": 1.1693, "step": 12650 }, { "epoch": 15.271575135787568, "grad_norm": 15.493969917297363, "learning_rate": 1.999969700810437e-05, "loss": 1.1518, "step": 12660 }, { "epoch": 15.283645141822571, "grad_norm": 17.2970027923584, "learning_rate": 1.9999696766849573e-05, "loss": 1.1834, "step": 12670 }, { "epoch": 15.295715147857575, "grad_norm": 17.05169677734375, "learning_rate": 1.999969652559478e-05, "loss": 1.1746, "step": 12680 }, { "epoch": 15.307785153892578, "grad_norm": 16.846925735473633, "learning_rate": 1.9999696284339985e-05, "loss": 1.1909, "step": 12690 }, { "epoch": 15.31985515992758, "grad_norm": 17.667219161987305, "learning_rate": 1.999969604308519e-05, "loss": 1.1719, "step": 12700 }, { "epoch": 15.331925165962582, "grad_norm": 18.2103328704834, "learning_rate": 1.9999695801830398e-05, "loss": 1.1949, "step": 12710 }, { "epoch": 15.343995171997586, "grad_norm": 16.932260513305664, "learning_rate": 1.9999695560575604e-05, "loss": 1.185, "step": 12720 }, { "epoch": 15.356065178032589, "grad_norm": 17.518117904663086, "learning_rate": 1.999969531932081e-05, "loss": 1.195, "step": 12730 }, { "epoch": 15.368135184067592, "grad_norm": 16.59065818786621, "learning_rate": 1.9999695078066016e-05, "loss": 1.1915, "step": 12740 }, { "epoch": 15.380205190102595, "grad_norm": 17.713390350341797, "learning_rate": 1.9999694836811223e-05, "loss": 1.2372, "step": 12750 }, { "epoch": 15.392275196137598, "grad_norm": 17.212448120117188, "learning_rate": 1.999969459555643e-05, "loss": 1.204, "step": 12760 }, { "epoch": 15.404345202172602, "grad_norm": 17.794832229614258, 
"learning_rate": 1.9999694354301635e-05, "loss": 1.2206, "step": 12770 }, { "epoch": 15.416415208207605, "grad_norm": 17.449689865112305, "learning_rate": 1.999969411304684e-05, "loss": 1.2067, "step": 12780 }, { "epoch": 15.428485214242608, "grad_norm": 17.65974235534668, "learning_rate": 1.9999693871792047e-05, "loss": 1.205, "step": 12790 }, { "epoch": 15.44055522027761, "grad_norm": 19.41393280029297, "learning_rate": 1.9999693630537257e-05, "loss": 1.1931, "step": 12800 }, { "epoch": 15.452625226312612, "grad_norm": 17.295326232910156, "learning_rate": 1.9999693389282463e-05, "loss": 1.2282, "step": 12810 }, { "epoch": 15.464695232347616, "grad_norm": 17.229963302612305, "learning_rate": 1.999969314802767e-05, "loss": 1.219, "step": 12820 }, { "epoch": 15.476765238382619, "grad_norm": 17.714645385742188, "learning_rate": 1.9999692906772876e-05, "loss": 1.2235, "step": 12830 }, { "epoch": 15.488835244417622, "grad_norm": 17.242633819580078, "learning_rate": 1.999969266551808e-05, "loss": 1.2173, "step": 12840 }, { "epoch": 15.500905250452625, "grad_norm": 17.798999786376953, "learning_rate": 1.9999692424263285e-05, "loss": 1.2168, "step": 12850 }, { "epoch": 15.512975256487628, "grad_norm": 17.012928009033203, "learning_rate": 1.999969218300849e-05, "loss": 1.2283, "step": 12860 }, { "epoch": 15.525045262522632, "grad_norm": 19.382320404052734, "learning_rate": 1.9999691941753697e-05, "loss": 1.2339, "step": 12870 }, { "epoch": 15.537115268557635, "grad_norm": 17.95484161376953, "learning_rate": 1.9999691700498903e-05, "loss": 1.2231, "step": 12880 }, { "epoch": 15.549185274592638, "grad_norm": 17.46329689025879, "learning_rate": 1.999969145924411e-05, "loss": 1.22, "step": 12890 }, { "epoch": 15.561255280627641, "grad_norm": 17.466644287109375, "learning_rate": 1.9999691217989316e-05, "loss": 1.2356, "step": 12900 }, { "epoch": 15.573325286662643, "grad_norm": 17.08428382873535, "learning_rate": 1.9999690976734522e-05, "loss": 1.2481, "step": 12910 }, { 
"epoch": 15.585395292697646, "grad_norm": 17.76215362548828, "learning_rate": 1.9999690735479728e-05, "loss": 1.2464, "step": 12920 }, { "epoch": 15.597465298732649, "grad_norm": 17.015167236328125, "learning_rate": 1.9999690494224934e-05, "loss": 1.2735, "step": 12930 }, { "epoch": 15.609535304767652, "grad_norm": 18.406227111816406, "learning_rate": 1.999969025297014e-05, "loss": 1.2376, "step": 12940 }, { "epoch": 15.621605310802655, "grad_norm": 17.442489624023438, "learning_rate": 1.9999690011715347e-05, "loss": 1.2723, "step": 12950 }, { "epoch": 15.633675316837659, "grad_norm": 18.513765335083008, "learning_rate": 1.9999689770460553e-05, "loss": 1.2382, "step": 12960 }, { "epoch": 15.645745322872662, "grad_norm": 19.88123893737793, "learning_rate": 1.999968952920576e-05, "loss": 1.2853, "step": 12970 }, { "epoch": 15.657815328907665, "grad_norm": 17.978071212768555, "learning_rate": 1.9999689287950965e-05, "loss": 1.2709, "step": 12980 }, { "epoch": 15.669885334942668, "grad_norm": 17.652408599853516, "learning_rate": 1.9999689046696172e-05, "loss": 1.2682, "step": 12990 }, { "epoch": 15.681955340977671, "grad_norm": 17.781225204467773, "learning_rate": 1.9999688805441378e-05, "loss": 1.2516, "step": 13000 }, { "epoch": 15.681955340977671, "eval_loss": 8.596538543701172, "eval_runtime": 8.1427, "eval_samples_per_second": 85.599, "eval_steps_per_second": 10.807, "step": 13000 }, { "epoch": 15.694025347012673, "grad_norm": 18.202226638793945, "learning_rate": 1.9999688564186584e-05, "loss": 1.2514, "step": 13010 }, { "epoch": 15.706095353047676, "grad_norm": 17.03910255432129, "learning_rate": 1.999968832293179e-05, "loss": 1.2699, "step": 13020 }, { "epoch": 15.718165359082679, "grad_norm": 18.2091007232666, "learning_rate": 1.9999688081676997e-05, "loss": 1.2851, "step": 13030 }, { "epoch": 15.730235365117682, "grad_norm": 17.273143768310547, "learning_rate": 1.9999687840422203e-05, "loss": 1.2723, "step": 13040 }, { "epoch": 15.742305371152685, "grad_norm": 
17.296476364135742, "learning_rate": 1.999968759916741e-05, "loss": 1.3076, "step": 13050 }, { "epoch": 15.754375377187689, "grad_norm": 18.42511558532715, "learning_rate": 1.9999687357912615e-05, "loss": 1.272, "step": 13060 }, { "epoch": 15.766445383222692, "grad_norm": 17.971965789794922, "learning_rate": 1.999968711665782e-05, "loss": 1.2916, "step": 13070 }, { "epoch": 15.778515389257695, "grad_norm": 17.56062126159668, "learning_rate": 1.9999686875403028e-05, "loss": 1.3061, "step": 13080 }, { "epoch": 15.790585395292698, "grad_norm": 17.18064308166504, "learning_rate": 1.999968663414823e-05, "loss": 1.2978, "step": 13090 }, { "epoch": 15.802655401327701, "grad_norm": 17.530012130737305, "learning_rate": 1.9999686392893437e-05, "loss": 1.3051, "step": 13100 }, { "epoch": 15.814725407362705, "grad_norm": 16.92384147644043, "learning_rate": 1.9999686151638643e-05, "loss": 1.2947, "step": 13110 }, { "epoch": 15.826795413397706, "grad_norm": 18.93626594543457, "learning_rate": 1.999968591038385e-05, "loss": 1.3348, "step": 13120 }, { "epoch": 15.83886541943271, "grad_norm": 17.209171295166016, "learning_rate": 1.9999685669129055e-05, "loss": 1.2807, "step": 13130 }, { "epoch": 15.850935425467712, "grad_norm": 18.97437286376953, "learning_rate": 1.999968542787426e-05, "loss": 1.304, "step": 13140 }, { "epoch": 15.863005431502716, "grad_norm": 18.758535385131836, "learning_rate": 1.9999685186619468e-05, "loss": 1.2959, "step": 13150 }, { "epoch": 15.875075437537719, "grad_norm": 18.58016586303711, "learning_rate": 1.9999684945364674e-05, "loss": 1.3178, "step": 13160 }, { "epoch": 15.887145443572722, "grad_norm": 18.681476593017578, "learning_rate": 1.999968470410988e-05, "loss": 1.2997, "step": 13170 }, { "epoch": 15.899215449607725, "grad_norm": 18.430103302001953, "learning_rate": 1.9999684462855086e-05, "loss": 1.2878, "step": 13180 }, { "epoch": 15.911285455642728, "grad_norm": 16.966285705566406, "learning_rate": 1.9999684221600293e-05, "loss": 1.2944, 
"step": 13190 }, { "epoch": 15.923355461677732, "grad_norm": 20.103925704956055, "learning_rate": 1.99996839803455e-05, "loss": 1.3084, "step": 13200 }, { "epoch": 15.935425467712733, "grad_norm": 18.114517211914062, "learning_rate": 1.9999683739090705e-05, "loss": 1.3596, "step": 13210 }, { "epoch": 15.947495473747736, "grad_norm": 18.31344223022461, "learning_rate": 1.999968349783591e-05, "loss": 1.3199, "step": 13220 }, { "epoch": 15.95956547978274, "grad_norm": 17.438278198242188, "learning_rate": 1.9999683256581117e-05, "loss": 1.3122, "step": 13230 }, { "epoch": 15.971635485817743, "grad_norm": 17.11007308959961, "learning_rate": 1.9999683015326324e-05, "loss": 1.3206, "step": 13240 }, { "epoch": 15.983705491852746, "grad_norm": 19.948625564575195, "learning_rate": 1.999968277407153e-05, "loss": 1.3087, "step": 13250 }, { "epoch": 15.995775497887749, "grad_norm": 17.39379119873047, "learning_rate": 1.9999682532816736e-05, "loss": 1.3428, "step": 13260 }, { "epoch": 16.007242003621002, "grad_norm": 15.840539932250977, "learning_rate": 1.9999682291561942e-05, "loss": 1.039, "step": 13270 }, { "epoch": 16.019312009656005, "grad_norm": 15.432873725891113, "learning_rate": 1.999968205030715e-05, "loss": 0.843, "step": 13280 }, { "epoch": 16.03138201569101, "grad_norm": 15.08594799041748, "learning_rate": 1.9999681809052355e-05, "loss": 0.851, "step": 13290 }, { "epoch": 16.04345202172601, "grad_norm": 17.031837463378906, "learning_rate": 1.999968156779756e-05, "loss": 0.846, "step": 13300 }, { "epoch": 16.055522027761015, "grad_norm": 15.916180610656738, "learning_rate": 1.9999681326542767e-05, "loss": 0.8414, "step": 13310 }, { "epoch": 16.067592033796018, "grad_norm": 15.468609809875488, "learning_rate": 1.9999681085287973e-05, "loss": 0.8566, "step": 13320 }, { "epoch": 16.07966203983102, "grad_norm": 15.98613452911377, "learning_rate": 1.999968084403318e-05, "loss": 0.8567, "step": 13330 }, { "epoch": 16.091732045866024, "grad_norm": 16.31556510925293, 
"learning_rate": 1.9999680602778386e-05, "loss": 0.8711, "step": 13340 }, { "epoch": 16.103802051901027, "grad_norm": 16.000402450561523, "learning_rate": 1.9999680361523592e-05, "loss": 0.84, "step": 13350 }, { "epoch": 16.11587205793603, "grad_norm": 17.209810256958008, "learning_rate": 1.9999680120268798e-05, "loss": 0.8614, "step": 13360 }, { "epoch": 16.12794206397103, "grad_norm": 17.214191436767578, "learning_rate": 1.9999679879014005e-05, "loss": 0.872, "step": 13370 }, { "epoch": 16.140012070006033, "grad_norm": 17.32999610900879, "learning_rate": 1.999967963775921e-05, "loss": 0.8833, "step": 13380 }, { "epoch": 16.152082076041037, "grad_norm": 17.02309226989746, "learning_rate": 1.9999679396504417e-05, "loss": 0.8872, "step": 13390 }, { "epoch": 16.16415208207604, "grad_norm": 16.728313446044922, "learning_rate": 1.9999679155249623e-05, "loss": 0.9015, "step": 13400 }, { "epoch": 16.176222088111043, "grad_norm": 16.583148956298828, "learning_rate": 1.999967891399483e-05, "loss": 0.8895, "step": 13410 }, { "epoch": 16.188292094146046, "grad_norm": 16.60786247253418, "learning_rate": 1.9999678672740036e-05, "loss": 0.8864, "step": 13420 }, { "epoch": 16.20036210018105, "grad_norm": 16.727983474731445, "learning_rate": 1.9999678431485242e-05, "loss": 0.8857, "step": 13430 }, { "epoch": 16.212432106216053, "grad_norm": 16.44685173034668, "learning_rate": 1.9999678190230448e-05, "loss": 0.9152, "step": 13440 }, { "epoch": 16.224502112251056, "grad_norm": 17.312265396118164, "learning_rate": 1.9999677948975654e-05, "loss": 0.9129, "step": 13450 }, { "epoch": 16.23657211828606, "grad_norm": 17.671199798583984, "learning_rate": 1.999967770772086e-05, "loss": 0.8946, "step": 13460 }, { "epoch": 16.248642124321062, "grad_norm": 16.878517150878906, "learning_rate": 1.9999677466466067e-05, "loss": 0.917, "step": 13470 }, { "epoch": 16.260712130356065, "grad_norm": 17.080251693725586, "learning_rate": 1.9999677225211273e-05, "loss": 0.8903, "step": 13480 }, { 
"epoch": 16.27278213639107, "grad_norm": 17.15581512451172, "learning_rate": 1.999967698395648e-05, "loss": 0.9029, "step": 13490 }, { "epoch": 16.28485214242607, "grad_norm": 17.39693260192871, "learning_rate": 1.9999676742701682e-05, "loss": 0.9132, "step": 13500 }, { "epoch": 16.28485214242607, "eval_loss": 8.924986839294434, "eval_runtime": 8.1241, "eval_samples_per_second": 85.794, "eval_steps_per_second": 10.832, "step": 13500 }, { "epoch": 16.296922148461075, "grad_norm": 16.47953224182129, "learning_rate": 1.9999676501446888e-05, "loss": 0.9172, "step": 13510 }, { "epoch": 16.308992154496078, "grad_norm": 17.444242477416992, "learning_rate": 1.9999676260192094e-05, "loss": 0.9289, "step": 13520 }, { "epoch": 16.32106216053108, "grad_norm": 16.086904525756836, "learning_rate": 1.99996760189373e-05, "loss": 0.9191, "step": 13530 }, { "epoch": 16.333132166566084, "grad_norm": 16.812746047973633, "learning_rate": 1.9999675777682507e-05, "loss": 0.8875, "step": 13540 }, { "epoch": 16.345202172601088, "grad_norm": 16.320812225341797, "learning_rate": 1.9999675536427713e-05, "loss": 0.9434, "step": 13550 }, { "epoch": 16.35727217863609, "grad_norm": 18.76348876953125, "learning_rate": 1.999967529517292e-05, "loss": 0.9098, "step": 13560 }, { "epoch": 16.369342184671094, "grad_norm": 17.36586570739746, "learning_rate": 1.9999675053918125e-05, "loss": 0.9385, "step": 13570 }, { "epoch": 16.381412190706094, "grad_norm": 16.559799194335938, "learning_rate": 1.999967481266333e-05, "loss": 0.932, "step": 13580 }, { "epoch": 16.393482196741097, "grad_norm": 16.357595443725586, "learning_rate": 1.9999674571408538e-05, "loss": 0.9635, "step": 13590 }, { "epoch": 16.4055522027761, "grad_norm": 18.56206512451172, "learning_rate": 1.9999674330153744e-05, "loss": 0.9362, "step": 13600 }, { "epoch": 16.417622208811103, "grad_norm": 17.07723045349121, "learning_rate": 1.999967408889895e-05, "loss": 0.9324, "step": 13610 }, { "epoch": 16.429692214846106, "grad_norm": 
17.437870025634766, "learning_rate": 1.9999673847644157e-05, "loss": 0.9463, "step": 13620 }, { "epoch": 16.44176222088111, "grad_norm": 17.553621292114258, "learning_rate": 1.9999673606389363e-05, "loss": 0.9598, "step": 13630 }, { "epoch": 16.453832226916113, "grad_norm": 16.932971954345703, "learning_rate": 1.999967336513457e-05, "loss": 0.9261, "step": 13640 }, { "epoch": 16.465902232951116, "grad_norm": 18.070133209228516, "learning_rate": 1.9999673123879775e-05, "loss": 0.9647, "step": 13650 }, { "epoch": 16.47797223898612, "grad_norm": 17.476158142089844, "learning_rate": 1.999967288262498e-05, "loss": 0.9576, "step": 13660 }, { "epoch": 16.490042245021122, "grad_norm": 17.08171272277832, "learning_rate": 1.9999672641370188e-05, "loss": 0.9621, "step": 13670 }, { "epoch": 16.502112251056126, "grad_norm": 18.153409957885742, "learning_rate": 1.9999672400115394e-05, "loss": 0.9686, "step": 13680 }, { "epoch": 16.51418225709113, "grad_norm": 17.661447525024414, "learning_rate": 1.99996721588606e-05, "loss": 0.9618, "step": 13690 }, { "epoch": 16.526252263126132, "grad_norm": 17.156770706176758, "learning_rate": 1.9999671917605806e-05, "loss": 0.9662, "step": 13700 }, { "epoch": 16.538322269161135, "grad_norm": 19.34084129333496, "learning_rate": 1.9999671676351012e-05, "loss": 0.9743, "step": 13710 }, { "epoch": 16.55039227519614, "grad_norm": 17.663551330566406, "learning_rate": 1.999967143509622e-05, "loss": 0.9596, "step": 13720 }, { "epoch": 16.56246228123114, "grad_norm": 18.790443420410156, "learning_rate": 1.9999671193841425e-05, "loss": 0.9778, "step": 13730 }, { "epoch": 16.574532287266145, "grad_norm": 17.797061920166016, "learning_rate": 1.999967095258663e-05, "loss": 0.9862, "step": 13740 }, { "epoch": 16.586602293301148, "grad_norm": 18.054203033447266, "learning_rate": 1.9999670711331834e-05, "loss": 0.9784, "step": 13750 }, { "epoch": 16.59867229933615, "grad_norm": 17.62456512451172, "learning_rate": 1.999967047007704e-05, "loss": 1.0002, 
"step": 13760 }, { "epoch": 16.610742305371154, "grad_norm": 17.98691177368164, "learning_rate": 1.9999670228822246e-05, "loss": 0.9891, "step": 13770 }, { "epoch": 16.622812311406157, "grad_norm": 17.790185928344727, "learning_rate": 1.9999669987567453e-05, "loss": 1.0038, "step": 13780 }, { "epoch": 16.634882317441157, "grad_norm": 17.9913330078125, "learning_rate": 1.999966974631266e-05, "loss": 0.9927, "step": 13790 }, { "epoch": 16.64695232347616, "grad_norm": 17.59669303894043, "learning_rate": 1.9999669505057865e-05, "loss": 0.9708, "step": 13800 }, { "epoch": 16.659022329511163, "grad_norm": 18.8429012298584, "learning_rate": 1.999966926380307e-05, "loss": 0.9872, "step": 13810 }, { "epoch": 16.671092335546167, "grad_norm": 18.51820182800293, "learning_rate": 1.9999669022548277e-05, "loss": 0.9956, "step": 13820 }, { "epoch": 16.68316234158117, "grad_norm": 17.86749839782715, "learning_rate": 1.9999668781293484e-05, "loss": 1.0076, "step": 13830 }, { "epoch": 16.695232347616173, "grad_norm": 17.357038497924805, "learning_rate": 1.999966854003869e-05, "loss": 0.9975, "step": 13840 }, { "epoch": 16.707302353651176, "grad_norm": 17.512685775756836, "learning_rate": 1.9999668298783896e-05, "loss": 0.9831, "step": 13850 }, { "epoch": 16.71937235968618, "grad_norm": 17.93054962158203, "learning_rate": 1.9999668057529102e-05, "loss": 1.0102, "step": 13860 }, { "epoch": 16.731442365721183, "grad_norm": 17.406686782836914, "learning_rate": 1.999966781627431e-05, "loss": 1.0041, "step": 13870 }, { "epoch": 16.743512371756186, "grad_norm": 18.369571685791016, "learning_rate": 1.9999667575019518e-05, "loss": 1.0185, "step": 13880 }, { "epoch": 16.75558237779119, "grad_norm": 18.10308837890625, "learning_rate": 1.9999667333764724e-05, "loss": 1.0066, "step": 13890 }, { "epoch": 16.767652383826192, "grad_norm": 18.16658592224121, "learning_rate": 1.999966709250993e-05, "loss": 1.0125, "step": 13900 }, { "epoch": 16.779722389861195, "grad_norm": 17.701480865478516, 
"learning_rate": 1.9999666851255137e-05, "loss": 1.0164, "step": 13910 }, { "epoch": 16.7917923958962, "grad_norm": 17.855241775512695, "learning_rate": 1.999966661000034e-05, "loss": 1.0169, "step": 13920 }, { "epoch": 16.8038624019312, "grad_norm": 17.47166633605957, "learning_rate": 1.9999666368745546e-05, "loss": 1.0272, "step": 13930 }, { "epoch": 16.815932407966205, "grad_norm": 16.703739166259766, "learning_rate": 1.9999666127490752e-05, "loss": 1.0266, "step": 13940 }, { "epoch": 16.828002414001208, "grad_norm": 18.329273223876953, "learning_rate": 1.9999665886235958e-05, "loss": 1.0323, "step": 13950 }, { "epoch": 16.84007242003621, "grad_norm": 17.822275161743164, "learning_rate": 1.9999665644981164e-05, "loss": 1.0187, "step": 13960 }, { "epoch": 16.852142426071214, "grad_norm": 17.312480926513672, "learning_rate": 1.999966540372637e-05, "loss": 1.0396, "step": 13970 }, { "epoch": 16.864212432106218, "grad_norm": 17.41130256652832, "learning_rate": 1.9999665162471577e-05, "loss": 1.0357, "step": 13980 }, { "epoch": 16.876282438141217, "grad_norm": 17.173137664794922, "learning_rate": 1.9999664921216783e-05, "loss": 1.044, "step": 13990 }, { "epoch": 16.88835244417622, "grad_norm": 19.371667861938477, "learning_rate": 1.999966467996199e-05, "loss": 1.0346, "step": 14000 }, { "epoch": 16.88835244417622, "eval_loss": 9.004250526428223, "eval_runtime": 8.1257, "eval_samples_per_second": 85.778, "eval_steps_per_second": 10.83, "step": 14000 }, { "epoch": 16.900422450211224, "grad_norm": 19.508970260620117, "learning_rate": 1.9999664438707196e-05, "loss": 1.0513, "step": 14010 }, { "epoch": 16.912492456246227, "grad_norm": 17.602014541625977, "learning_rate": 1.9999664197452402e-05, "loss": 1.0309, "step": 14020 }, { "epoch": 16.92456246228123, "grad_norm": 19.011781692504883, "learning_rate": 1.9999663956197608e-05, "loss": 1.047, "step": 14030 }, { "epoch": 16.936632468316233, "grad_norm": 18.08753776550293, "learning_rate": 1.9999663714942814e-05, "loss": 
1.0317, "step": 14040 }, { "epoch": 16.948702474351236, "grad_norm": 17.175758361816406, "learning_rate": 1.999966347368802e-05, "loss": 1.0376, "step": 14050 }, { "epoch": 16.96077248038624, "grad_norm": 17.344261169433594, "learning_rate": 1.9999663232433227e-05, "loss": 1.0397, "step": 14060 }, { "epoch": 16.972842486421243, "grad_norm": 17.587818145751953, "learning_rate": 1.9999662991178433e-05, "loss": 1.057, "step": 14070 }, { "epoch": 16.984912492456246, "grad_norm": 19.362796783447266, "learning_rate": 1.999966274992364e-05, "loss": 1.0668, "step": 14080 }, { "epoch": 16.99698249849125, "grad_norm": 18.327550888061523, "learning_rate": 1.9999662508668845e-05, "loss": 1.0631, "step": 14090 }, { "epoch": 17.008449004224502, "grad_norm": 14.840453147888184, "learning_rate": 1.999966226741405e-05, "loss": 0.7984, "step": 14100 }, { "epoch": 17.020519010259505, "grad_norm": 14.721022605895996, "learning_rate": 1.9999662026159258e-05, "loss": 0.6676, "step": 14110 }, { "epoch": 17.03258901629451, "grad_norm": 15.734292030334473, "learning_rate": 1.9999661784904464e-05, "loss": 0.6426, "step": 14120 }, { "epoch": 17.044659022329512, "grad_norm": 15.7647123336792, "learning_rate": 1.999966154364967e-05, "loss": 0.6687, "step": 14130 }, { "epoch": 17.056729028364515, "grad_norm": 14.42798137664795, "learning_rate": 1.9999661302394876e-05, "loss": 0.6519, "step": 14140 }, { "epoch": 17.068799034399518, "grad_norm": 14.518383979797363, "learning_rate": 1.9999661061140083e-05, "loss": 0.6567, "step": 14150 }, { "epoch": 17.08086904043452, "grad_norm": 15.399825096130371, "learning_rate": 1.999966081988529e-05, "loss": 0.6664, "step": 14160 }, { "epoch": 17.092939046469525, "grad_norm": 15.43704891204834, "learning_rate": 1.999966057863049e-05, "loss": 0.6873, "step": 14170 }, { "epoch": 17.105009052504528, "grad_norm": 15.850350379943848, "learning_rate": 1.9999660337375698e-05, "loss": 0.6621, "step": 14180 }, { "epoch": 17.11707905853953, "grad_norm": 
16.271047592163086, "learning_rate": 1.9999660096120904e-05, "loss": 0.6663, "step": 14190 }, { "epoch": 17.12914906457453, "grad_norm": 16.89712905883789, "learning_rate": 1.999965985486611e-05, "loss": 0.7055, "step": 14200 }, { "epoch": 17.141219070609534, "grad_norm": 16.674013137817383, "learning_rate": 1.9999659613611316e-05, "loss": 0.6895, "step": 14210 }, { "epoch": 17.153289076644537, "grad_norm": 15.915021896362305, "learning_rate": 1.9999659372356523e-05, "loss": 0.6931, "step": 14220 }, { "epoch": 17.16535908267954, "grad_norm": 15.039719581604004, "learning_rate": 1.999965913110173e-05, "loss": 0.7011, "step": 14230 }, { "epoch": 17.177429088714543, "grad_norm": 16.256803512573242, "learning_rate": 1.9999658889846935e-05, "loss": 0.6776, "step": 14240 }, { "epoch": 17.189499094749547, "grad_norm": 16.66473388671875, "learning_rate": 1.999965864859214e-05, "loss": 0.704, "step": 14250 }, { "epoch": 17.20156910078455, "grad_norm": 15.382439613342285, "learning_rate": 1.9999658407337348e-05, "loss": 0.71, "step": 14260 }, { "epoch": 17.213639106819553, "grad_norm": 16.456069946289062, "learning_rate": 1.9999658166082554e-05, "loss": 0.6842, "step": 14270 }, { "epoch": 17.225709112854556, "grad_norm": 16.363805770874023, "learning_rate": 1.999965792482776e-05, "loss": 0.7121, "step": 14280 }, { "epoch": 17.23777911888956, "grad_norm": 16.51167869567871, "learning_rate": 1.9999657683572966e-05, "loss": 0.7091, "step": 14290 }, { "epoch": 17.249849124924562, "grad_norm": 16.534093856811523, "learning_rate": 1.9999657442318172e-05, "loss": 0.7209, "step": 14300 }, { "epoch": 17.261919130959566, "grad_norm": 17.780738830566406, "learning_rate": 1.999965720106338e-05, "loss": 0.7043, "step": 14310 }, { "epoch": 17.27398913699457, "grad_norm": 15.72205924987793, "learning_rate": 1.9999656959808585e-05, "loss": 0.7055, "step": 14320 }, { "epoch": 17.286059143029572, "grad_norm": 17.050315856933594, "learning_rate": 1.999965671855379e-05, "loss": 0.7154, "step": 
14330 }, { "epoch": 17.298129149064575, "grad_norm": 16.24379539489746, "learning_rate": 1.9999656477298997e-05, "loss": 0.7213, "step": 14340 }, { "epoch": 17.31019915509958, "grad_norm": 16.905027389526367, "learning_rate": 1.9999656236044203e-05, "loss": 0.73, "step": 14350 }, { "epoch": 17.32226916113458, "grad_norm": 16.083826065063477, "learning_rate": 1.999965599478941e-05, "loss": 0.716, "step": 14360 }, { "epoch": 17.334339167169585, "grad_norm": 17.053205490112305, "learning_rate": 1.9999655753534616e-05, "loss": 0.7165, "step": 14370 }, { "epoch": 17.346409173204588, "grad_norm": 16.19652557373047, "learning_rate": 1.9999655512279822e-05, "loss": 0.72, "step": 14380 }, { "epoch": 17.35847917923959, "grad_norm": 17.25996971130371, "learning_rate": 1.999965527102503e-05, "loss": 0.7418, "step": 14390 }, { "epoch": 17.370549185274594, "grad_norm": 17.70363426208496, "learning_rate": 1.9999655029770235e-05, "loss": 0.7403, "step": 14400 }, { "epoch": 17.382619191309594, "grad_norm": 17.94729232788086, "learning_rate": 1.999965478851544e-05, "loss": 0.7296, "step": 14410 }, { "epoch": 17.394689197344597, "grad_norm": 18.2227840423584, "learning_rate": 1.9999654547260647e-05, "loss": 0.7273, "step": 14420 }, { "epoch": 17.4067592033796, "grad_norm": 17.615224838256836, "learning_rate": 1.9999654306005853e-05, "loss": 0.7344, "step": 14430 }, { "epoch": 17.418829209414604, "grad_norm": 18.146961212158203, "learning_rate": 1.999965406475106e-05, "loss": 0.7376, "step": 14440 }, { "epoch": 17.430899215449607, "grad_norm": 17.077978134155273, "learning_rate": 1.9999653823496266e-05, "loss": 0.7668, "step": 14450 }, { "epoch": 17.44296922148461, "grad_norm": 16.44934844970703, "learning_rate": 1.9999653582241472e-05, "loss": 0.7516, "step": 14460 }, { "epoch": 17.455039227519613, "grad_norm": 18.482961654663086, "learning_rate": 1.9999653340986678e-05, "loss": 0.7473, "step": 14470 }, { "epoch": 17.467109233554616, "grad_norm": 16.791881561279297, "learning_rate": 
1.9999653099731884e-05, "loss": 0.7324, "step": 14480 }, { "epoch": 17.47917923958962, "grad_norm": 16.928783416748047, "learning_rate": 1.999965285847709e-05, "loss": 0.7519, "step": 14490 }, { "epoch": 17.491249245624623, "grad_norm": 16.890684127807617, "learning_rate": 1.9999652617222297e-05, "loss": 0.759, "step": 14500 }, { "epoch": 17.491249245624623, "eval_loss": 9.31740665435791, "eval_runtime": 8.1266, "eval_samples_per_second": 85.767, "eval_steps_per_second": 10.829, "step": 14500 }, { "epoch": 17.503319251659626, "grad_norm": 17.45193862915039, "learning_rate": 1.9999652375967503e-05, "loss": 0.7479, "step": 14510 }, { "epoch": 17.51538925769463, "grad_norm": 17.62733268737793, "learning_rate": 1.999965213471271e-05, "loss": 0.7711, "step": 14520 }, { "epoch": 17.527459263729632, "grad_norm": 18.13010025024414, "learning_rate": 1.9999651893457915e-05, "loss": 0.7743, "step": 14530 }, { "epoch": 17.539529269764635, "grad_norm": 17.265560150146484, "learning_rate": 1.999965165220312e-05, "loss": 0.7664, "step": 14540 }, { "epoch": 17.55159927579964, "grad_norm": 17.847625732421875, "learning_rate": 1.9999651410948328e-05, "loss": 0.7686, "step": 14550 }, { "epoch": 17.563669281834642, "grad_norm": 16.297710418701172, "learning_rate": 1.9999651169693534e-05, "loss": 0.7568, "step": 14560 }, { "epoch": 17.575739287869645, "grad_norm": 17.006595611572266, "learning_rate": 1.999965092843874e-05, "loss": 0.7671, "step": 14570 }, { "epoch": 17.587809293904648, "grad_norm": 17.51896858215332, "learning_rate": 1.9999650687183943e-05, "loss": 0.7583, "step": 14580 }, { "epoch": 17.59987929993965, "grad_norm": 18.224166870117188, "learning_rate": 1.999965044592915e-05, "loss": 0.7789, "step": 14590 }, { "epoch": 17.611949305974655, "grad_norm": 17.820907592773438, "learning_rate": 1.9999650204674355e-05, "loss": 0.7765, "step": 14600 }, { "epoch": 17.624019312009658, "grad_norm": 17.61714744567871, "learning_rate": 1.999964996341956e-05, "loss": 0.781, "step": 
14610 }, { "epoch": 17.636089318044657, "grad_norm": 18.082042694091797, "learning_rate": 1.9999649722164768e-05, "loss": 0.7757, "step": 14620 }, { "epoch": 17.64815932407966, "grad_norm": 17.29634666442871, "learning_rate": 1.9999649480909974e-05, "loss": 0.7866, "step": 14630 }, { "epoch": 17.660229330114664, "grad_norm": 17.845888137817383, "learning_rate": 1.999964923965518e-05, "loss": 0.7803, "step": 14640 }, { "epoch": 17.672299336149667, "grad_norm": 17.737060546875, "learning_rate": 1.9999648998400387e-05, "loss": 0.7856, "step": 14650 }, { "epoch": 17.68436934218467, "grad_norm": 17.980716705322266, "learning_rate": 1.9999648757145593e-05, "loss": 0.7903, "step": 14660 }, { "epoch": 17.696439348219673, "grad_norm": 18.24843978881836, "learning_rate": 1.99996485158908e-05, "loss": 0.8039, "step": 14670 }, { "epoch": 17.708509354254677, "grad_norm": 18.473129272460938, "learning_rate": 1.9999648274636005e-05, "loss": 0.786, "step": 14680 }, { "epoch": 17.72057936028968, "grad_norm": 17.2807674407959, "learning_rate": 1.999964803338121e-05, "loss": 0.7956, "step": 14690 }, { "epoch": 17.732649366324683, "grad_norm": 18.512414932250977, "learning_rate": 1.9999647792126418e-05, "loss": 0.8048, "step": 14700 }, { "epoch": 17.744719372359686, "grad_norm": 18.359773635864258, "learning_rate": 1.9999647550871624e-05, "loss": 0.7988, "step": 14710 }, { "epoch": 17.75678937839469, "grad_norm": 16.416231155395508, "learning_rate": 1.999964730961683e-05, "loss": 0.7952, "step": 14720 }, { "epoch": 17.768859384429692, "grad_norm": 17.91224479675293, "learning_rate": 1.9999647068362036e-05, "loss": 0.7894, "step": 14730 }, { "epoch": 17.780929390464696, "grad_norm": 18.40526008605957, "learning_rate": 1.9999646827107242e-05, "loss": 0.8168, "step": 14740 }, { "epoch": 17.7929993964997, "grad_norm": 16.9232120513916, "learning_rate": 1.999964658585245e-05, "loss": 0.7929, "step": 14750 }, { "epoch": 17.805069402534702, "grad_norm": 18.067790985107422, "learning_rate": 
1.9999646344597655e-05, "loss": 0.8139, "step": 14760 }, { "epoch": 17.817139408569705, "grad_norm": 17.234561920166016, "learning_rate": 1.999964610334286e-05, "loss": 0.8257, "step": 14770 }, { "epoch": 17.82920941460471, "grad_norm": 17.19988441467285, "learning_rate": 1.9999645862088067e-05, "loss": 0.8144, "step": 14780 }, { "epoch": 17.84127942063971, "grad_norm": 18.185691833496094, "learning_rate": 1.9999645620833274e-05, "loss": 0.8155, "step": 14790 }, { "epoch": 17.853349426674715, "grad_norm": 18.747617721557617, "learning_rate": 1.999964537957848e-05, "loss": 0.8199, "step": 14800 }, { "epoch": 17.865419432709718, "grad_norm": 18.988122940063477, "learning_rate": 1.9999645138323686e-05, "loss": 0.8089, "step": 14810 }, { "epoch": 17.877489438744718, "grad_norm": 18.598392486572266, "learning_rate": 1.9999644897068892e-05, "loss": 0.8237, "step": 14820 }, { "epoch": 17.88955944477972, "grad_norm": 17.785249710083008, "learning_rate": 1.9999644655814095e-05, "loss": 0.8393, "step": 14830 }, { "epoch": 17.901629450814724, "grad_norm": 19.032459259033203, "learning_rate": 1.99996444145593e-05, "loss": 0.8424, "step": 14840 }, { "epoch": 17.913699456849727, "grad_norm": 16.76453399658203, "learning_rate": 1.9999644173304507e-05, "loss": 0.8389, "step": 14850 }, { "epoch": 17.92576946288473, "grad_norm": 17.739147186279297, "learning_rate": 1.9999643932049714e-05, "loss": 0.8294, "step": 14860 }, { "epoch": 17.937839468919734, "grad_norm": 16.759464263916016, "learning_rate": 1.999964369079492e-05, "loss": 0.8135, "step": 14870 }, { "epoch": 17.949909474954737, "grad_norm": 18.131486892700195, "learning_rate": 1.9999643449540126e-05, "loss": 0.8324, "step": 14880 }, { "epoch": 17.96197948098974, "grad_norm": 17.173391342163086, "learning_rate": 1.9999643208285332e-05, "loss": 0.8358, "step": 14890 }, { "epoch": 17.974049487024743, "grad_norm": 18.770038604736328, "learning_rate": 1.999964296703054e-05, "loss": 0.8295, "step": 14900 }, { "epoch": 
17.986119493059746, "grad_norm": 18.025442123413086, "learning_rate": 1.9999642725775745e-05, "loss": 0.8602, "step": 14910 }, { "epoch": 17.99818949909475, "grad_norm": 17.774394989013672, "learning_rate": 1.999964248452095e-05, "loss": 0.8287, "step": 14920 }, { "epoch": 18.009656004828003, "grad_norm": 14.55632495880127, "learning_rate": 1.9999642243266157e-05, "loss": 0.5673, "step": 14930 }, { "epoch": 18.021726010863006, "grad_norm": 14.451423645019531, "learning_rate": 1.9999642002011363e-05, "loss": 0.512, "step": 14940 }, { "epoch": 18.03379601689801, "grad_norm": 16.091299057006836, "learning_rate": 1.999964176075657e-05, "loss": 0.5263, "step": 14950 }, { "epoch": 18.045866022933012, "grad_norm": 14.255278587341309, "learning_rate": 1.999964151950178e-05, "loss": 0.5368, "step": 14960 }, { "epoch": 18.057936028968015, "grad_norm": 15.425003051757812, "learning_rate": 1.9999641278246985e-05, "loss": 0.5294, "step": 14970 }, { "epoch": 18.07000603500302, "grad_norm": 15.150923728942871, "learning_rate": 1.999964103699219e-05, "loss": 0.5283, "step": 14980 }, { "epoch": 18.08207604103802, "grad_norm": 14.986013412475586, "learning_rate": 1.9999640795737394e-05, "loss": 0.526, "step": 14990 }, { "epoch": 18.094146047073025, "grad_norm": 15.610698699951172, "learning_rate": 1.99996405544826e-05, "loss": 0.528, "step": 15000 }, { "epoch": 18.094146047073025, "eval_loss": 9.486255645751953, "eval_runtime": 8.1231, "eval_samples_per_second": 85.805, "eval_steps_per_second": 10.833, "step": 15000 }, { "epoch": 18.106216053108028, "grad_norm": 16.030643463134766, "learning_rate": 1.9999640313227807e-05, "loss": 0.5255, "step": 15010 }, { "epoch": 18.11828605914303, "grad_norm": 15.20389461517334, "learning_rate": 1.9999640071973013e-05, "loss": 0.5285, "step": 15020 }, { "epoch": 18.13035606517803, "grad_norm": 15.67986011505127, "learning_rate": 1.999963983071822e-05, "loss": 0.556, "step": 15030 }, { "epoch": 18.142426071213034, "grad_norm": 15.368643760681152, 
"learning_rate": 1.9999639589463426e-05, "loss": 0.548, "step": 15040 }, { "epoch": 18.154496077248037, "grad_norm": 15.207975387573242, "learning_rate": 1.9999639348208632e-05, "loss": 0.5469, "step": 15050 }, { "epoch": 18.16656608328304, "grad_norm": 16.045578002929688, "learning_rate": 1.9999639106953838e-05, "loss": 0.549, "step": 15060 }, { "epoch": 18.178636089318044, "grad_norm": 15.288904190063477, "learning_rate": 1.9999638865699044e-05, "loss": 0.5387, "step": 15070 }, { "epoch": 18.190706095353047, "grad_norm": 15.267112731933594, "learning_rate": 1.999963862444425e-05, "loss": 0.5389, "step": 15080 }, { "epoch": 18.20277610138805, "grad_norm": 15.33686351776123, "learning_rate": 1.9999638383189457e-05, "loss": 0.5558, "step": 15090 }, { "epoch": 18.214846107423053, "grad_norm": 16.662944793701172, "learning_rate": 1.9999638141934663e-05, "loss": 0.5582, "step": 15100 }, { "epoch": 18.226916113458056, "grad_norm": 16.098674774169922, "learning_rate": 1.999963790067987e-05, "loss": 0.5707, "step": 15110 }, { "epoch": 18.23898611949306, "grad_norm": 15.551654815673828, "learning_rate": 1.9999637659425075e-05, "loss": 0.5548, "step": 15120 }, { "epoch": 18.251056125528063, "grad_norm": 15.436596870422363, "learning_rate": 1.999963741817028e-05, "loss": 0.5616, "step": 15130 }, { "epoch": 18.263126131563066, "grad_norm": 15.24303150177002, "learning_rate": 1.9999637176915488e-05, "loss": 0.573, "step": 15140 }, { "epoch": 18.27519613759807, "grad_norm": 16.579944610595703, "learning_rate": 1.9999636935660694e-05, "loss": 0.5673, "step": 15150 }, { "epoch": 18.287266143633072, "grad_norm": 16.474571228027344, "learning_rate": 1.99996366944059e-05, "loss": 0.5677, "step": 15160 }, { "epoch": 18.299336149668076, "grad_norm": 16.5589656829834, "learning_rate": 1.9999636453151106e-05, "loss": 0.5818, "step": 15170 }, { "epoch": 18.31140615570308, "grad_norm": 15.804871559143066, "learning_rate": 1.9999636211896313e-05, "loss": 0.575, "step": 15180 }, { "epoch": 
18.323476161738082, "grad_norm": 17.124874114990234, "learning_rate": 1.999963597064152e-05, "loss": 0.5883, "step": 15190 }, { "epoch": 18.335546167773085, "grad_norm": 16.93877601623535, "learning_rate": 1.9999635729386725e-05, "loss": 0.5757, "step": 15200 }, { "epoch": 18.34761617380809, "grad_norm": 16.621726989746094, "learning_rate": 1.999963548813193e-05, "loss": 0.5936, "step": 15210 }, { "epoch": 18.35968617984309, "grad_norm": 16.879289627075195, "learning_rate": 1.9999635246877137e-05, "loss": 0.5899, "step": 15220 }, { "epoch": 18.371756185878095, "grad_norm": 16.36094093322754, "learning_rate": 1.9999635005622344e-05, "loss": 0.5836, "step": 15230 }, { "epoch": 18.383826191913094, "grad_norm": 16.200117111206055, "learning_rate": 1.9999634764367546e-05, "loss": 0.5975, "step": 15240 }, { "epoch": 18.395896197948097, "grad_norm": 16.080137252807617, "learning_rate": 1.9999634523112753e-05, "loss": 0.592, "step": 15250 }, { "epoch": 18.4079662039831, "grad_norm": 17.791011810302734, "learning_rate": 1.999963428185796e-05, "loss": 0.592, "step": 15260 }, { "epoch": 18.420036210018104, "grad_norm": 15.711163520812988, "learning_rate": 1.9999634040603165e-05, "loss": 0.595, "step": 15270 }, { "epoch": 18.432106216053107, "grad_norm": 17.590171813964844, "learning_rate": 1.999963379934837e-05, "loss": 0.5943, "step": 15280 }, { "epoch": 18.44417622208811, "grad_norm": 17.49735450744629, "learning_rate": 1.9999633558093578e-05, "loss": 0.5981, "step": 15290 }, { "epoch": 18.456246228123113, "grad_norm": 17.878232955932617, "learning_rate": 1.9999633316838784e-05, "loss": 0.6015, "step": 15300 }, { "epoch": 18.468316234158117, "grad_norm": 16.378433227539062, "learning_rate": 1.999963307558399e-05, "loss": 0.599, "step": 15310 }, { "epoch": 18.48038624019312, "grad_norm": 16.039731979370117, "learning_rate": 1.9999632834329196e-05, "loss": 0.6068, "step": 15320 }, { "epoch": 18.492456246228123, "grad_norm": 17.666370391845703, "learning_rate": 
1.9999632593074402e-05, "loss": 0.6026, "step": 15330 }, { "epoch": 18.504526252263126, "grad_norm": 18.019081115722656, "learning_rate": 1.999963235181961e-05, "loss": 0.6022, "step": 15340 }, { "epoch": 18.51659625829813, "grad_norm": 16.745664596557617, "learning_rate": 1.9999632110564815e-05, "loss": 0.6061, "step": 15350 }, { "epoch": 18.528666264333133, "grad_norm": 17.119718551635742, "learning_rate": 1.999963186931002e-05, "loss": 0.6104, "step": 15360 }, { "epoch": 18.540736270368136, "grad_norm": 17.208303451538086, "learning_rate": 1.9999631628055227e-05, "loss": 0.611, "step": 15370 }, { "epoch": 18.55280627640314, "grad_norm": 16.531766891479492, "learning_rate": 1.9999631386800433e-05, "loss": 0.6262, "step": 15380 }, { "epoch": 18.564876282438142, "grad_norm": 18.276487350463867, "learning_rate": 1.999963114554564e-05, "loss": 0.6243, "step": 15390 }, { "epoch": 18.576946288473145, "grad_norm": 16.355140686035156, "learning_rate": 1.9999630904290846e-05, "loss": 0.627, "step": 15400 }, { "epoch": 18.58901629450815, "grad_norm": 16.673871994018555, "learning_rate": 1.9999630663036052e-05, "loss": 0.6178, "step": 15410 }, { "epoch": 18.60108630054315, "grad_norm": 17.507278442382812, "learning_rate": 1.999963042178126e-05, "loss": 0.6314, "step": 15420 }, { "epoch": 18.613156306578155, "grad_norm": 17.25235939025879, "learning_rate": 1.9999630180526465e-05, "loss": 0.6158, "step": 15430 }, { "epoch": 18.625226312613158, "grad_norm": 17.29900360107422, "learning_rate": 1.999962993927167e-05, "loss": 0.6153, "step": 15440 }, { "epoch": 18.637296318648158, "grad_norm": 17.15621566772461, "learning_rate": 1.9999629698016877e-05, "loss": 0.6289, "step": 15450 }, { "epoch": 18.64936632468316, "grad_norm": 17.063316345214844, "learning_rate": 1.9999629456762083e-05, "loss": 0.6287, "step": 15460 }, { "epoch": 18.661436330718164, "grad_norm": 16.600431442260742, "learning_rate": 1.999962921550729e-05, "loss": 0.6239, "step": 15470 }, { "epoch": 
18.673506336753167, "grad_norm": 16.366914749145508, "learning_rate": 1.9999628974252496e-05, "loss": 0.6344, "step": 15480 }, { "epoch": 18.68557634278817, "grad_norm": 17.14866828918457, "learning_rate": 1.99996287329977e-05, "loss": 0.6243, "step": 15490 }, { "epoch": 18.697646348823174, "grad_norm": 16.71204376220703, "learning_rate": 1.9999628491742908e-05, "loss": 0.6366, "step": 15500 }, { "epoch": 18.697646348823174, "eval_loss": 9.638611793518066, "eval_runtime": 8.1329, "eval_samples_per_second": 85.701, "eval_steps_per_second": 10.82, "step": 15500 }, { "epoch": 18.709716354858177, "grad_norm": 16.21298599243164, "learning_rate": 1.9999628250488114e-05, "loss": 0.6254, "step": 15510 }, { "epoch": 18.72178636089318, "grad_norm": 17.409149169921875, "learning_rate": 1.999962800923332e-05, "loss": 0.6395, "step": 15520 }, { "epoch": 18.733856366928183, "grad_norm": 17.351144790649414, "learning_rate": 1.9999627767978527e-05, "loss": 0.6246, "step": 15530 }, { "epoch": 18.745926372963186, "grad_norm": 16.853511810302734, "learning_rate": 1.9999627526723733e-05, "loss": 0.651, "step": 15540 }, { "epoch": 18.75799637899819, "grad_norm": 17.679428100585938, "learning_rate": 1.999962728546894e-05, "loss": 0.6269, "step": 15550 }, { "epoch": 18.770066385033193, "grad_norm": 18.423263549804688, "learning_rate": 1.9999627044214145e-05, "loss": 0.6343, "step": 15560 }, { "epoch": 18.782136391068196, "grad_norm": 18.157976150512695, "learning_rate": 1.999962680295935e-05, "loss": 0.6432, "step": 15570 }, { "epoch": 18.7942063971032, "grad_norm": 18.3277530670166, "learning_rate": 1.9999626561704558e-05, "loss": 0.6451, "step": 15580 }, { "epoch": 18.806276403138202, "grad_norm": 16.88482093811035, "learning_rate": 1.9999626320449764e-05, "loss": 0.6525, "step": 15590 }, { "epoch": 18.818346409173206, "grad_norm": 17.9451847076416, "learning_rate": 1.999962607919497e-05, "loss": 0.6561, "step": 15600 }, { "epoch": 18.83041641520821, "grad_norm": 17.25674819946289, 
"learning_rate": 1.9999625837940176e-05, "loss": 0.6522, "step": 15610 }, { "epoch": 18.842486421243212, "grad_norm": 18.205944061279297, "learning_rate": 1.9999625596685383e-05, "loss": 0.6581, "step": 15620 }, { "epoch": 18.854556427278215, "grad_norm": 18.154783248901367, "learning_rate": 1.999962535543059e-05, "loss": 0.6491, "step": 15630 }, { "epoch": 18.86662643331322, "grad_norm": 16.937602996826172, "learning_rate": 1.9999625114175795e-05, "loss": 0.6449, "step": 15640 }, { "epoch": 18.878696439348218, "grad_norm": 16.546588897705078, "learning_rate": 1.9999624872921e-05, "loss": 0.6583, "step": 15650 }, { "epoch": 18.89076644538322, "grad_norm": 16.83953094482422, "learning_rate": 1.9999624631666204e-05, "loss": 0.666, "step": 15660 }, { "epoch": 18.902836451418224, "grad_norm": 17.00275421142578, "learning_rate": 1.999962439041141e-05, "loss": 0.6499, "step": 15670 }, { "epoch": 18.914906457453228, "grad_norm": 18.355648040771484, "learning_rate": 1.9999624149156617e-05, "loss": 0.6621, "step": 15680 }, { "epoch": 18.92697646348823, "grad_norm": 16.83043098449707, "learning_rate": 1.9999623907901823e-05, "loss": 0.6722, "step": 15690 }, { "epoch": 18.939046469523234, "grad_norm": 18.302825927734375, "learning_rate": 1.999962366664703e-05, "loss": 0.6558, "step": 15700 }, { "epoch": 18.951116475558237, "grad_norm": 17.02328109741211, "learning_rate": 1.9999623425392235e-05, "loss": 0.6715, "step": 15710 }, { "epoch": 18.96318648159324, "grad_norm": 17.435935974121094, "learning_rate": 1.999962318413744e-05, "loss": 0.6749, "step": 15720 }, { "epoch": 18.975256487628243, "grad_norm": 17.74407196044922, "learning_rate": 1.9999622942882648e-05, "loss": 0.685, "step": 15730 }, { "epoch": 18.987326493663247, "grad_norm": 17.621095657348633, "learning_rate": 1.9999622701627854e-05, "loss": 0.6728, "step": 15740 }, { "epoch": 18.99939649969825, "grad_norm": 18.257768630981445, "learning_rate": 1.999962246037306e-05, "loss": 0.661, "step": 15750 }, { "epoch": 
19.010863005431503, "grad_norm": 15.328895568847656, "learning_rate": 1.9999622219118266e-05, "loss": 0.4333, "step": 15760 }, { "epoch": 19.022933011466506, "grad_norm": 13.982439041137695, "learning_rate": 1.9999621977863472e-05, "loss": 0.4116, "step": 15770 }, { "epoch": 19.03500301750151, "grad_norm": 14.744349479675293, "learning_rate": 1.999962173660868e-05, "loss": 0.4233, "step": 15780 }, { "epoch": 19.047073023536512, "grad_norm": 14.370079040527344, "learning_rate": 1.9999621495353885e-05, "loss": 0.42, "step": 15790 }, { "epoch": 19.059143029571516, "grad_norm": 15.348397254943848, "learning_rate": 1.999962125409909e-05, "loss": 0.4153, "step": 15800 }, { "epoch": 19.07121303560652, "grad_norm": 15.485755920410156, "learning_rate": 1.9999621012844297e-05, "loss": 0.4251, "step": 15810 }, { "epoch": 19.083283041641522, "grad_norm": 14.553577423095703, "learning_rate": 1.9999620771589504e-05, "loss": 0.4305, "step": 15820 }, { "epoch": 19.095353047676525, "grad_norm": 14.363043785095215, "learning_rate": 1.999962053033471e-05, "loss": 0.4323, "step": 15830 }, { "epoch": 19.10742305371153, "grad_norm": 15.440790176391602, "learning_rate": 1.9999620289079916e-05, "loss": 0.4304, "step": 15840 }, { "epoch": 19.11949305974653, "grad_norm": 14.610817909240723, "learning_rate": 1.9999620047825122e-05, "loss": 0.4306, "step": 15850 }, { "epoch": 19.13156306578153, "grad_norm": 15.581945419311523, "learning_rate": 1.999961980657033e-05, "loss": 0.4393, "step": 15860 }, { "epoch": 19.143633071816534, "grad_norm": 15.316642761230469, "learning_rate": 1.9999619565315535e-05, "loss": 0.4372, "step": 15870 }, { "epoch": 19.155703077851538, "grad_norm": 14.809037208557129, "learning_rate": 1.999961932406074e-05, "loss": 0.441, "step": 15880 }, { "epoch": 19.16777308388654, "grad_norm": 13.815247535705566, "learning_rate": 1.9999619082805947e-05, "loss": 0.4495, "step": 15890 }, { "epoch": 19.179843089921544, "grad_norm": 16.354366302490234, "learning_rate": 
1.9999618841551153e-05, "loss": 0.4487, "step": 15900 }, { "epoch": 19.191913095956547, "grad_norm": 15.127140045166016, "learning_rate": 1.9999618600296356e-05, "loss": 0.4461, "step": 15910 }, { "epoch": 19.20398310199155, "grad_norm": 15.803045272827148, "learning_rate": 1.9999618359041562e-05, "loss": 0.4544, "step": 15920 }, { "epoch": 19.216053108026554, "grad_norm": 15.596929550170898, "learning_rate": 1.999961811778677e-05, "loss": 0.4521, "step": 15930 }, { "epoch": 19.228123114061557, "grad_norm": 14.764232635498047, "learning_rate": 1.9999617876531975e-05, "loss": 0.4535, "step": 15940 }, { "epoch": 19.24019312009656, "grad_norm": 15.202631950378418, "learning_rate": 1.999961763527718e-05, "loss": 0.4557, "step": 15950 }, { "epoch": 19.252263126131563, "grad_norm": 16.123451232910156, "learning_rate": 1.9999617394022387e-05, "loss": 0.4617, "step": 15960 }, { "epoch": 19.264333132166566, "grad_norm": 15.62359619140625, "learning_rate": 1.9999617152767593e-05, "loss": 0.4627, "step": 15970 }, { "epoch": 19.27640313820157, "grad_norm": 15.166984558105469, "learning_rate": 1.99996169115128e-05, "loss": 0.4654, "step": 15980 }, { "epoch": 19.288473144236573, "grad_norm": 15.412948608398438, "learning_rate": 1.9999616670258006e-05, "loss": 0.4692, "step": 15990 }, { "epoch": 19.300543150271576, "grad_norm": 15.546163558959961, "learning_rate": 1.9999616429003212e-05, "loss": 0.4636, "step": 16000 }, { "epoch": 19.300543150271576, "eval_loss": 9.819067001342773, "eval_runtime": 8.1347, "eval_samples_per_second": 85.683, "eval_steps_per_second": 10.818, "step": 16000 }, { "epoch": 19.31261315630658, "grad_norm": 15.538877487182617, "learning_rate": 1.9999616187748418e-05, "loss": 0.4585, "step": 16010 }, { "epoch": 19.324683162341582, "grad_norm": 14.804139137268066, "learning_rate": 1.9999615946493624e-05, "loss": 0.4712, "step": 16020 }, { "epoch": 19.336753168376585, "grad_norm": 15.977506637573242, "learning_rate": 1.999961570523883e-05, "loss": 0.4702, 
"step": 16030 }, { "epoch": 19.34882317441159, "grad_norm": 15.108559608459473, "learning_rate": 1.999961546398404e-05, "loss": 0.4877, "step": 16040 }, { "epoch": 19.360893180446592, "grad_norm": 16.361665725708008, "learning_rate": 1.9999615222729247e-05, "loss": 0.4837, "step": 16050 }, { "epoch": 19.372963186481595, "grad_norm": 15.429830551147461, "learning_rate": 1.9999614981474453e-05, "loss": 0.4698, "step": 16060 }, { "epoch": 19.385033192516595, "grad_norm": 16.672494888305664, "learning_rate": 1.9999614740219656e-05, "loss": 0.4873, "step": 16070 }, { "epoch": 19.397103198551598, "grad_norm": 15.736530303955078, "learning_rate": 1.9999614498964862e-05, "loss": 0.4799, "step": 16080 }, { "epoch": 19.4091732045866, "grad_norm": 15.710693359375, "learning_rate": 1.9999614257710068e-05, "loss": 0.4804, "step": 16090 }, { "epoch": 19.421243210621604, "grad_norm": 15.295243263244629, "learning_rate": 1.9999614016455274e-05, "loss": 0.4781, "step": 16100 }, { "epoch": 19.433313216656607, "grad_norm": 17.207490921020508, "learning_rate": 1.999961377520048e-05, "loss": 0.4735, "step": 16110 }, { "epoch": 19.44538322269161, "grad_norm": 15.327972412109375, "learning_rate": 1.9999613533945687e-05, "loss": 0.4904, "step": 16120 }, { "epoch": 19.457453228726614, "grad_norm": 16.55533790588379, "learning_rate": 1.9999613292690893e-05, "loss": 0.4757, "step": 16130 }, { "epoch": 19.469523234761617, "grad_norm": 16.17594337463379, "learning_rate": 1.99996130514361e-05, "loss": 0.4876, "step": 16140 }, { "epoch": 19.48159324079662, "grad_norm": 16.522741317749023, "learning_rate": 1.9999612810181305e-05, "loss": 0.4962, "step": 16150 }, { "epoch": 19.493663246831623, "grad_norm": 16.253026962280273, "learning_rate": 1.999961256892651e-05, "loss": 0.4953, "step": 16160 }, { "epoch": 19.505733252866627, "grad_norm": 17.221467971801758, "learning_rate": 1.9999612327671718e-05, "loss": 0.4922, "step": 16170 }, { "epoch": 19.51780325890163, "grad_norm": 16.148040771484375, 
"learning_rate": 1.9999612086416924e-05, "loss": 0.5033, "step": 16180 }, { "epoch": 19.529873264936633, "grad_norm": 16.09086036682129, "learning_rate": 1.999961184516213e-05, "loss": 0.504, "step": 16190 }, { "epoch": 19.541943270971636, "grad_norm": 18.690950393676758, "learning_rate": 1.9999611603907336e-05, "loss": 0.5145, "step": 16200 }, { "epoch": 19.55401327700664, "grad_norm": 15.469786643981934, "learning_rate": 1.9999611362652543e-05, "loss": 0.5031, "step": 16210 }, { "epoch": 19.566083283041642, "grad_norm": 16.295499801635742, "learning_rate": 1.999961112139775e-05, "loss": 0.5049, "step": 16220 }, { "epoch": 19.578153289076646, "grad_norm": 15.244696617126465, "learning_rate": 1.9999610880142955e-05, "loss": 0.4949, "step": 16230 }, { "epoch": 19.59022329511165, "grad_norm": 16.314964294433594, "learning_rate": 1.999961063888816e-05, "loss": 0.5218, "step": 16240 }, { "epoch": 19.602293301146652, "grad_norm": 15.796189308166504, "learning_rate": 1.9999610397633367e-05, "loss": 0.4972, "step": 16250 }, { "epoch": 19.614363307181655, "grad_norm": 15.730233192443848, "learning_rate": 1.9999610156378574e-05, "loss": 0.5139, "step": 16260 }, { "epoch": 19.62643331321666, "grad_norm": 15.30257797241211, "learning_rate": 1.999960991512378e-05, "loss": 0.5092, "step": 16270 }, { "epoch": 19.638503319251658, "grad_norm": 15.71346378326416, "learning_rate": 1.9999609673868986e-05, "loss": 0.523, "step": 16280 }, { "epoch": 19.65057332528666, "grad_norm": 16.326555252075195, "learning_rate": 1.9999609432614192e-05, "loss": 0.5183, "step": 16290 }, { "epoch": 19.662643331321664, "grad_norm": 15.183685302734375, "learning_rate": 1.99996091913594e-05, "loss": 0.5095, "step": 16300 }, { "epoch": 19.674713337356668, "grad_norm": 17.306015014648438, "learning_rate": 1.9999608950104605e-05, "loss": 0.5172, "step": 16310 }, { "epoch": 19.68678334339167, "grad_norm": 15.559868812561035, "learning_rate": 1.9999608708849808e-05, "loss": 0.5179, "step": 16320 }, { 
"epoch": 19.698853349426674, "grad_norm": 16.584325790405273, "learning_rate": 1.9999608467595014e-05, "loss": 0.5162, "step": 16330 }, { "epoch": 19.710923355461677, "grad_norm": 17.2943172454834, "learning_rate": 1.999960822634022e-05, "loss": 0.5299, "step": 16340 }, { "epoch": 19.72299336149668, "grad_norm": 16.42589569091797, "learning_rate": 1.9999607985085426e-05, "loss": 0.5111, "step": 16350 }, { "epoch": 19.735063367531684, "grad_norm": 16.54888153076172, "learning_rate": 1.9999607743830632e-05, "loss": 0.5237, "step": 16360 }, { "epoch": 19.747133373566687, "grad_norm": 16.210325241088867, "learning_rate": 1.999960750257584e-05, "loss": 0.5192, "step": 16370 }, { "epoch": 19.75920337960169, "grad_norm": 15.95534610748291, "learning_rate": 1.9999607261321045e-05, "loss": 0.5212, "step": 16380 }, { "epoch": 19.771273385636693, "grad_norm": 16.8835391998291, "learning_rate": 1.999960702006625e-05, "loss": 0.5356, "step": 16390 }, { "epoch": 19.783343391671696, "grad_norm": 17.422163009643555, "learning_rate": 1.9999606778811457e-05, "loss": 0.5268, "step": 16400 }, { "epoch": 19.7954133977067, "grad_norm": 17.036808013916016, "learning_rate": 1.9999606537556664e-05, "loss": 0.5193, "step": 16410 }, { "epoch": 19.807483403741703, "grad_norm": 16.67237663269043, "learning_rate": 1.999960629630187e-05, "loss": 0.5327, "step": 16420 }, { "epoch": 19.819553409776706, "grad_norm": 17.427715301513672, "learning_rate": 1.9999606055047076e-05, "loss": 0.5253, "step": 16430 }, { "epoch": 19.83162341581171, "grad_norm": 17.5157413482666, "learning_rate": 1.9999605813792282e-05, "loss": 0.5367, "step": 16440 }, { "epoch": 19.843693421846712, "grad_norm": 16.565420150756836, "learning_rate": 1.999960557253749e-05, "loss": 0.5312, "step": 16450 }, { "epoch": 19.855763427881715, "grad_norm": 16.4698486328125, "learning_rate": 1.9999605331282695e-05, "loss": 0.533, "step": 16460 }, { "epoch": 19.86783343391672, "grad_norm": 17.659940719604492, "learning_rate": 
1.99996050900279e-05, "loss": 0.5396, "step": 16470 }, { "epoch": 19.87990343995172, "grad_norm": 17.389772415161133, "learning_rate": 1.9999604848773107e-05, "loss": 0.5429, "step": 16480 }, { "epoch": 19.89197344598672, "grad_norm": 17.265621185302734, "learning_rate": 1.9999604607518313e-05, "loss": 0.538, "step": 16490 }, { "epoch": 19.904043452021725, "grad_norm": 17.27803611755371, "learning_rate": 1.999960436626352e-05, "loss": 0.5374, "step": 16500 }, { "epoch": 19.904043452021725, "eval_loss": 9.918048858642578, "eval_runtime": 8.1249, "eval_samples_per_second": 85.785, "eval_steps_per_second": 10.831, "step": 16500 }, { "epoch": 19.916113458056728, "grad_norm": 17.06930160522461, "learning_rate": 1.9999604125008726e-05, "loss": 0.5414, "step": 16510 }, { "epoch": 19.92818346409173, "grad_norm": 16.420366287231445, "learning_rate": 1.9999603883753932e-05, "loss": 0.5381, "step": 16520 }, { "epoch": 19.940253470126734, "grad_norm": 16.787982940673828, "learning_rate": 1.9999603642499138e-05, "loss": 0.5572, "step": 16530 }, { "epoch": 19.952323476161737, "grad_norm": 16.3350830078125, "learning_rate": 1.9999603401244344e-05, "loss": 0.5507, "step": 16540 }, { "epoch": 19.96439348219674, "grad_norm": 17.181777954101562, "learning_rate": 1.999960315998955e-05, "loss": 0.5467, "step": 16550 }, { "epoch": 19.976463488231744, "grad_norm": 16.375267028808594, "learning_rate": 1.9999602918734757e-05, "loss": 0.557, "step": 16560 }, { "epoch": 19.988533494266747, "grad_norm": 18.17564582824707, "learning_rate": 1.999960267747996e-05, "loss": 0.5447, "step": 16570 }, { "epoch": 20.0, "grad_norm": 28.652896881103516, "learning_rate": 1.999960243622517e-05, "loss": 0.5565, "step": 16580 }, { "epoch": 20.012070006035003, "grad_norm": 13.903209686279297, "learning_rate": 1.9999602194970375e-05, "loss": 0.3385, "step": 16590 }, { "epoch": 20.024140012070006, "grad_norm": 14.802945137023926, "learning_rate": 1.999960195371558e-05, "loss": 0.3438, "step": 16600 }, { 
"epoch": 20.03621001810501, "grad_norm": 14.167360305786133, "learning_rate": 1.9999601712460788e-05, "loss": 0.3442, "step": 16610 }, { "epoch": 20.048280024140013, "grad_norm": 13.603432655334473, "learning_rate": 1.9999601471205994e-05, "loss": 0.3461, "step": 16620 }, { "epoch": 20.060350030175016, "grad_norm": 13.411169052124023, "learning_rate": 1.99996012299512e-05, "loss": 0.3488, "step": 16630 }, { "epoch": 20.07242003621002, "grad_norm": 13.863439559936523, "learning_rate": 1.9999600988696406e-05, "loss": 0.353, "step": 16640 }, { "epoch": 20.084490042245022, "grad_norm": 13.371684074401855, "learning_rate": 1.9999600747441613e-05, "loss": 0.3524, "step": 16650 }, { "epoch": 20.096560048280026, "grad_norm": 14.62384033203125, "learning_rate": 1.999960050618682e-05, "loss": 0.3716, "step": 16660 }, { "epoch": 20.10863005431503, "grad_norm": 13.20703411102295, "learning_rate": 1.9999600264932025e-05, "loss": 0.357, "step": 16670 }, { "epoch": 20.120700060350032, "grad_norm": 14.875126838684082, "learning_rate": 1.999960002367723e-05, "loss": 0.3648, "step": 16680 }, { "epoch": 20.13277006638503, "grad_norm": 14.177694320678711, "learning_rate": 1.9999599782422438e-05, "loss": 0.3655, "step": 16690 }, { "epoch": 20.144840072420035, "grad_norm": 15.681522369384766, "learning_rate": 1.9999599541167644e-05, "loss": 0.3651, "step": 16700 }, { "epoch": 20.156910078455038, "grad_norm": 13.61447811126709, "learning_rate": 1.999959929991285e-05, "loss": 0.3724, "step": 16710 }, { "epoch": 20.16898008449004, "grad_norm": 13.619489669799805, "learning_rate": 1.9999599058658056e-05, "loss": 0.3719, "step": 16720 }, { "epoch": 20.181050090525044, "grad_norm": 14.292237281799316, "learning_rate": 1.9999598817403262e-05, "loss": 0.3608, "step": 16730 }, { "epoch": 20.193120096560047, "grad_norm": 13.7140474319458, "learning_rate": 1.9999598576148465e-05, "loss": 0.3591, "step": 16740 }, { "epoch": 20.20519010259505, "grad_norm": 15.468637466430664, "learning_rate": 
1.999959833489367e-05, "loss": 0.3845, "step": 16750 }, { "epoch": 20.217260108630054, "grad_norm": 15.187677383422852, "learning_rate": 1.9999598093638878e-05, "loss": 0.3833, "step": 16760 }, { "epoch": 20.229330114665057, "grad_norm": 15.911016464233398, "learning_rate": 1.9999597852384084e-05, "loss": 0.3881, "step": 16770 }, { "epoch": 20.24140012070006, "grad_norm": 14.159621238708496, "learning_rate": 1.999959761112929e-05, "loss": 0.3831, "step": 16780 }, { "epoch": 20.253470126735063, "grad_norm": 14.483068466186523, "learning_rate": 1.9999597369874496e-05, "loss": 0.3864, "step": 16790 }, { "epoch": 20.265540132770067, "grad_norm": 14.151849746704102, "learning_rate": 1.9999597128619703e-05, "loss": 0.3872, "step": 16800 }, { "epoch": 20.27761013880507, "grad_norm": 14.1467866897583, "learning_rate": 1.999959688736491e-05, "loss": 0.3984, "step": 16810 }, { "epoch": 20.289680144840073, "grad_norm": 14.85405445098877, "learning_rate": 1.9999596646110115e-05, "loss": 0.3761, "step": 16820 }, { "epoch": 20.301750150875076, "grad_norm": 15.44154167175293, "learning_rate": 1.999959640485532e-05, "loss": 0.3994, "step": 16830 }, { "epoch": 20.31382015691008, "grad_norm": 16.189254760742188, "learning_rate": 1.9999596163600527e-05, "loss": 0.3893, "step": 16840 }, { "epoch": 20.325890162945083, "grad_norm": 14.550188064575195, "learning_rate": 1.9999595922345734e-05, "loss": 0.3955, "step": 16850 }, { "epoch": 20.337960168980086, "grad_norm": 14.535367965698242, "learning_rate": 1.999959568109094e-05, "loss": 0.3984, "step": 16860 }, { "epoch": 20.35003017501509, "grad_norm": 14.562722206115723, "learning_rate": 1.9999595439836146e-05, "loss": 0.4039, "step": 16870 }, { "epoch": 20.362100181050092, "grad_norm": 15.284385681152344, "learning_rate": 1.9999595198581352e-05, "loss": 0.4106, "step": 16880 }, { "epoch": 20.37417018708509, "grad_norm": 15.09070873260498, "learning_rate": 1.999959495732656e-05, "loss": 0.3951, "step": 16890 }, { "epoch": 
20.386240193120095, "grad_norm": 14.7792329788208, "learning_rate": 1.9999594716071765e-05, "loss": 0.4042, "step": 16900 }, { "epoch": 20.398310199155098, "grad_norm": 13.890677452087402, "learning_rate": 1.999959447481697e-05, "loss": 0.4023, "step": 16910 }, { "epoch": 20.4103802051901, "grad_norm": 13.983966827392578, "learning_rate": 1.9999594233562177e-05, "loss": 0.3906, "step": 16920 }, { "epoch": 20.422450211225105, "grad_norm": 15.314216613769531, "learning_rate": 1.9999593992307383e-05, "loss": 0.3984, "step": 16930 }, { "epoch": 20.434520217260108, "grad_norm": 15.40633487701416, "learning_rate": 1.999959375105259e-05, "loss": 0.4092, "step": 16940 }, { "epoch": 20.44659022329511, "grad_norm": 15.186367988586426, "learning_rate": 1.9999593509797796e-05, "loss": 0.4122, "step": 16950 }, { "epoch": 20.458660229330114, "grad_norm": 16.11627197265625, "learning_rate": 1.9999593268543002e-05, "loss": 0.4054, "step": 16960 }, { "epoch": 20.470730235365117, "grad_norm": 14.729860305786133, "learning_rate": 1.9999593027288208e-05, "loss": 0.4082, "step": 16970 }, { "epoch": 20.48280024140012, "grad_norm": 15.35232925415039, "learning_rate": 1.9999592786033414e-05, "loss": 0.4124, "step": 16980 }, { "epoch": 20.494870247435124, "grad_norm": 15.6321382522583, "learning_rate": 1.9999592544778617e-05, "loss": 0.4108, "step": 16990 }, { "epoch": 20.506940253470127, "grad_norm": 16.001558303833008, "learning_rate": 1.9999592303523823e-05, "loss": 0.4094, "step": 17000 }, { "epoch": 20.506940253470127, "eval_loss": 10.067206382751465, "eval_runtime": 8.1272, "eval_samples_per_second": 85.762, "eval_steps_per_second": 10.828, "step": 17000 }, { "epoch": 20.51901025950513, "grad_norm": 16.834447860717773, "learning_rate": 1.999959206226903e-05, "loss": 0.4056, "step": 17010 }, { "epoch": 20.531080265540133, "grad_norm": 14.93362808227539, "learning_rate": 1.9999591821014236e-05, "loss": 0.4237, "step": 17020 }, { "epoch": 20.543150271575136, "grad_norm": 
14.990288734436035, "learning_rate": 1.9999591579759442e-05, "loss": 0.419, "step": 17030 }, { "epoch": 20.55522027761014, "grad_norm": 15.094441413879395, "learning_rate": 1.9999591338504648e-05, "loss": 0.4172, "step": 17040 }, { "epoch": 20.567290283645143, "grad_norm": 14.635119438171387, "learning_rate": 1.9999591097249855e-05, "loss": 0.4196, "step": 17050 }, { "epoch": 20.579360289680146, "grad_norm": 14.942756652832031, "learning_rate": 1.999959085599506e-05, "loss": 0.4214, "step": 17060 }, { "epoch": 20.59143029571515, "grad_norm": 14.854506492614746, "learning_rate": 1.9999590614740267e-05, "loss": 0.423, "step": 17070 }, { "epoch": 20.603500301750152, "grad_norm": 15.111893653869629, "learning_rate": 1.9999590373485473e-05, "loss": 0.4238, "step": 17080 }, { "epoch": 20.615570307785156, "grad_norm": 15.692937850952148, "learning_rate": 1.999959013223068e-05, "loss": 0.431, "step": 17090 }, { "epoch": 20.62764031382016, "grad_norm": 16.162752151489258, "learning_rate": 1.9999589890975886e-05, "loss": 0.4219, "step": 17100 }, { "epoch": 20.63971031985516, "grad_norm": 14.893609046936035, "learning_rate": 1.9999589649721092e-05, "loss": 0.4226, "step": 17110 }, { "epoch": 20.65178032589016, "grad_norm": 15.520092964172363, "learning_rate": 1.99995894084663e-05, "loss": 0.439, "step": 17120 }, { "epoch": 20.663850331925165, "grad_norm": 15.229547500610352, "learning_rate": 1.9999589167211508e-05, "loss": 0.4357, "step": 17130 }, { "epoch": 20.675920337960168, "grad_norm": 16.029531478881836, "learning_rate": 1.9999588925956714e-05, "loss": 0.4297, "step": 17140 }, { "epoch": 20.68799034399517, "grad_norm": 15.888076782226562, "learning_rate": 1.9999588684701917e-05, "loss": 0.4483, "step": 17150 }, { "epoch": 20.700060350030174, "grad_norm": 16.269519805908203, "learning_rate": 1.9999588443447123e-05, "loss": 0.4378, "step": 17160 }, { "epoch": 20.712130356065177, "grad_norm": 14.927374839782715, "learning_rate": 1.999958820219233e-05, "loss": 0.4435, 
"step": 17170 }, { "epoch": 20.72420036210018, "grad_norm": 15.027469635009766, "learning_rate": 1.9999587960937535e-05, "loss": 0.4394, "step": 17180 }, { "epoch": 20.736270368135184, "grad_norm": 15.437460899353027, "learning_rate": 1.999958771968274e-05, "loss": 0.4371, "step": 17190 }, { "epoch": 20.748340374170187, "grad_norm": 15.665655136108398, "learning_rate": 1.9999587478427948e-05, "loss": 0.4413, "step": 17200 }, { "epoch": 20.76041038020519, "grad_norm": 16.974712371826172, "learning_rate": 1.9999587237173154e-05, "loss": 0.4494, "step": 17210 }, { "epoch": 20.772480386240193, "grad_norm": 16.54533576965332, "learning_rate": 1.999958699591836e-05, "loss": 0.438, "step": 17220 }, { "epoch": 20.784550392275197, "grad_norm": 14.4326171875, "learning_rate": 1.9999586754663566e-05, "loss": 0.4388, "step": 17230 }, { "epoch": 20.7966203983102, "grad_norm": 14.75368881225586, "learning_rate": 1.9999586513408773e-05, "loss": 0.4374, "step": 17240 }, { "epoch": 20.808690404345203, "grad_norm": 14.789094924926758, "learning_rate": 1.999958627215398e-05, "loss": 0.4408, "step": 17250 }, { "epoch": 20.820760410380206, "grad_norm": 16.56182289123535, "learning_rate": 1.9999586030899185e-05, "loss": 0.4468, "step": 17260 }, { "epoch": 20.83283041641521, "grad_norm": 15.171770095825195, "learning_rate": 1.999958578964439e-05, "loss": 0.4555, "step": 17270 }, { "epoch": 20.844900422450213, "grad_norm": 16.252527236938477, "learning_rate": 1.9999585548389597e-05, "loss": 0.4325, "step": 17280 }, { "epoch": 20.856970428485216, "grad_norm": 15.100207328796387, "learning_rate": 1.9999585307134804e-05, "loss": 0.4633, "step": 17290 }, { "epoch": 20.86904043452022, "grad_norm": 16.372690200805664, "learning_rate": 1.999958506588001e-05, "loss": 0.4505, "step": 17300 }, { "epoch": 20.88111044055522, "grad_norm": 15.841266632080078, "learning_rate": 1.9999584824625216e-05, "loss": 0.4577, "step": 17310 }, { "epoch": 20.893180446590222, "grad_norm": 16.063968658447266, 
"learning_rate": 1.9999584583370422e-05, "loss": 0.4564, "step": 17320 }, { "epoch": 20.905250452625225, "grad_norm": 15.632071495056152, "learning_rate": 1.999958434211563e-05, "loss": 0.4527, "step": 17330 }, { "epoch": 20.917320458660228, "grad_norm": 15.412327766418457, "learning_rate": 1.9999584100860835e-05, "loss": 0.4597, "step": 17340 }, { "epoch": 20.92939046469523, "grad_norm": 16.7193603515625, "learning_rate": 1.999958385960604e-05, "loss": 0.4548, "step": 17350 }, { "epoch": 20.941460470730235, "grad_norm": 16.37327003479004, "learning_rate": 1.9999583618351247e-05, "loss": 0.4552, "step": 17360 }, { "epoch": 20.953530476765238, "grad_norm": 15.440481185913086, "learning_rate": 1.9999583377096453e-05, "loss": 0.4578, "step": 17370 }, { "epoch": 20.96560048280024, "grad_norm": 15.52712631225586, "learning_rate": 1.999958313584166e-05, "loss": 0.4508, "step": 17380 }, { "epoch": 20.977670488835244, "grad_norm": 17.1241512298584, "learning_rate": 1.9999582894586866e-05, "loss": 0.4596, "step": 17390 }, { "epoch": 20.989740494870247, "grad_norm": 15.490154266357422, "learning_rate": 1.999958265333207e-05, "loss": 0.4668, "step": 17400 }, { "epoch": 21.0012070006035, "grad_norm": 13.083903312683105, "learning_rate": 1.9999582412077275e-05, "loss": 0.4422, "step": 17410 }, { "epoch": 21.013277006638504, "grad_norm": 13.060623168945312, "learning_rate": 1.999958217082248e-05, "loss": 0.2854, "step": 17420 }, { "epoch": 21.025347012673507, "grad_norm": 12.578997611999512, "learning_rate": 1.9999581929567687e-05, "loss": 0.2934, "step": 17430 }, { "epoch": 21.03741701870851, "grad_norm": 13.029574394226074, "learning_rate": 1.9999581688312894e-05, "loss": 0.2948, "step": 17440 }, { "epoch": 21.049487024743513, "grad_norm": 13.573758125305176, "learning_rate": 1.99995814470581e-05, "loss": 0.2931, "step": 17450 }, { "epoch": 21.061557030778516, "grad_norm": 12.427323341369629, "learning_rate": 1.9999581205803306e-05, "loss": 0.3105, "step": 17460 }, { "epoch": 
21.07362703681352, "grad_norm": 13.212836265563965, "learning_rate": 1.9999580964548512e-05, "loss": 0.3055, "step": 17470 }, { "epoch": 21.085697042848523, "grad_norm": 14.497152328491211, "learning_rate": 1.999958072329372e-05, "loss": 0.3125, "step": 17480 }, { "epoch": 21.097767048883526, "grad_norm": 12.901823997497559, "learning_rate": 1.9999580482038925e-05, "loss": 0.3061, "step": 17490 }, { "epoch": 21.10983705491853, "grad_norm": 14.815119743347168, "learning_rate": 1.999958024078413e-05, "loss": 0.3128, "step": 17500 }, { "epoch": 21.10983705491853, "eval_loss": 10.17077350616455, "eval_runtime": 8.1347, "eval_samples_per_second": 85.682, "eval_steps_per_second": 10.818, "step": 17500 }, { "epoch": 21.121907060953532, "grad_norm": 14.495138168334961, "learning_rate": 1.9999579999529337e-05, "loss": 0.3201, "step": 17510 }, { "epoch": 21.133977066988532, "grad_norm": 13.939275741577148, "learning_rate": 1.9999579758274543e-05, "loss": 0.319, "step": 17520 }, { "epoch": 21.146047073023535, "grad_norm": 14.171784400939941, "learning_rate": 1.999957951701975e-05, "loss": 0.3169, "step": 17530 }, { "epoch": 21.158117079058538, "grad_norm": 13.311067581176758, "learning_rate": 1.9999579275764956e-05, "loss": 0.3177, "step": 17540 }, { "epoch": 21.17018708509354, "grad_norm": 13.40538501739502, "learning_rate": 1.9999579034510162e-05, "loss": 0.3188, "step": 17550 }, { "epoch": 21.182257091128545, "grad_norm": 12.765137672424316, "learning_rate": 1.9999578793255368e-05, "loss": 0.3257, "step": 17560 }, { "epoch": 21.194327097163548, "grad_norm": 14.14867115020752, "learning_rate": 1.9999578552000574e-05, "loss": 0.3181, "step": 17570 }, { "epoch": 21.20639710319855, "grad_norm": 13.9575777053833, "learning_rate": 1.999957831074578e-05, "loss": 0.3257, "step": 17580 }, { "epoch": 21.218467109233554, "grad_norm": 13.838704109191895, "learning_rate": 1.9999578069490987e-05, "loss": 0.3181, "step": 17590 }, { "epoch": 21.230537115268557, "grad_norm": 
13.581345558166504, "learning_rate": 1.9999577828236193e-05, "loss": 0.3288, "step": 17600 }, { "epoch": 21.24260712130356, "grad_norm": 12.839767456054688, "learning_rate": 1.99995775869814e-05, "loss": 0.3215, "step": 17610 }, { "epoch": 21.254677127338564, "grad_norm": 13.807430267333984, "learning_rate": 1.9999577345726605e-05, "loss": 0.3347, "step": 17620 }, { "epoch": 21.266747133373567, "grad_norm": 14.427085876464844, "learning_rate": 1.999957710447181e-05, "loss": 0.3278, "step": 17630 }, { "epoch": 21.27881713940857, "grad_norm": 12.672168731689453, "learning_rate": 1.9999576863217018e-05, "loss": 0.3341, "step": 17640 }, { "epoch": 21.290887145443573, "grad_norm": 15.44679069519043, "learning_rate": 1.999957662196222e-05, "loss": 0.3423, "step": 17650 }, { "epoch": 21.302957151478576, "grad_norm": 14.079726219177246, "learning_rate": 1.999957638070743e-05, "loss": 0.3378, "step": 17660 }, { "epoch": 21.31502715751358, "grad_norm": 14.768683433532715, "learning_rate": 1.9999576139452636e-05, "loss": 0.3424, "step": 17670 }, { "epoch": 21.327097163548583, "grad_norm": 14.612085342407227, "learning_rate": 1.9999575898197843e-05, "loss": 0.3386, "step": 17680 }, { "epoch": 21.339167169583586, "grad_norm": 14.82508373260498, "learning_rate": 1.999957565694305e-05, "loss": 0.3379, "step": 17690 }, { "epoch": 21.35123717561859, "grad_norm": 14.027159690856934, "learning_rate": 1.9999575415688255e-05, "loss": 0.3392, "step": 17700 }, { "epoch": 21.363307181653592, "grad_norm": 14.301624298095703, "learning_rate": 1.999957517443346e-05, "loss": 0.354, "step": 17710 }, { "epoch": 21.375377187688592, "grad_norm": 13.50343132019043, "learning_rate": 1.9999574933178668e-05, "loss": 0.3431, "step": 17720 }, { "epoch": 21.387447193723595, "grad_norm": 14.708610534667969, "learning_rate": 1.9999574691923874e-05, "loss": 0.3458, "step": 17730 }, { "epoch": 21.3995171997586, "grad_norm": 14.399552345275879, "learning_rate": 1.999957445066908e-05, "loss": 0.3499, "step": 
17740 }, { "epoch": 21.4115872057936, "grad_norm": 14.767723083496094, "learning_rate": 1.9999574209414286e-05, "loss": 0.3496, "step": 17750 }, { "epoch": 21.423657211828605, "grad_norm": 14.75951862335205, "learning_rate": 1.9999573968159492e-05, "loss": 0.3464, "step": 17760 }, { "epoch": 21.435727217863608, "grad_norm": 14.061502456665039, "learning_rate": 1.99995737269047e-05, "loss": 0.3457, "step": 17770 }, { "epoch": 21.44779722389861, "grad_norm": 14.707136154174805, "learning_rate": 1.9999573485649905e-05, "loss": 0.3513, "step": 17780 }, { "epoch": 21.459867229933614, "grad_norm": 14.54099178314209, "learning_rate": 1.999957324439511e-05, "loss": 0.3476, "step": 17790 }, { "epoch": 21.471937235968618, "grad_norm": 13.352415084838867, "learning_rate": 1.9999573003140317e-05, "loss": 0.3563, "step": 17800 }, { "epoch": 21.48400724200362, "grad_norm": 14.306955337524414, "learning_rate": 1.999957276188552e-05, "loss": 0.3466, "step": 17810 }, { "epoch": 21.496077248038624, "grad_norm": 15.991957664489746, "learning_rate": 1.9999572520630726e-05, "loss": 0.357, "step": 17820 }, { "epoch": 21.508147254073627, "grad_norm": 14.474764823913574, "learning_rate": 1.9999572279375933e-05, "loss": 0.3633, "step": 17830 }, { "epoch": 21.52021726010863, "grad_norm": 14.120390892028809, "learning_rate": 1.999957203812114e-05, "loss": 0.3632, "step": 17840 }, { "epoch": 21.532287266143634, "grad_norm": 14.645788192749023, "learning_rate": 1.9999571796866345e-05, "loss": 0.361, "step": 17850 }, { "epoch": 21.544357272178637, "grad_norm": 15.194024085998535, "learning_rate": 1.999957155561155e-05, "loss": 0.3691, "step": 17860 }, { "epoch": 21.55642727821364, "grad_norm": 15.196239471435547, "learning_rate": 1.9999571314356757e-05, "loss": 0.3746, "step": 17870 }, { "epoch": 21.568497284248643, "grad_norm": 14.847043991088867, "learning_rate": 1.9999571073101964e-05, "loss": 0.3626, "step": 17880 }, { "epoch": 21.580567290283646, "grad_norm": 14.802926063537598, 
"learning_rate": 1.999957083184717e-05, "loss": 0.3619, "step": 17890 }, { "epoch": 21.59263729631865, "grad_norm": 15.28620719909668, "learning_rate": 1.9999570590592376e-05, "loss": 0.3621, "step": 17900 }, { "epoch": 21.604707302353653, "grad_norm": 15.632220268249512, "learning_rate": 1.9999570349337582e-05, "loss": 0.3622, "step": 17910 }, { "epoch": 21.616777308388656, "grad_norm": 15.056783676147461, "learning_rate": 1.999957010808279e-05, "loss": 0.3679, "step": 17920 }, { "epoch": 21.62884731442366, "grad_norm": 14.656344413757324, "learning_rate": 1.9999569866827995e-05, "loss": 0.3657, "step": 17930 }, { "epoch": 21.64091732045866, "grad_norm": 16.149154663085938, "learning_rate": 1.99995696255732e-05, "loss": 0.3652, "step": 17940 }, { "epoch": 21.652987326493662, "grad_norm": 14.033272743225098, "learning_rate": 1.9999569384318407e-05, "loss": 0.3772, "step": 17950 }, { "epoch": 21.665057332528665, "grad_norm": 15.682516098022461, "learning_rate": 1.9999569143063613e-05, "loss": 0.3738, "step": 17960 }, { "epoch": 21.67712733856367, "grad_norm": 15.525935173034668, "learning_rate": 1.999956890180882e-05, "loss": 0.3787, "step": 17970 }, { "epoch": 21.68919734459867, "grad_norm": 15.063936233520508, "learning_rate": 1.9999568660554026e-05, "loss": 0.3683, "step": 17980 }, { "epoch": 21.701267350633675, "grad_norm": 15.034740447998047, "learning_rate": 1.9999568419299232e-05, "loss": 0.3781, "step": 17990 }, { "epoch": 21.713337356668678, "grad_norm": 14.918134689331055, "learning_rate": 1.9999568178044438e-05, "loss": 0.3881, "step": 18000 }, { "epoch": 21.713337356668678, "eval_loss": 10.247665405273438, "eval_runtime": 8.1476, "eval_samples_per_second": 85.547, "eval_steps_per_second": 10.801, "step": 18000 }, { "epoch": 21.72540736270368, "grad_norm": 15.117865562438965, "learning_rate": 1.9999567936789644e-05, "loss": 0.3786, "step": 18010 }, { "epoch": 21.737477368738684, "grad_norm": 14.033202171325684, "learning_rate": 1.999956769553485e-05, 
"loss": 0.379, "step": 18020 }, { "epoch": 21.749547374773687, "grad_norm": 14.990311622619629, "learning_rate": 1.9999567454280057e-05, "loss": 0.3713, "step": 18030 }, { "epoch": 21.76161738080869, "grad_norm": 14.664165496826172, "learning_rate": 1.9999567213025263e-05, "loss": 0.3761, "step": 18040 }, { "epoch": 21.773687386843694, "grad_norm": 15.075085639953613, "learning_rate": 1.999956697177047e-05, "loss": 0.3856, "step": 18050 }, { "epoch": 21.785757392878697, "grad_norm": 15.277259826660156, "learning_rate": 1.9999566730515672e-05, "loss": 0.3771, "step": 18060 }, { "epoch": 21.7978273989137, "grad_norm": 14.35598373413086, "learning_rate": 1.999956648926088e-05, "loss": 0.3808, "step": 18070 }, { "epoch": 21.809897404948703, "grad_norm": 14.716455459594727, "learning_rate": 1.9999566248006085e-05, "loss": 0.3834, "step": 18080 }, { "epoch": 21.821967410983707, "grad_norm": 15.47046947479248, "learning_rate": 1.999956600675129e-05, "loss": 0.3875, "step": 18090 }, { "epoch": 21.83403741701871, "grad_norm": 15.223883628845215, "learning_rate": 1.9999565765496497e-05, "loss": 0.3771, "step": 18100 }, { "epoch": 21.846107423053713, "grad_norm": 15.500845909118652, "learning_rate": 1.9999565524241703e-05, "loss": 0.3828, "step": 18110 }, { "epoch": 21.858177429088716, "grad_norm": 15.654467582702637, "learning_rate": 1.999956528298691e-05, "loss": 0.387, "step": 18120 }, { "epoch": 21.87024743512372, "grad_norm": 15.244475364685059, "learning_rate": 1.9999565041732116e-05, "loss": 0.3843, "step": 18130 }, { "epoch": 21.88231744115872, "grad_norm": 14.947580337524414, "learning_rate": 1.9999564800477322e-05, "loss": 0.3801, "step": 18140 }, { "epoch": 21.894387447193722, "grad_norm": 14.386096954345703, "learning_rate": 1.9999564559222528e-05, "loss": 0.3949, "step": 18150 }, { "epoch": 21.906457453228725, "grad_norm": 15.819478034973145, "learning_rate": 1.9999564317967734e-05, "loss": 0.3989, "step": 18160 }, { "epoch": 21.91852745926373, "grad_norm": 
14.475298881530762, "learning_rate": 1.999956407671294e-05, "loss": 0.3931, "step": 18170 }, { "epoch": 21.93059746529873, "grad_norm": 15.776381492614746, "learning_rate": 1.9999563835458147e-05, "loss": 0.3927, "step": 18180 }, { "epoch": 21.942667471333735, "grad_norm": 15.907670974731445, "learning_rate": 1.9999563594203353e-05, "loss": 0.3924, "step": 18190 }, { "epoch": 21.954737477368738, "grad_norm": 16.85502815246582, "learning_rate": 1.9999563352948563e-05, "loss": 0.3969, "step": 18200 }, { "epoch": 21.96680748340374, "grad_norm": 14.597153663635254, "learning_rate": 1.999956311169377e-05, "loss": 0.3975, "step": 18210 }, { "epoch": 21.978877489438744, "grad_norm": 16.856216430664062, "learning_rate": 1.9999562870438975e-05, "loss": 0.4038, "step": 18220 }, { "epoch": 21.990947495473748, "grad_norm": 15.952351570129395, "learning_rate": 1.9999562629184178e-05, "loss": 0.3915, "step": 18230 }, { "epoch": 22.002414001207, "grad_norm": 11.803852081298828, "learning_rate": 1.9999562387929384e-05, "loss": 0.3679, "step": 18240 }, { "epoch": 22.014484007242004, "grad_norm": 11.927762985229492, "learning_rate": 1.999956214667459e-05, "loss": 0.2537, "step": 18250 }, { "epoch": 22.026554013277007, "grad_norm": 12.132996559143066, "learning_rate": 1.9999561905419796e-05, "loss": 0.2542, "step": 18260 }, { "epoch": 22.03862401931201, "grad_norm": 13.466547012329102, "learning_rate": 1.9999561664165003e-05, "loss": 0.2614, "step": 18270 }, { "epoch": 22.050694025347013, "grad_norm": 12.665739059448242, "learning_rate": 1.999956142291021e-05, "loss": 0.2654, "step": 18280 }, { "epoch": 22.062764031382017, "grad_norm": 12.71487808227539, "learning_rate": 1.9999561181655415e-05, "loss": 0.2633, "step": 18290 }, { "epoch": 22.07483403741702, "grad_norm": 12.898911476135254, "learning_rate": 1.999956094040062e-05, "loss": 0.2621, "step": 18300 }, { "epoch": 22.086904043452023, "grad_norm": 11.623908996582031, "learning_rate": 1.9999560699145827e-05, "loss": 0.2714, 
"step": 18310 }, { "epoch": 22.098974049487026, "grad_norm": 12.942914962768555, "learning_rate": 1.9999560457891034e-05, "loss": 0.2745, "step": 18320 }, { "epoch": 22.11104405552203, "grad_norm": 12.743661880493164, "learning_rate": 1.999956021663624e-05, "loss": 0.2717, "step": 18330 }, { "epoch": 22.123114061557033, "grad_norm": 12.571853637695312, "learning_rate": 1.9999559975381446e-05, "loss": 0.2795, "step": 18340 }, { "epoch": 22.135184067592032, "grad_norm": 13.03171157836914, "learning_rate": 1.9999559734126652e-05, "loss": 0.2842, "step": 18350 }, { "epoch": 22.147254073627035, "grad_norm": 13.486390113830566, "learning_rate": 1.999955949287186e-05, "loss": 0.2817, "step": 18360 }, { "epoch": 22.15932407966204, "grad_norm": 14.161498069763184, "learning_rate": 1.9999559251617065e-05, "loss": 0.2914, "step": 18370 }, { "epoch": 22.17139408569704, "grad_norm": 13.526592254638672, "learning_rate": 1.999955901036227e-05, "loss": 0.2933, "step": 18380 }, { "epoch": 22.183464091732045, "grad_norm": 12.517003059387207, "learning_rate": 1.9999558769107477e-05, "loss": 0.291, "step": 18390 }, { "epoch": 22.195534097767048, "grad_norm": 13.62547492980957, "learning_rate": 1.9999558527852683e-05, "loss": 0.2782, "step": 18400 }, { "epoch": 22.20760410380205, "grad_norm": 14.7900972366333, "learning_rate": 1.999955828659789e-05, "loss": 0.2908, "step": 18410 }, { "epoch": 22.219674109837054, "grad_norm": 14.33768081665039, "learning_rate": 1.9999558045343096e-05, "loss": 0.2921, "step": 18420 }, { "epoch": 22.231744115872058, "grad_norm": 12.805663108825684, "learning_rate": 1.9999557804088302e-05, "loss": 0.2952, "step": 18430 }, { "epoch": 22.24381412190706, "grad_norm": 13.319748878479004, "learning_rate": 1.9999557562833508e-05, "loss": 0.2816, "step": 18440 }, { "epoch": 22.255884127942064, "grad_norm": 12.898439407348633, "learning_rate": 1.9999557321578715e-05, "loss": 0.2933, "step": 18450 }, { "epoch": 22.267954133977067, "grad_norm": 13.359171867370605, 
"learning_rate": 1.999955708032392e-05, "loss": 0.2996, "step": 18460 }, { "epoch": 22.28002414001207, "grad_norm": 13.120288848876953, "learning_rate": 1.9999556839069127e-05, "loss": 0.2965, "step": 18470 }, { "epoch": 22.292094146047074, "grad_norm": 12.439910888671875, "learning_rate": 1.999955659781433e-05, "loss": 0.2967, "step": 18480 }, { "epoch": 22.304164152082077, "grad_norm": 14.129825592041016, "learning_rate": 1.9999556356559536e-05, "loss": 0.29, "step": 18490 }, { "epoch": 22.31623415811708, "grad_norm": 14.116714477539062, "learning_rate": 1.9999556115304742e-05, "loss": 0.301, "step": 18500 }, { "epoch": 22.31623415811708, "eval_loss": 10.355953216552734, "eval_runtime": 8.1256, "eval_samples_per_second": 85.778, "eval_steps_per_second": 10.83, "step": 18500 }, { "epoch": 22.328304164152083, "grad_norm": 13.20102596282959, "learning_rate": 1.999955587404995e-05, "loss": 0.3049, "step": 18510 }, { "epoch": 22.340374170187086, "grad_norm": 13.707938194274902, "learning_rate": 1.9999555632795155e-05, "loss": 0.3048, "step": 18520 }, { "epoch": 22.35244417622209, "grad_norm": 13.867488861083984, "learning_rate": 1.999955539154036e-05, "loss": 0.3048, "step": 18530 }, { "epoch": 22.364514182257093, "grad_norm": 13.365896224975586, "learning_rate": 1.9999555150285567e-05, "loss": 0.3102, "step": 18540 }, { "epoch": 22.376584188292092, "grad_norm": 14.221571922302246, "learning_rate": 1.9999554909030773e-05, "loss": 0.3078, "step": 18550 }, { "epoch": 22.388654194327096, "grad_norm": 14.147294044494629, "learning_rate": 1.999955466777598e-05, "loss": 0.3121, "step": 18560 }, { "epoch": 22.4007242003621, "grad_norm": 13.85913372039795, "learning_rate": 1.9999554426521186e-05, "loss": 0.3007, "step": 18570 }, { "epoch": 22.412794206397102, "grad_norm": 13.824056625366211, "learning_rate": 1.9999554185266392e-05, "loss": 0.3013, "step": 18580 }, { "epoch": 22.424864212432105, "grad_norm": 12.765989303588867, "learning_rate": 1.9999553944011598e-05, "loss": 
0.3041, "step": 18590 }, { "epoch": 22.43693421846711, "grad_norm": 13.731369018554688, "learning_rate": 1.9999553702756804e-05, "loss": 0.3056, "step": 18600 }, { "epoch": 22.44900422450211, "grad_norm": 14.093430519104004, "learning_rate": 1.999955346150201e-05, "loss": 0.3106, "step": 18610 }, { "epoch": 22.461074230537115, "grad_norm": 13.07478141784668, "learning_rate": 1.9999553220247217e-05, "loss": 0.3055, "step": 18620 }, { "epoch": 22.473144236572118, "grad_norm": 13.108185768127441, "learning_rate": 1.9999552978992423e-05, "loss": 0.3161, "step": 18630 }, { "epoch": 22.48521424260712, "grad_norm": 14.242752075195312, "learning_rate": 1.999955273773763e-05, "loss": 0.3213, "step": 18640 }, { "epoch": 22.497284248642124, "grad_norm": 13.891339302062988, "learning_rate": 1.9999552496482835e-05, "loss": 0.3195, "step": 18650 }, { "epoch": 22.509354254677127, "grad_norm": 13.832612991333008, "learning_rate": 1.999955225522804e-05, "loss": 0.3145, "step": 18660 }, { "epoch": 22.52142426071213, "grad_norm": 15.330731391906738, "learning_rate": 1.9999552013973248e-05, "loss": 0.3195, "step": 18670 }, { "epoch": 22.533494266747134, "grad_norm": 14.159749031066895, "learning_rate": 1.9999551772718454e-05, "loss": 0.321, "step": 18680 }, { "epoch": 22.545564272782137, "grad_norm": 12.964016914367676, "learning_rate": 1.999955153146366e-05, "loss": 0.323, "step": 18690 }, { "epoch": 22.55763427881714, "grad_norm": 13.321394920349121, "learning_rate": 1.9999551290208867e-05, "loss": 0.3165, "step": 18700 }, { "epoch": 22.569704284852143, "grad_norm": 14.328272819519043, "learning_rate": 1.9999551048954073e-05, "loss": 0.3185, "step": 18710 }, { "epoch": 22.581774290887147, "grad_norm": 13.482905387878418, "learning_rate": 1.999955080769928e-05, "loss": 0.321, "step": 18720 }, { "epoch": 22.59384429692215, "grad_norm": 14.286419868469238, "learning_rate": 1.9999550566444482e-05, "loss": 0.3193, "step": 18730 }, { "epoch": 22.605914302957153, "grad_norm": 
14.63395881652832, "learning_rate": 1.999955032518969e-05, "loss": 0.317, "step": 18740 }, { "epoch": 22.617984308992156, "grad_norm": 14.584534645080566, "learning_rate": 1.9999550083934898e-05, "loss": 0.3201, "step": 18750 }, { "epoch": 22.630054315027156, "grad_norm": 14.250128746032715, "learning_rate": 1.9999549842680104e-05, "loss": 0.3358, "step": 18760 }, { "epoch": 22.64212432106216, "grad_norm": 13.663447380065918, "learning_rate": 1.999954960142531e-05, "loss": 0.3262, "step": 18770 }, { "epoch": 22.654194327097162, "grad_norm": 13.974159240722656, "learning_rate": 1.9999549360170516e-05, "loss": 0.3315, "step": 18780 }, { "epoch": 22.666264333132165, "grad_norm": 13.949405670166016, "learning_rate": 1.9999549118915722e-05, "loss": 0.326, "step": 18790 }, { "epoch": 22.67833433916717, "grad_norm": 15.09975814819336, "learning_rate": 1.999954887766093e-05, "loss": 0.3221, "step": 18800 }, { "epoch": 22.69040434520217, "grad_norm": 14.696736335754395, "learning_rate": 1.9999548636406135e-05, "loss": 0.3293, "step": 18810 }, { "epoch": 22.702474351237175, "grad_norm": 13.609739303588867, "learning_rate": 1.999954839515134e-05, "loss": 0.3341, "step": 18820 }, { "epoch": 22.714544357272178, "grad_norm": 15.430875778198242, "learning_rate": 1.9999548153896547e-05, "loss": 0.3297, "step": 18830 }, { "epoch": 22.72661436330718, "grad_norm": 15.032858848571777, "learning_rate": 1.9999547912641754e-05, "loss": 0.3393, "step": 18840 }, { "epoch": 22.738684369342185, "grad_norm": 13.38953971862793, "learning_rate": 1.999954767138696e-05, "loss": 0.3283, "step": 18850 }, { "epoch": 22.750754375377188, "grad_norm": 14.43060302734375, "learning_rate": 1.9999547430132166e-05, "loss": 0.3309, "step": 18860 }, { "epoch": 22.76282438141219, "grad_norm": 14.105151176452637, "learning_rate": 1.9999547188877372e-05, "loss": 0.329, "step": 18870 }, { "epoch": 22.774894387447194, "grad_norm": 13.70396900177002, "learning_rate": 1.999954694762258e-05, "loss": 0.3255, "step": 
18880 }, { "epoch": 22.786964393482197, "grad_norm": 14.897878646850586, "learning_rate": 1.999954670636778e-05, "loss": 0.3394, "step": 18890 }, { "epoch": 22.7990343995172, "grad_norm": 14.756895065307617, "learning_rate": 1.9999546465112987e-05, "loss": 0.3393, "step": 18900 }, { "epoch": 22.811104405552204, "grad_norm": 13.861629486083984, "learning_rate": 1.9999546223858194e-05, "loss": 0.3325, "step": 18910 }, { "epoch": 22.823174411587207, "grad_norm": 14.146031379699707, "learning_rate": 1.99995459826034e-05, "loss": 0.3332, "step": 18920 }, { "epoch": 22.83524441762221, "grad_norm": 14.833703994750977, "learning_rate": 1.9999545741348606e-05, "loss": 0.3363, "step": 18930 }, { "epoch": 22.847314423657213, "grad_norm": 16.36979103088379, "learning_rate": 1.9999545500093812e-05, "loss": 0.3385, "step": 18940 }, { "epoch": 22.859384429692216, "grad_norm": 13.932124137878418, "learning_rate": 1.999954525883902e-05, "loss": 0.3539, "step": 18950 }, { "epoch": 22.871454435727216, "grad_norm": 13.434460639953613, "learning_rate": 1.9999545017584225e-05, "loss": 0.34, "step": 18960 }, { "epoch": 22.88352444176222, "grad_norm": 14.419950485229492, "learning_rate": 1.999954477632943e-05, "loss": 0.3421, "step": 18970 }, { "epoch": 22.895594447797222, "grad_norm": 15.00802993774414, "learning_rate": 1.9999544535074637e-05, "loss": 0.3429, "step": 18980 }, { "epoch": 22.907664453832226, "grad_norm": 14.53161334991455, "learning_rate": 1.9999544293819843e-05, "loss": 0.3451, "step": 18990 }, { "epoch": 22.91973445986723, "grad_norm": 14.457475662231445, "learning_rate": 1.999954405256505e-05, "loss": 0.3497, "step": 19000 }, { "epoch": 22.91973445986723, "eval_loss": 10.411765098571777, "eval_runtime": 8.1259, "eval_samples_per_second": 85.776, "eval_steps_per_second": 10.83, "step": 19000 }, { "epoch": 22.931804465902232, "grad_norm": 14.691095352172852, "learning_rate": 1.9999543811310256e-05, "loss": 0.3517, "step": 19010 }, { "epoch": 22.943874471937235, 
"grad_norm": 14.264322280883789, "learning_rate": 1.9999543570055462e-05, "loss": 0.349, "step": 19020 }, { "epoch": 22.95594447797224, "grad_norm": 14.389098167419434, "learning_rate": 1.9999543328800668e-05, "loss": 0.3439, "step": 19030 }, { "epoch": 22.96801448400724, "grad_norm": 16.272417068481445, "learning_rate": 1.9999543087545874e-05, "loss": 0.3472, "step": 19040 }, { "epoch": 22.980084490042245, "grad_norm": 12.844181060791016, "learning_rate": 1.999954284629108e-05, "loss": 0.345, "step": 19050 }, { "epoch": 22.992154496077248, "grad_norm": 14.457566261291504, "learning_rate": 1.9999542605036287e-05, "loss": 0.3443, "step": 19060 }, { "epoch": 23.0036210018105, "grad_norm": 11.641766548156738, "learning_rate": 1.9999542363781493e-05, "loss": 0.3148, "step": 19070 }, { "epoch": 23.015691007845504, "grad_norm": 11.250316619873047, "learning_rate": 1.99995421225267e-05, "loss": 0.2246, "step": 19080 }, { "epoch": 23.027761013880507, "grad_norm": 11.912137985229492, "learning_rate": 1.9999541881271906e-05, "loss": 0.2304, "step": 19090 }, { "epoch": 23.03983101991551, "grad_norm": 12.417069435119629, "learning_rate": 1.9999541640017112e-05, "loss": 0.2349, "step": 19100 }, { "epoch": 23.051901025950514, "grad_norm": 12.372785568237305, "learning_rate": 1.9999541398762318e-05, "loss": 0.2328, "step": 19110 }, { "epoch": 23.063971031985517, "grad_norm": 12.917871475219727, "learning_rate": 1.9999541157507524e-05, "loss": 0.2391, "step": 19120 }, { "epoch": 23.07604103802052, "grad_norm": 12.086236953735352, "learning_rate": 1.999954091625273e-05, "loss": 0.2384, "step": 19130 }, { "epoch": 23.088111044055523, "grad_norm": 12.60556411743164, "learning_rate": 1.9999540674997933e-05, "loss": 0.2431, "step": 19140 }, { "epoch": 23.100181050090526, "grad_norm": 13.297846794128418, "learning_rate": 1.999954043374314e-05, "loss": 0.2406, "step": 19150 }, { "epoch": 23.11225105612553, "grad_norm": 13.543648719787598, "learning_rate": 1.9999540192488346e-05, "loss": 
0.2509, "step": 19160 }, { "epoch": 23.124321062160533, "grad_norm": 13.436534881591797, "learning_rate": 1.9999539951233552e-05, "loss": 0.25, "step": 19170 }, { "epoch": 23.136391068195532, "grad_norm": 12.35329532623291, "learning_rate": 1.9999539709978758e-05, "loss": 0.2465, "step": 19180 }, { "epoch": 23.148461074230536, "grad_norm": 12.136104583740234, "learning_rate": 1.9999539468723964e-05, "loss": 0.2582, "step": 19190 }, { "epoch": 23.16053108026554, "grad_norm": 13.014857292175293, "learning_rate": 1.999953922746917e-05, "loss": 0.2504, "step": 19200 }, { "epoch": 23.172601086300542, "grad_norm": 12.555720329284668, "learning_rate": 1.9999538986214377e-05, "loss": 0.2563, "step": 19210 }, { "epoch": 23.184671092335545, "grad_norm": 13.008694648742676, "learning_rate": 1.9999538744959583e-05, "loss": 0.2553, "step": 19220 }, { "epoch": 23.19674109837055, "grad_norm": 12.983438491821289, "learning_rate": 1.999953850370479e-05, "loss": 0.2537, "step": 19230 }, { "epoch": 23.20881110440555, "grad_norm": 12.572404861450195, "learning_rate": 1.9999538262449995e-05, "loss": 0.2596, "step": 19240 }, { "epoch": 23.220881110440555, "grad_norm": 13.268893241882324, "learning_rate": 1.99995380211952e-05, "loss": 0.2584, "step": 19250 }, { "epoch": 23.232951116475558, "grad_norm": 13.221393585205078, "learning_rate": 1.9999537779940408e-05, "loss": 0.2656, "step": 19260 }, { "epoch": 23.24502112251056, "grad_norm": 13.60205364227295, "learning_rate": 1.9999537538685614e-05, "loss": 0.2667, "step": 19270 }, { "epoch": 23.257091128545564, "grad_norm": 12.569549560546875, "learning_rate": 1.9999537297430824e-05, "loss": 0.2647, "step": 19280 }, { "epoch": 23.269161134580568, "grad_norm": 12.754693984985352, "learning_rate": 1.999953705617603e-05, "loss": 0.2625, "step": 19290 }, { "epoch": 23.28123114061557, "grad_norm": 12.10505199432373, "learning_rate": 1.9999536814921233e-05, "loss": 0.2626, "step": 19300 }, { "epoch": 23.293301146650574, "grad_norm": 
13.053898811340332, "learning_rate": 1.999953657366644e-05, "loss": 0.2675, "step": 19310 }, { "epoch": 23.305371152685577, "grad_norm": 12.448768615722656, "learning_rate": 1.9999536332411645e-05, "loss": 0.2609, "step": 19320 }, { "epoch": 23.31744115872058, "grad_norm": 12.864468574523926, "learning_rate": 1.999953609115685e-05, "loss": 0.2694, "step": 19330 }, { "epoch": 23.329511164755584, "grad_norm": 12.503049850463867, "learning_rate": 1.9999535849902058e-05, "loss": 0.2706, "step": 19340 }, { "epoch": 23.341581170790587, "grad_norm": 13.098746299743652, "learning_rate": 1.9999535608647264e-05, "loss": 0.2734, "step": 19350 }, { "epoch": 23.35365117682559, "grad_norm": 13.053861618041992, "learning_rate": 1.999953536739247e-05, "loss": 0.2664, "step": 19360 }, { "epoch": 23.365721182860593, "grad_norm": 14.048847198486328, "learning_rate": 1.9999535126137676e-05, "loss": 0.2742, "step": 19370 }, { "epoch": 23.377791188895593, "grad_norm": 12.131243705749512, "learning_rate": 1.9999534884882882e-05, "loss": 0.2691, "step": 19380 }, { "epoch": 23.389861194930596, "grad_norm": 13.500530242919922, "learning_rate": 1.999953464362809e-05, "loss": 0.2692, "step": 19390 }, { "epoch": 23.4019312009656, "grad_norm": 12.286332130432129, "learning_rate": 1.9999534402373295e-05, "loss": 0.2684, "step": 19400 }, { "epoch": 23.414001207000602, "grad_norm": 13.484957695007324, "learning_rate": 1.99995341611185e-05, "loss": 0.2779, "step": 19410 }, { "epoch": 23.426071213035605, "grad_norm": 13.234942436218262, "learning_rate": 1.9999533919863707e-05, "loss": 0.2697, "step": 19420 }, { "epoch": 23.43814121907061, "grad_norm": 13.189998626708984, "learning_rate": 1.9999533678608913e-05, "loss": 0.2768, "step": 19430 }, { "epoch": 23.450211225105612, "grad_norm": 13.180237770080566, "learning_rate": 1.999953343735412e-05, "loss": 0.2807, "step": 19440 }, { "epoch": 23.462281231140615, "grad_norm": 13.426108360290527, "learning_rate": 1.9999533196099326e-05, "loss": 0.2755, 
"step": 19450 }, { "epoch": 23.474351237175618, "grad_norm": 13.426348686218262, "learning_rate": 1.9999532954844532e-05, "loss": 0.2885, "step": 19460 }, { "epoch": 23.48642124321062, "grad_norm": 12.595351219177246, "learning_rate": 1.999953271358974e-05, "loss": 0.2839, "step": 19470 }, { "epoch": 23.498491249245625, "grad_norm": 14.211012840270996, "learning_rate": 1.9999532472334945e-05, "loss": 0.2816, "step": 19480 }, { "epoch": 23.510561255280628, "grad_norm": 13.602468490600586, "learning_rate": 1.999953223108015e-05, "loss": 0.2887, "step": 19490 }, { "epoch": 23.52263126131563, "grad_norm": 13.452590942382812, "learning_rate": 1.9999531989825357e-05, "loss": 0.2857, "step": 19500 }, { "epoch": 23.52263126131563, "eval_loss": 10.491438865661621, "eval_runtime": 8.1277, "eval_samples_per_second": 85.756, "eval_steps_per_second": 10.827, "step": 19500 }, { "epoch": 23.534701267350634, "grad_norm": 13.38037395477295, "learning_rate": 1.9999531748570563e-05, "loss": 0.2833, "step": 19510 }, { "epoch": 23.546771273385637, "grad_norm": 13.903613090515137, "learning_rate": 1.999953150731577e-05, "loss": 0.2787, "step": 19520 }, { "epoch": 23.55884127942064, "grad_norm": 13.020818710327148, "learning_rate": 1.9999531266060976e-05, "loss": 0.2872, "step": 19530 }, { "epoch": 23.570911285455644, "grad_norm": 13.168354988098145, "learning_rate": 1.9999531024806182e-05, "loss": 0.2868, "step": 19540 }, { "epoch": 23.582981291490647, "grad_norm": 13.172264099121094, "learning_rate": 1.9999530783551385e-05, "loss": 0.2908, "step": 19550 }, { "epoch": 23.59505129752565, "grad_norm": 13.667034149169922, "learning_rate": 1.999953054229659e-05, "loss": 0.2917, "step": 19560 }, { "epoch": 23.607121303560653, "grad_norm": 14.02136516571045, "learning_rate": 1.9999530301041797e-05, "loss": 0.2895, "step": 19570 }, { "epoch": 23.619191309595656, "grad_norm": 14.132126808166504, "learning_rate": 1.9999530059787003e-05, "loss": 0.2969, "step": 19580 }, { "epoch": 
23.631261315630656, "grad_norm": 13.228663444519043, "learning_rate": 1.999952981853221e-05, "loss": 0.2946, "step": 19590 }, { "epoch": 23.64333132166566, "grad_norm": 13.641292572021484, "learning_rate": 1.9999529577277416e-05, "loss": 0.2906, "step": 19600 }, { "epoch": 23.655401327700663, "grad_norm": 13.228596687316895, "learning_rate": 1.9999529336022622e-05, "loss": 0.2927, "step": 19610 }, { "epoch": 23.667471333735666, "grad_norm": 13.745280265808105, "learning_rate": 1.9999529094767828e-05, "loss": 0.2924, "step": 19620 }, { "epoch": 23.67954133977067, "grad_norm": 14.010147094726562, "learning_rate": 1.9999528853513034e-05, "loss": 0.2924, "step": 19630 }, { "epoch": 23.691611345805672, "grad_norm": 13.574163436889648, "learning_rate": 1.999952861225824e-05, "loss": 0.2913, "step": 19640 }, { "epoch": 23.703681351840675, "grad_norm": 13.51381778717041, "learning_rate": 1.9999528371003447e-05, "loss": 0.2936, "step": 19650 }, { "epoch": 23.71575135787568, "grad_norm": 13.301969528198242, "learning_rate": 1.9999528129748653e-05, "loss": 0.2964, "step": 19660 }, { "epoch": 23.72782136391068, "grad_norm": 14.137504577636719, "learning_rate": 1.999952788849386e-05, "loss": 0.2945, "step": 19670 }, { "epoch": 23.739891369945685, "grad_norm": 13.961593627929688, "learning_rate": 1.9999527647239065e-05, "loss": 0.2993, "step": 19680 }, { "epoch": 23.751961375980688, "grad_norm": 13.299009323120117, "learning_rate": 1.999952740598427e-05, "loss": 0.2971, "step": 19690 }, { "epoch": 23.76403138201569, "grad_norm": 13.120593070983887, "learning_rate": 1.9999527164729478e-05, "loss": 0.3003, "step": 19700 }, { "epoch": 23.776101388050694, "grad_norm": 13.35731315612793, "learning_rate": 1.9999526923474684e-05, "loss": 0.3031, "step": 19710 }, { "epoch": 23.788171394085698, "grad_norm": 13.648883819580078, "learning_rate": 1.999952668221989e-05, "loss": 0.2984, "step": 19720 }, { "epoch": 23.8002414001207, "grad_norm": 13.840371131896973, "learning_rate": 
1.9999526440965097e-05, "loss": 0.3046, "step": 19730 }, { "epoch": 23.812311406155704, "grad_norm": 13.295636177062988, "learning_rate": 1.9999526199710303e-05, "loss": 0.3079, "step": 19740 }, { "epoch": 23.824381412190707, "grad_norm": 14.884610176086426, "learning_rate": 1.999952595845551e-05, "loss": 0.3027, "step": 19750 }, { "epoch": 23.83645141822571, "grad_norm": 12.818931579589844, "learning_rate": 1.9999525717200715e-05, "loss": 0.3063, "step": 19760 }, { "epoch": 23.848521424260714, "grad_norm": 14.367683410644531, "learning_rate": 1.999952547594592e-05, "loss": 0.3037, "step": 19770 }, { "epoch": 23.860591430295717, "grad_norm": 14.862786293029785, "learning_rate": 1.9999525234691128e-05, "loss": 0.2999, "step": 19780 }, { "epoch": 23.872661436330716, "grad_norm": 13.110206604003906, "learning_rate": 1.9999524993436334e-05, "loss": 0.3014, "step": 19790 }, { "epoch": 23.88473144236572, "grad_norm": 13.524511337280273, "learning_rate": 1.9999524752181537e-05, "loss": 0.3074, "step": 19800 }, { "epoch": 23.896801448400723, "grad_norm": 13.695358276367188, "learning_rate": 1.9999524510926743e-05, "loss": 0.3109, "step": 19810 }, { "epoch": 23.908871454435726, "grad_norm": 14.052223205566406, "learning_rate": 1.9999524269671952e-05, "loss": 0.3099, "step": 19820 }, { "epoch": 23.92094146047073, "grad_norm": 14.001981735229492, "learning_rate": 1.999952402841716e-05, "loss": 0.3034, "step": 19830 }, { "epoch": 23.933011466505732, "grad_norm": 14.2471923828125, "learning_rate": 1.9999523787162365e-05, "loss": 0.3062, "step": 19840 }, { "epoch": 23.945081472540735, "grad_norm": 13.810246467590332, "learning_rate": 1.999952354590757e-05, "loss": 0.3019, "step": 19850 }, { "epoch": 23.95715147857574, "grad_norm": 14.199981689453125, "learning_rate": 1.9999523304652777e-05, "loss": 0.3153, "step": 19860 }, { "epoch": 23.969221484610742, "grad_norm": 13.98547649383545, "learning_rate": 1.9999523063397984e-05, "loss": 0.3096, "step": 19870 }, { "epoch": 
23.981291490645745, "grad_norm": 13.416516304016113, "learning_rate": 1.999952282214319e-05, "loss": 0.3103, "step": 19880 }, { "epoch": 23.99336149668075, "grad_norm": 13.513847351074219, "learning_rate": 1.9999522580888396e-05, "loss": 0.3119, "step": 19890 }, { "epoch": 24.004828002414, "grad_norm": 11.667850494384766, "learning_rate": 1.9999522339633602e-05, "loss": 0.265, "step": 19900 }, { "epoch": 24.016898008449004, "grad_norm": 11.197464942932129, "learning_rate": 1.999952209837881e-05, "loss": 0.2064, "step": 19910 }, { "epoch": 24.028968014484008, "grad_norm": 11.428786277770996, "learning_rate": 1.9999521857124015e-05, "loss": 0.2117, "step": 19920 }, { "epoch": 24.04103802051901, "grad_norm": 12.414345741271973, "learning_rate": 1.999952161586922e-05, "loss": 0.2121, "step": 19930 }, { "epoch": 24.053108026554014, "grad_norm": 11.669108390808105, "learning_rate": 1.9999521374614427e-05, "loss": 0.2192, "step": 19940 }, { "epoch": 24.065178032589017, "grad_norm": 11.713314056396484, "learning_rate": 1.9999521133359633e-05, "loss": 0.2214, "step": 19950 }, { "epoch": 24.07724803862402, "grad_norm": 11.71852970123291, "learning_rate": 1.999952089210484e-05, "loss": 0.2146, "step": 19960 }, { "epoch": 24.089318044659024, "grad_norm": 11.718681335449219, "learning_rate": 1.9999520650850042e-05, "loss": 0.2256, "step": 19970 }, { "epoch": 24.101388050694027, "grad_norm": 11.692678451538086, "learning_rate": 1.999952040959525e-05, "loss": 0.2256, "step": 19980 }, { "epoch": 24.11345805672903, "grad_norm": 11.94239616394043, "learning_rate": 1.9999520168340455e-05, "loss": 0.2274, "step": 19990 }, { "epoch": 24.125528062764033, "grad_norm": 10.19051742553711, "learning_rate": 1.999951992708566e-05, "loss": 0.2228, "step": 20000 }, { "epoch": 24.125528062764033, "eval_loss": 10.560831069946289, "eval_runtime": 8.1301, "eval_samples_per_second": 85.731, "eval_steps_per_second": 10.824, "step": 20000 }, { "epoch": 24.137598068799033, "grad_norm": 
11.699931144714355, "learning_rate": 1.9999519685830867e-05, "loss": 0.2283, "step": 20010 }, { "epoch": 24.149668074834036, "grad_norm": 11.991548538208008, "learning_rate": 1.9999519444576073e-05, "loss": 0.229, "step": 20020 }, { "epoch": 24.16173808086904, "grad_norm": 11.9141263961792, "learning_rate": 1.999951920332128e-05, "loss": 0.234, "step": 20030 }, { "epoch": 24.173808086904042, "grad_norm": 13.145759582519531, "learning_rate": 1.9999518962066486e-05, "loss": 0.2322, "step": 20040 }, { "epoch": 24.185878092939046, "grad_norm": 11.663484573364258, "learning_rate": 1.9999518720811692e-05, "loss": 0.2343, "step": 20050 }, { "epoch": 24.19794809897405, "grad_norm": 12.165745735168457, "learning_rate": 1.9999518479556898e-05, "loss": 0.2288, "step": 20060 }, { "epoch": 24.210018105009052, "grad_norm": 11.756146430969238, "learning_rate": 1.9999518238302104e-05, "loss": 0.2343, "step": 20070 }, { "epoch": 24.222088111044055, "grad_norm": 12.810198783874512, "learning_rate": 1.999951799704731e-05, "loss": 0.2368, "step": 20080 }, { "epoch": 24.23415811707906, "grad_norm": 11.537394523620605, "learning_rate": 1.9999517755792517e-05, "loss": 0.2348, "step": 20090 }, { "epoch": 24.24622812311406, "grad_norm": 12.279565811157227, "learning_rate": 1.9999517514537723e-05, "loss": 0.2382, "step": 20100 }, { "epoch": 24.258298129149065, "grad_norm": 12.06338882446289, "learning_rate": 1.999951727328293e-05, "loss": 0.236, "step": 20110 }, { "epoch": 24.270368135184068, "grad_norm": 13.126385688781738, "learning_rate": 1.9999517032028136e-05, "loss": 0.2388, "step": 20120 }, { "epoch": 24.28243814121907, "grad_norm": 13.292683601379395, "learning_rate": 1.9999516790773342e-05, "loss": 0.2371, "step": 20130 }, { "epoch": 24.294508147254074, "grad_norm": 12.641128540039062, "learning_rate": 1.9999516549518548e-05, "loss": 0.2407, "step": 20140 }, { "epoch": 24.306578153289077, "grad_norm": 12.56368637084961, "learning_rate": 1.9999516308263754e-05, "loss": 0.2438, 
"step": 20150 }, { "epoch": 24.31864815932408, "grad_norm": 12.020295143127441, "learning_rate": 1.999951606700896e-05, "loss": 0.2367, "step": 20160 }, { "epoch": 24.330718165359084, "grad_norm": 13.591099739074707, "learning_rate": 1.9999515825754167e-05, "loss": 0.2402, "step": 20170 }, { "epoch": 24.342788171394087, "grad_norm": 12.31505012512207, "learning_rate": 1.9999515584499373e-05, "loss": 0.2372, "step": 20180 }, { "epoch": 24.35485817742909, "grad_norm": 12.794053077697754, "learning_rate": 1.999951534324458e-05, "loss": 0.2439, "step": 20190 }, { "epoch": 24.366928183464093, "grad_norm": 12.377053260803223, "learning_rate": 1.9999515101989785e-05, "loss": 0.2467, "step": 20200 }, { "epoch": 24.378998189499093, "grad_norm": 12.698912620544434, "learning_rate": 1.999951486073499e-05, "loss": 0.2427, "step": 20210 }, { "epoch": 24.391068195534096, "grad_norm": 12.063161849975586, "learning_rate": 1.9999514619480194e-05, "loss": 0.2509, "step": 20220 }, { "epoch": 24.4031382015691, "grad_norm": 12.005499839782715, "learning_rate": 1.99995143782254e-05, "loss": 0.249, "step": 20230 }, { "epoch": 24.415208207604103, "grad_norm": 13.947732925415039, "learning_rate": 1.9999514136970607e-05, "loss": 0.2508, "step": 20240 }, { "epoch": 24.427278213639106, "grad_norm": 12.222932815551758, "learning_rate": 1.9999513895715813e-05, "loss": 0.2537, "step": 20250 }, { "epoch": 24.43934821967411, "grad_norm": 13.263693809509277, "learning_rate": 1.999951365446102e-05, "loss": 0.246, "step": 20260 }, { "epoch": 24.451418225709112, "grad_norm": 12.019664764404297, "learning_rate": 1.9999513413206225e-05, "loss": 0.2491, "step": 20270 }, { "epoch": 24.463488231744115, "grad_norm": 11.942339897155762, "learning_rate": 1.999951317195143e-05, "loss": 0.2541, "step": 20280 }, { "epoch": 24.47555823777912, "grad_norm": 13.257699012756348, "learning_rate": 1.9999512930696638e-05, "loss": 0.2577, "step": 20290 }, { "epoch": 24.48762824381412, "grad_norm": 13.09839153289795, 
"learning_rate": 1.9999512689441844e-05, "loss": 0.2596, "step": 20300 }, { "epoch": 24.499698249849125, "grad_norm": 13.487357139587402, "learning_rate": 1.999951244818705e-05, "loss": 0.2555, "step": 20310 }, { "epoch": 24.511768255884128, "grad_norm": 14.136542320251465, "learning_rate": 1.9999512206932256e-05, "loss": 0.2635, "step": 20320 }, { "epoch": 24.52383826191913, "grad_norm": 12.958622932434082, "learning_rate": 1.9999511965677463e-05, "loss": 0.2602, "step": 20330 }, { "epoch": 24.535908267954134, "grad_norm": 12.886746406555176, "learning_rate": 1.999951172442267e-05, "loss": 0.2585, "step": 20340 }, { "epoch": 24.547978273989138, "grad_norm": 13.442541122436523, "learning_rate": 1.9999511483167875e-05, "loss": 0.256, "step": 20350 }, { "epoch": 24.56004828002414, "grad_norm": 13.809309959411621, "learning_rate": 1.9999511241913085e-05, "loss": 0.2612, "step": 20360 }, { "epoch": 24.572118286059144, "grad_norm": 11.959115028381348, "learning_rate": 1.999951100065829e-05, "loss": 0.2602, "step": 20370 }, { "epoch": 24.584188292094147, "grad_norm": 13.246687889099121, "learning_rate": 1.9999510759403494e-05, "loss": 0.2602, "step": 20380 }, { "epoch": 24.59625829812915, "grad_norm": 13.457758903503418, "learning_rate": 1.99995105181487e-05, "loss": 0.2543, "step": 20390 }, { "epoch": 24.608328304164154, "grad_norm": 13.02320384979248, "learning_rate": 1.9999510276893906e-05, "loss": 0.264, "step": 20400 }, { "epoch": 24.620398310199157, "grad_norm": 14.013025283813477, "learning_rate": 1.9999510035639112e-05, "loss": 0.2596, "step": 20410 }, { "epoch": 24.632468316234156, "grad_norm": 13.165828704833984, "learning_rate": 1.999950979438432e-05, "loss": 0.2607, "step": 20420 }, { "epoch": 24.64453832226916, "grad_norm": 13.296303749084473, "learning_rate": 1.9999509553129525e-05, "loss": 0.2637, "step": 20430 }, { "epoch": 24.656608328304163, "grad_norm": 14.301431655883789, "learning_rate": 1.999950931187473e-05, "loss": 0.2708, "step": 20440 }, { 
"epoch": 24.668678334339166, "grad_norm": 12.411490440368652, "learning_rate": 1.9999509070619937e-05, "loss": 0.2587, "step": 20450 }, { "epoch": 24.68074834037417, "grad_norm": 13.311765670776367, "learning_rate": 1.9999508829365143e-05, "loss": 0.2614, "step": 20460 }, { "epoch": 24.692818346409172, "grad_norm": 13.636337280273438, "learning_rate": 1.999950858811035e-05, "loss": 0.2731, "step": 20470 }, { "epoch": 24.704888352444176, "grad_norm": 12.74411392211914, "learning_rate": 1.9999508346855556e-05, "loss": 0.2674, "step": 20480 }, { "epoch": 24.71695835847918, "grad_norm": 12.917356491088867, "learning_rate": 1.9999508105600762e-05, "loss": 0.2661, "step": 20490 }, { "epoch": 24.729028364514182, "grad_norm": 14.503663063049316, "learning_rate": 1.999950786434597e-05, "loss": 0.2691, "step": 20500 }, { "epoch": 24.729028364514182, "eval_loss": 10.653016090393066, "eval_runtime": 8.1334, "eval_samples_per_second": 85.697, "eval_steps_per_second": 10.82, "step": 20500 }, { "epoch": 24.741098370549185, "grad_norm": 12.044464111328125, "learning_rate": 1.9999507623091175e-05, "loss": 0.2669, "step": 20510 }, { "epoch": 24.75316837658419, "grad_norm": 12.576997756958008, "learning_rate": 1.999950738183638e-05, "loss": 0.2703, "step": 20520 }, { "epoch": 24.76523838261919, "grad_norm": 13.449885368347168, "learning_rate": 1.9999507140581587e-05, "loss": 0.272, "step": 20530 }, { "epoch": 24.777308388654195, "grad_norm": 12.43346118927002, "learning_rate": 1.9999506899326793e-05, "loss": 0.2659, "step": 20540 }, { "epoch": 24.789378394689198, "grad_norm": 12.984991073608398, "learning_rate": 1.9999506658072e-05, "loss": 0.2681, "step": 20550 }, { "epoch": 24.8014484007242, "grad_norm": 12.384934425354004, "learning_rate": 1.9999506416817206e-05, "loss": 0.2762, "step": 20560 }, { "epoch": 24.813518406759204, "grad_norm": 13.117884635925293, "learning_rate": 1.9999506175562412e-05, "loss": 0.2701, "step": 20570 }, { "epoch": 24.825588412794207, "grad_norm": 
12.807600021362305, "learning_rate": 1.9999505934307618e-05, "loss": 0.27, "step": 20580 }, { "epoch": 24.83765841882921, "grad_norm": 13.230265617370605, "learning_rate": 1.9999505693052824e-05, "loss": 0.269, "step": 20590 }, { "epoch": 24.849728424864214, "grad_norm": 12.769135475158691, "learning_rate": 1.999950545179803e-05, "loss": 0.2751, "step": 20600 }, { "epoch": 24.861798430899217, "grad_norm": 13.072405815124512, "learning_rate": 1.9999505210543237e-05, "loss": 0.274, "step": 20610 }, { "epoch": 24.873868436934217, "grad_norm": 13.053104400634766, "learning_rate": 1.9999504969288443e-05, "loss": 0.2798, "step": 20620 }, { "epoch": 24.88593844296922, "grad_norm": 13.557482719421387, "learning_rate": 1.9999504728033646e-05, "loss": 0.2768, "step": 20630 }, { "epoch": 24.898008449004223, "grad_norm": 13.680904388427734, "learning_rate": 1.9999504486778852e-05, "loss": 0.281, "step": 20640 }, { "epoch": 24.910078455039226, "grad_norm": 13.044319152832031, "learning_rate": 1.9999504245524058e-05, "loss": 0.2811, "step": 20650 }, { "epoch": 24.92214846107423, "grad_norm": 13.686800956726074, "learning_rate": 1.9999504004269264e-05, "loss": 0.2793, "step": 20660 }, { "epoch": 24.934218467109233, "grad_norm": 13.297164916992188, "learning_rate": 1.999950376301447e-05, "loss": 0.2834, "step": 20670 }, { "epoch": 24.946288473144236, "grad_norm": 13.367912292480469, "learning_rate": 1.9999503521759677e-05, "loss": 0.2764, "step": 20680 }, { "epoch": 24.95835847917924, "grad_norm": 12.975577354431152, "learning_rate": 1.9999503280504883e-05, "loss": 0.289, "step": 20690 }, { "epoch": 24.970428485214242, "grad_norm": 13.556283950805664, "learning_rate": 1.999950303925009e-05, "loss": 0.2782, "step": 20700 }, { "epoch": 24.982498491249245, "grad_norm": 13.344411849975586, "learning_rate": 1.9999502797995295e-05, "loss": 0.276, "step": 20710 }, { "epoch": 24.99456849728425, "grad_norm": 14.1496000289917, "learning_rate": 1.99995025567405e-05, "loss": 0.2824, "step": 
20720 }, { "epoch": 25.0060350030175, "grad_norm": 10.526139259338379, "learning_rate": 1.9999502315485708e-05, "loss": 0.2311, "step": 20730 }, { "epoch": 25.018105009052505, "grad_norm": 11.411677360534668, "learning_rate": 1.9999502074230914e-05, "loss": 0.1872, "step": 20740 }, { "epoch": 25.030175015087508, "grad_norm": 11.97055435180664, "learning_rate": 1.999950183297612e-05, "loss": 0.1894, "step": 20750 }, { "epoch": 25.04224502112251, "grad_norm": 11.941352844238281, "learning_rate": 1.9999501591721327e-05, "loss": 0.2005, "step": 20760 }, { "epoch": 25.054315027157514, "grad_norm": 10.196296691894531, "learning_rate": 1.9999501350466533e-05, "loss": 0.1991, "step": 20770 }, { "epoch": 25.066385033192518, "grad_norm": 11.521965026855469, "learning_rate": 1.999950110921174e-05, "loss": 0.1925, "step": 20780 }, { "epoch": 25.07845503922752, "grad_norm": 11.03392505645752, "learning_rate": 1.9999500867956945e-05, "loss": 0.197, "step": 20790 }, { "epoch": 25.090525045262524, "grad_norm": 12.300382614135742, "learning_rate": 1.999950062670215e-05, "loss": 0.2066, "step": 20800 }, { "epoch": 25.102595051297527, "grad_norm": 11.152935981750488, "learning_rate": 1.9999500385447358e-05, "loss": 0.2093, "step": 20810 }, { "epoch": 25.11466505733253, "grad_norm": 12.586202621459961, "learning_rate": 1.9999500144192564e-05, "loss": 0.2067, "step": 20820 }, { "epoch": 25.12673506336753, "grad_norm": 11.153509140014648, "learning_rate": 1.999949990293777e-05, "loss": 0.2073, "step": 20830 }, { "epoch": 25.138805069402533, "grad_norm": 12.512395858764648, "learning_rate": 1.9999499661682976e-05, "loss": 0.215, "step": 20840 }, { "epoch": 25.150875075437536, "grad_norm": 12.13071346282959, "learning_rate": 1.9999499420428182e-05, "loss": 0.2196, "step": 20850 }, { "epoch": 25.16294508147254, "grad_norm": 11.715035438537598, "learning_rate": 1.999949917917339e-05, "loss": 0.2168, "step": 20860 }, { "epoch": 25.175015087507543, "grad_norm": 11.84085464477539, 
"learning_rate": 1.9999498937918595e-05, "loss": 0.2102, "step": 20870 }, { "epoch": 25.187085093542546, "grad_norm": 11.755614280700684, "learning_rate": 1.9999498696663798e-05, "loss": 0.2185, "step": 20880 }, { "epoch": 25.19915509957755, "grad_norm": 11.692583084106445, "learning_rate": 1.9999498455409004e-05, "loss": 0.2165, "step": 20890 }, { "epoch": 25.211225105612552, "grad_norm": 11.121865272521973, "learning_rate": 1.9999498214154214e-05, "loss": 0.2141, "step": 20900 }, { "epoch": 25.223295111647555, "grad_norm": 10.374176025390625, "learning_rate": 1.999949797289942e-05, "loss": 0.2155, "step": 20910 }, { "epoch": 25.23536511768256, "grad_norm": 11.215051651000977, "learning_rate": 1.9999497731644626e-05, "loss": 0.2195, "step": 20920 }, { "epoch": 25.247435123717562, "grad_norm": 11.10447883605957, "learning_rate": 1.9999497490389832e-05, "loss": 0.2183, "step": 20930 }, { "epoch": 25.259505129752565, "grad_norm": 11.216809272766113, "learning_rate": 1.999949724913504e-05, "loss": 0.2229, "step": 20940 }, { "epoch": 25.271575135787568, "grad_norm": 11.404537200927734, "learning_rate": 1.9999497007880245e-05, "loss": 0.2122, "step": 20950 }, { "epoch": 25.28364514182257, "grad_norm": 11.548205375671387, "learning_rate": 1.999949676662545e-05, "loss": 0.2185, "step": 20960 }, { "epoch": 25.295715147857575, "grad_norm": 11.483918190002441, "learning_rate": 1.9999496525370657e-05, "loss": 0.2209, "step": 20970 }, { "epoch": 25.307785153892578, "grad_norm": 12.425530433654785, "learning_rate": 1.9999496284115863e-05, "loss": 0.2253, "step": 20980 }, { "epoch": 25.31985515992758, "grad_norm": 11.8209228515625, "learning_rate": 1.999949604286107e-05, "loss": 0.2251, "step": 20990 }, { "epoch": 25.331925165962584, "grad_norm": 12.454113006591797, "learning_rate": 1.9999495801606276e-05, "loss": 0.2306, "step": 21000 }, { "epoch": 25.331925165962584, "eval_loss": 10.70341968536377, "eval_runtime": 8.135, "eval_samples_per_second": 85.679, 
"eval_steps_per_second": 10.817, "step": 21000 }, { "epoch": 25.343995171997587, "grad_norm": 13.26988697052002, "learning_rate": 1.9999495560351482e-05, "loss": 0.2307, "step": 21010 }, { "epoch": 25.35606517803259, "grad_norm": 12.554073333740234, "learning_rate": 1.9999495319096688e-05, "loss": 0.2286, "step": 21020 }, { "epoch": 25.368135184067594, "grad_norm": 12.062408447265625, "learning_rate": 1.9999495077841894e-05, "loss": 0.2247, "step": 21030 }, { "epoch": 25.380205190102593, "grad_norm": 12.386098861694336, "learning_rate": 1.99994948365871e-05, "loss": 0.2228, "step": 21040 }, { "epoch": 25.392275196137597, "grad_norm": 12.649396896362305, "learning_rate": 1.9999494595332303e-05, "loss": 0.2332, "step": 21050 }, { "epoch": 25.4043452021726, "grad_norm": 12.75873851776123, "learning_rate": 1.999949435407751e-05, "loss": 0.2312, "step": 21060 }, { "epoch": 25.416415208207603, "grad_norm": 13.261529922485352, "learning_rate": 1.9999494112822716e-05, "loss": 0.2307, "step": 21070 }, { "epoch": 25.428485214242606, "grad_norm": 10.908228874206543, "learning_rate": 1.9999493871567922e-05, "loss": 0.2316, "step": 21080 }, { "epoch": 25.44055522027761, "grad_norm": 12.154084205627441, "learning_rate": 1.9999493630313128e-05, "loss": 0.2319, "step": 21090 }, { "epoch": 25.452625226312612, "grad_norm": 12.063490867614746, "learning_rate": 1.9999493389058334e-05, "loss": 0.2307, "step": 21100 }, { "epoch": 25.464695232347616, "grad_norm": 12.269184112548828, "learning_rate": 1.999949314780354e-05, "loss": 0.2321, "step": 21110 }, { "epoch": 25.47676523838262, "grad_norm": 13.165206909179688, "learning_rate": 1.9999492906548747e-05, "loss": 0.2253, "step": 21120 }, { "epoch": 25.488835244417622, "grad_norm": 11.699442863464355, "learning_rate": 1.9999492665293953e-05, "loss": 0.2358, "step": 21130 }, { "epoch": 25.500905250452625, "grad_norm": 12.320382118225098, "learning_rate": 1.999949242403916e-05, "loss": 0.2359, "step": 21140 }, { "epoch": 25.51297525648763, 
"grad_norm": 12.028386116027832, "learning_rate": 1.9999492182784366e-05, "loss": 0.2378, "step": 21150 }, { "epoch": 25.52504526252263, "grad_norm": 12.95829963684082, "learning_rate": 1.9999491941529572e-05, "loss": 0.2362, "step": 21160 }, { "epoch": 25.537115268557635, "grad_norm": 11.279809951782227, "learning_rate": 1.9999491700274778e-05, "loss": 0.2383, "step": 21170 }, { "epoch": 25.549185274592638, "grad_norm": 11.747113227844238, "learning_rate": 1.9999491459019984e-05, "loss": 0.2358, "step": 21180 }, { "epoch": 25.56125528062764, "grad_norm": 12.857048988342285, "learning_rate": 1.999949121776519e-05, "loss": 0.2389, "step": 21190 }, { "epoch": 25.573325286662644, "grad_norm": 12.870372772216797, "learning_rate": 1.9999490976510397e-05, "loss": 0.2451, "step": 21200 }, { "epoch": 25.585395292697648, "grad_norm": 11.34208869934082, "learning_rate": 1.9999490735255603e-05, "loss": 0.2375, "step": 21210 }, { "epoch": 25.59746529873265, "grad_norm": 12.903071403503418, "learning_rate": 1.999949049400081e-05, "loss": 0.2415, "step": 21220 }, { "epoch": 25.609535304767654, "grad_norm": 12.304178237915039, "learning_rate": 1.9999490252746015e-05, "loss": 0.2427, "step": 21230 }, { "epoch": 25.621605310802657, "grad_norm": 12.541735649108887, "learning_rate": 1.999949001149122e-05, "loss": 0.2439, "step": 21240 }, { "epoch": 25.633675316837657, "grad_norm": 13.021693229675293, "learning_rate": 1.9999489770236428e-05, "loss": 0.2424, "step": 21250 }, { "epoch": 25.64574532287266, "grad_norm": 13.058255195617676, "learning_rate": 1.9999489528981634e-05, "loss": 0.2397, "step": 21260 }, { "epoch": 25.657815328907663, "grad_norm": 13.406496047973633, "learning_rate": 1.999948928772684e-05, "loss": 0.2431, "step": 21270 }, { "epoch": 25.669885334942666, "grad_norm": 12.31867790222168, "learning_rate": 1.9999489046472046e-05, "loss": 0.2415, "step": 21280 }, { "epoch": 25.68195534097767, "grad_norm": 12.266294479370117, "learning_rate": 1.9999488805217253e-05, 
"loss": 0.2404, "step": 21290 }, { "epoch": 25.694025347012673, "grad_norm": 12.813672065734863, "learning_rate": 1.9999488563962455e-05, "loss": 0.2468, "step": 21300 }, { "epoch": 25.706095353047676, "grad_norm": 13.013755798339844, "learning_rate": 1.999948832270766e-05, "loss": 0.2486, "step": 21310 }, { "epoch": 25.71816535908268, "grad_norm": 12.831327438354492, "learning_rate": 1.9999488081452868e-05, "loss": 0.2365, "step": 21320 }, { "epoch": 25.730235365117682, "grad_norm": 11.858291625976562, "learning_rate": 1.9999487840198074e-05, "loss": 0.2475, "step": 21330 }, { "epoch": 25.742305371152685, "grad_norm": 12.631282806396484, "learning_rate": 1.999948759894328e-05, "loss": 0.2477, "step": 21340 }, { "epoch": 25.75437537718769, "grad_norm": 12.364007949829102, "learning_rate": 1.9999487357688486e-05, "loss": 0.2496, "step": 21350 }, { "epoch": 25.766445383222692, "grad_norm": 14.081683158874512, "learning_rate": 1.9999487116433693e-05, "loss": 0.2529, "step": 21360 }, { "epoch": 25.778515389257695, "grad_norm": 12.690943717956543, "learning_rate": 1.99994868751789e-05, "loss": 0.2504, "step": 21370 }, { "epoch": 25.790585395292698, "grad_norm": 13.374493598937988, "learning_rate": 1.9999486633924105e-05, "loss": 0.2495, "step": 21380 }, { "epoch": 25.8026554013277, "grad_norm": 13.013587951660156, "learning_rate": 1.999948639266931e-05, "loss": 0.2531, "step": 21390 }, { "epoch": 25.814725407362705, "grad_norm": 13.53576374053955, "learning_rate": 1.9999486151414518e-05, "loss": 0.247, "step": 21400 }, { "epoch": 25.826795413397708, "grad_norm": 12.62624740600586, "learning_rate": 1.9999485910159724e-05, "loss": 0.2524, "step": 21410 }, { "epoch": 25.83886541943271, "grad_norm": 12.947402954101562, "learning_rate": 1.999948566890493e-05, "loss": 0.2545, "step": 21420 }, { "epoch": 25.850935425467714, "grad_norm": 12.489056587219238, "learning_rate": 1.9999485427650136e-05, "loss": 0.2527, "step": 21430 }, { "epoch": 25.863005431502717, "grad_norm": 
12.586615562438965, "learning_rate": 1.9999485186395346e-05, "loss": 0.2521, "step": 21440 }, { "epoch": 25.875075437537717, "grad_norm": 13.126222610473633, "learning_rate": 1.9999484945140552e-05, "loss": 0.2595, "step": 21450 }, { "epoch": 25.88714544357272, "grad_norm": 13.1371488571167, "learning_rate": 1.9999484703885755e-05, "loss": 0.252, "step": 21460 }, { "epoch": 25.899215449607723, "grad_norm": 12.115607261657715, "learning_rate": 1.999948446263096e-05, "loss": 0.2604, "step": 21470 }, { "epoch": 25.911285455642727, "grad_norm": 12.734490394592285, "learning_rate": 1.9999484221376167e-05, "loss": 0.2471, "step": 21480 }, { "epoch": 25.92335546167773, "grad_norm": 13.26484203338623, "learning_rate": 1.9999483980121373e-05, "loss": 0.2623, "step": 21490 }, { "epoch": 25.935425467712733, "grad_norm": 13.264881134033203, "learning_rate": 1.999948373886658e-05, "loss": 0.2568, "step": 21500 }, { "epoch": 25.935425467712733, "eval_loss": 10.73649787902832, "eval_runtime": 8.1332, "eval_samples_per_second": 85.698, "eval_steps_per_second": 10.82, "step": 21500 }, { "epoch": 25.947495473747736, "grad_norm": 12.561868667602539, "learning_rate": 1.9999483497611786e-05, "loss": 0.2616, "step": 21510 }, { "epoch": 25.95956547978274, "grad_norm": 12.79563045501709, "learning_rate": 1.9999483256356992e-05, "loss": 0.2565, "step": 21520 }, { "epoch": 25.971635485817743, "grad_norm": 12.619697570800781, "learning_rate": 1.99994830151022e-05, "loss": 0.2526, "step": 21530 }, { "epoch": 25.983705491852746, "grad_norm": 13.112431526184082, "learning_rate": 1.9999482773847405e-05, "loss": 0.263, "step": 21540 }, { "epoch": 25.99577549788775, "grad_norm": 13.619861602783203, "learning_rate": 1.999948253259261e-05, "loss": 0.2583, "step": 21550 }, { "epoch": 26.007242003621002, "grad_norm": 9.910261154174805, "learning_rate": 1.9999482291337817e-05, "loss": 0.2071, "step": 21560 }, { "epoch": 26.019312009656005, "grad_norm": 10.227588653564453, "learning_rate": 
1.9999482050083023e-05, "loss": 0.1767, "step": 21570 }, { "epoch": 26.03138201569101, "grad_norm": 12.102587699890137, "learning_rate": 1.999948180882823e-05, "loss": 0.1811, "step": 21580 }, { "epoch": 26.04345202172601, "grad_norm": 10.865530967712402, "learning_rate": 1.9999481567573436e-05, "loss": 0.1812, "step": 21590 }, { "epoch": 26.055522027761015, "grad_norm": 10.71815013885498, "learning_rate": 1.9999481326318642e-05, "loss": 0.1904, "step": 21600 }, { "epoch": 26.067592033796018, "grad_norm": 10.647429466247559, "learning_rate": 1.9999481085063848e-05, "loss": 0.1908, "step": 21610 }, { "epoch": 26.07966203983102, "grad_norm": 10.489212989807129, "learning_rate": 1.9999480843809054e-05, "loss": 0.1894, "step": 21620 }, { "epoch": 26.091732045866024, "grad_norm": 10.646719932556152, "learning_rate": 1.999948060255426e-05, "loss": 0.1916, "step": 21630 }, { "epoch": 26.103802051901027, "grad_norm": 10.136890411376953, "learning_rate": 1.9999480361299467e-05, "loss": 0.1932, "step": 21640 }, { "epoch": 26.11587205793603, "grad_norm": 11.7883882522583, "learning_rate": 1.9999480120044673e-05, "loss": 0.1924, "step": 21650 }, { "epoch": 26.12794206397103, "grad_norm": 11.331929206848145, "learning_rate": 1.999947987878988e-05, "loss": 0.1972, "step": 21660 }, { "epoch": 26.140012070006033, "grad_norm": 10.678112030029297, "learning_rate": 1.9999479637535085e-05, "loss": 0.1964, "step": 21670 }, { "epoch": 26.152082076041037, "grad_norm": 11.089125633239746, "learning_rate": 1.999947939628029e-05, "loss": 0.1945, "step": 21680 }, { "epoch": 26.16415208207604, "grad_norm": 11.8392915725708, "learning_rate": 1.9999479155025498e-05, "loss": 0.1945, "step": 21690 }, { "epoch": 26.176222088111043, "grad_norm": 11.208880424499512, "learning_rate": 1.9999478913770704e-05, "loss": 0.2035, "step": 21700 }, { "epoch": 26.188292094146046, "grad_norm": 11.805027961730957, "learning_rate": 1.9999478672515907e-05, "loss": 0.2023, "step": 21710 }, { "epoch": 
26.20036210018105, "grad_norm": 11.455142974853516, "learning_rate": 1.9999478431261113e-05, "loss": 0.2017, "step": 21720 }, { "epoch": 26.212432106216053, "grad_norm": 11.114120483398438, "learning_rate": 1.999947819000632e-05, "loss": 0.2004, "step": 21730 }, { "epoch": 26.224502112251056, "grad_norm": 11.123705863952637, "learning_rate": 1.9999477948751525e-05, "loss": 0.2006, "step": 21740 }, { "epoch": 26.23657211828606, "grad_norm": 10.587043762207031, "learning_rate": 1.9999477707496732e-05, "loss": 0.2038, "step": 21750 }, { "epoch": 26.248642124321062, "grad_norm": 10.866671562194824, "learning_rate": 1.9999477466241938e-05, "loss": 0.2042, "step": 21760 }, { "epoch": 26.260712130356065, "grad_norm": 10.727182388305664, "learning_rate": 1.9999477224987144e-05, "loss": 0.2034, "step": 21770 }, { "epoch": 26.27278213639107, "grad_norm": 11.520475387573242, "learning_rate": 1.999947698373235e-05, "loss": 0.2039, "step": 21780 }, { "epoch": 26.28485214242607, "grad_norm": 11.520177841186523, "learning_rate": 1.9999476742477557e-05, "loss": 0.2109, "step": 21790 }, { "epoch": 26.296922148461075, "grad_norm": 11.471441268920898, "learning_rate": 1.9999476501222763e-05, "loss": 0.2068, "step": 21800 }, { "epoch": 26.308992154496078, "grad_norm": 11.291136741638184, "learning_rate": 1.999947625996797e-05, "loss": 0.2072, "step": 21810 }, { "epoch": 26.32106216053108, "grad_norm": 11.868663787841797, "learning_rate": 1.9999476018713175e-05, "loss": 0.2111, "step": 21820 }, { "epoch": 26.333132166566084, "grad_norm": 11.705162048339844, "learning_rate": 1.999947577745838e-05, "loss": 0.2127, "step": 21830 }, { "epoch": 26.345202172601088, "grad_norm": 12.715254783630371, "learning_rate": 1.9999475536203588e-05, "loss": 0.2104, "step": 21840 }, { "epoch": 26.35727217863609, "grad_norm": 12.306236267089844, "learning_rate": 1.9999475294948794e-05, "loss": 0.2183, "step": 21850 }, { "epoch": 26.369342184671094, "grad_norm": 11.838712692260742, "learning_rate": 
1.9999475053694e-05, "loss": 0.2121, "step": 21860 }, { "epoch": 26.381412190706094, "grad_norm": 11.407219886779785, "learning_rate": 1.9999474812439206e-05, "loss": 0.2107, "step": 21870 }, { "epoch": 26.393482196741097, "grad_norm": 11.298849105834961, "learning_rate": 1.9999474571184413e-05, "loss": 0.2156, "step": 21880 }, { "epoch": 26.4055522027761, "grad_norm": 11.87114429473877, "learning_rate": 1.999947432992962e-05, "loss": 0.2117, "step": 21890 }, { "epoch": 26.417622208811103, "grad_norm": 11.812271118164062, "learning_rate": 1.9999474088674825e-05, "loss": 0.2164, "step": 21900 }, { "epoch": 26.429692214846106, "grad_norm": 11.709311485290527, "learning_rate": 1.999947384742003e-05, "loss": 0.216, "step": 21910 }, { "epoch": 26.44176222088111, "grad_norm": 11.62240219116211, "learning_rate": 1.9999473606165237e-05, "loss": 0.2153, "step": 21920 }, { "epoch": 26.453832226916113, "grad_norm": 12.524971008300781, "learning_rate": 1.9999473364910444e-05, "loss": 0.2183, "step": 21930 }, { "epoch": 26.465902232951116, "grad_norm": 10.64269733428955, "learning_rate": 1.999947312365565e-05, "loss": 0.22, "step": 21940 }, { "epoch": 26.47797223898612, "grad_norm": 11.939125061035156, "learning_rate": 1.9999472882400856e-05, "loss": 0.226, "step": 21950 }, { "epoch": 26.490042245021122, "grad_norm": 11.497417449951172, "learning_rate": 1.999947264114606e-05, "loss": 0.2185, "step": 21960 }, { "epoch": 26.502112251056126, "grad_norm": 11.135834693908691, "learning_rate": 1.9999472399891265e-05, "loss": 0.2258, "step": 21970 }, { "epoch": 26.51418225709113, "grad_norm": 12.532482147216797, "learning_rate": 1.9999472158636475e-05, "loss": 0.2198, "step": 21980 }, { "epoch": 26.526252263126132, "grad_norm": 12.333076477050781, "learning_rate": 1.999947191738168e-05, "loss": 0.2206, "step": 21990 }, { "epoch": 26.538322269161135, "grad_norm": 12.091165542602539, "learning_rate": 1.9999471676126887e-05, "loss": 0.2181, "step": 22000 }, { "epoch": 26.538322269161135, 
"eval_loss": 10.798569679260254, "eval_runtime": 8.1424, "eval_samples_per_second": 85.601, "eval_steps_per_second": 10.808, "step": 22000 }, { "epoch": 26.55039227519614, "grad_norm": 11.355403900146484, "learning_rate": 1.9999471434872093e-05, "loss": 0.2145, "step": 22010 }, { "epoch": 26.56246228123114, "grad_norm": 12.739715576171875, "learning_rate": 1.99994711936173e-05, "loss": 0.22, "step": 22020 }, { "epoch": 26.574532287266145, "grad_norm": 11.85799789428711, "learning_rate": 1.9999470952362506e-05, "loss": 0.226, "step": 22030 }, { "epoch": 26.586602293301148, "grad_norm": 12.680950164794922, "learning_rate": 1.9999470711107712e-05, "loss": 0.2232, "step": 22040 }, { "epoch": 26.59867229933615, "grad_norm": 11.460783004760742, "learning_rate": 1.9999470469852918e-05, "loss": 0.2264, "step": 22050 }, { "epoch": 26.610742305371154, "grad_norm": 11.97572135925293, "learning_rate": 1.9999470228598124e-05, "loss": 0.2188, "step": 22060 }, { "epoch": 26.622812311406157, "grad_norm": 12.317198753356934, "learning_rate": 1.999946998734333e-05, "loss": 0.2289, "step": 22070 }, { "epoch": 26.634882317441157, "grad_norm": 11.5936279296875, "learning_rate": 1.9999469746088537e-05, "loss": 0.2242, "step": 22080 }, { "epoch": 26.64695232347616, "grad_norm": 12.906058311462402, "learning_rate": 1.9999469504833743e-05, "loss": 0.2313, "step": 22090 }, { "epoch": 26.659022329511163, "grad_norm": 12.22015380859375, "learning_rate": 1.999946926357895e-05, "loss": 0.2314, "step": 22100 }, { "epoch": 26.671092335546167, "grad_norm": 13.087827682495117, "learning_rate": 1.9999469022324155e-05, "loss": 0.2326, "step": 22110 }, { "epoch": 26.68316234158117, "grad_norm": 12.30292797088623, "learning_rate": 1.9999468781069358e-05, "loss": 0.227, "step": 22120 }, { "epoch": 26.695232347616173, "grad_norm": 12.204242706298828, "learning_rate": 1.9999468539814565e-05, "loss": 0.2321, "step": 22130 }, { "epoch": 26.707302353651176, "grad_norm": 12.287890434265137, "learning_rate": 
1.999946829855977e-05, "loss": 0.2228, "step": 22140 }, { "epoch": 26.71937235968618, "grad_norm": 13.92843246459961, "learning_rate": 1.9999468057304977e-05, "loss": 0.2236, "step": 22150 }, { "epoch": 26.731442365721183, "grad_norm": 12.438054084777832, "learning_rate": 1.9999467816050183e-05, "loss": 0.2272, "step": 22160 }, { "epoch": 26.743512371756186, "grad_norm": 13.12863540649414, "learning_rate": 1.999946757479539e-05, "loss": 0.224, "step": 22170 }, { "epoch": 26.75558237779119, "grad_norm": 12.139604568481445, "learning_rate": 1.9999467333540596e-05, "loss": 0.2289, "step": 22180 }, { "epoch": 26.767652383826192, "grad_norm": 12.336463928222656, "learning_rate": 1.9999467092285802e-05, "loss": 0.2337, "step": 22190 }, { "epoch": 26.779722389861195, "grad_norm": 12.432232856750488, "learning_rate": 1.9999466851031008e-05, "loss": 0.2331, "step": 22200 }, { "epoch": 26.7917923958962, "grad_norm": 12.402278900146484, "learning_rate": 1.9999466609776214e-05, "loss": 0.2328, "step": 22210 }, { "epoch": 26.8038624019312, "grad_norm": 12.383450508117676, "learning_rate": 1.999946636852142e-05, "loss": 0.2315, "step": 22220 }, { "epoch": 26.815932407966205, "grad_norm": 13.879523277282715, "learning_rate": 1.9999466127266627e-05, "loss": 0.2314, "step": 22230 }, { "epoch": 26.828002414001208, "grad_norm": 13.755219459533691, "learning_rate": 1.9999465886011833e-05, "loss": 0.2352, "step": 22240 }, { "epoch": 26.84007242003621, "grad_norm": 12.921009063720703, "learning_rate": 1.999946564475704e-05, "loss": 0.236, "step": 22250 }, { "epoch": 26.852142426071214, "grad_norm": 12.903682708740234, "learning_rate": 1.9999465403502245e-05, "loss": 0.2429, "step": 22260 }, { "epoch": 26.864212432106218, "grad_norm": 13.565681457519531, "learning_rate": 1.999946516224745e-05, "loss": 0.2307, "step": 22270 }, { "epoch": 26.876282438141217, "grad_norm": 12.637162208557129, "learning_rate": 1.9999464920992658e-05, "loss": 0.2373, "step": 22280 }, { "epoch": 
26.88835244417622, "grad_norm": 11.219658851623535, "learning_rate": 1.9999464679737864e-05, "loss": 0.2322, "step": 22290 }, { "epoch": 26.900422450211224, "grad_norm": 12.915096282958984, "learning_rate": 1.999946443848307e-05, "loss": 0.2378, "step": 22300 }, { "epoch": 26.912492456246227, "grad_norm": 12.23090934753418, "learning_rate": 1.9999464197228276e-05, "loss": 0.237, "step": 22310 }, { "epoch": 26.92456246228123, "grad_norm": 12.162592887878418, "learning_rate": 1.9999463955973483e-05, "loss": 0.2266, "step": 22320 }, { "epoch": 26.936632468316233, "grad_norm": 12.417464256286621, "learning_rate": 1.999946371471869e-05, "loss": 0.2341, "step": 22330 }, { "epoch": 26.948702474351236, "grad_norm": 12.384352684020996, "learning_rate": 1.9999463473463895e-05, "loss": 0.2407, "step": 22340 }, { "epoch": 26.96077248038624, "grad_norm": 13.19217300415039, "learning_rate": 1.99994632322091e-05, "loss": 0.2401, "step": 22350 }, { "epoch": 26.972842486421243, "grad_norm": 13.211677551269531, "learning_rate": 1.9999462990954307e-05, "loss": 0.2457, "step": 22360 }, { "epoch": 26.984912492456246, "grad_norm": 11.75502872467041, "learning_rate": 1.999946274969951e-05, "loss": 0.2429, "step": 22370 }, { "epoch": 26.99698249849125, "grad_norm": 12.42489242553711, "learning_rate": 1.9999462508444717e-05, "loss": 0.2463, "step": 22380 }, { "epoch": 27.008449004224502, "grad_norm": 10.925806999206543, "learning_rate": 1.9999462267189923e-05, "loss": 0.1859, "step": 22390 }, { "epoch": 27.020519010259505, "grad_norm": 10.438081741333008, "learning_rate": 1.999946202593513e-05, "loss": 0.1632, "step": 22400 }, { "epoch": 27.03258901629451, "grad_norm": 10.832222938537598, "learning_rate": 1.9999461784680335e-05, "loss": 0.1695, "step": 22410 }, { "epoch": 27.044659022329512, "grad_norm": 10.835794448852539, "learning_rate": 1.999946154342554e-05, "loss": 0.1702, "step": 22420 }, { "epoch": 27.056729028364515, "grad_norm": 10.652441024780273, "learning_rate": 
1.9999461302170748e-05, "loss": 0.1749, "step": 22430 }, { "epoch": 27.068799034399518, "grad_norm": 10.317359924316406, "learning_rate": 1.9999461060915954e-05, "loss": 0.1707, "step": 22440 }, { "epoch": 27.08086904043452, "grad_norm": 10.759320259094238, "learning_rate": 1.999946081966116e-05, "loss": 0.1778, "step": 22450 }, { "epoch": 27.092939046469525, "grad_norm": 10.883888244628906, "learning_rate": 1.9999460578406366e-05, "loss": 0.1793, "step": 22460 }, { "epoch": 27.105009052504528, "grad_norm": 10.298149108886719, "learning_rate": 1.9999460337151572e-05, "loss": 0.1776, "step": 22470 }, { "epoch": 27.11707905853953, "grad_norm": 11.353090286254883, "learning_rate": 1.999946009589678e-05, "loss": 0.1777, "step": 22480 }, { "epoch": 27.12914906457453, "grad_norm": 10.713643074035645, "learning_rate": 1.9999459854641985e-05, "loss": 0.1754, "step": 22490 }, { "epoch": 27.141219070609534, "grad_norm": 9.760894775390625, "learning_rate": 1.999945961338719e-05, "loss": 0.1799, "step": 22500 }, { "epoch": 27.141219070609534, "eval_loss": 10.84368896484375, "eval_runtime": 8.1405, "eval_samples_per_second": 85.621, "eval_steps_per_second": 10.81, "step": 22500 }, { "epoch": 27.153289076644537, "grad_norm": 11.546426773071289, "learning_rate": 1.9999459372132397e-05, "loss": 0.1857, "step": 22510 }, { "epoch": 27.16535908267954, "grad_norm": 10.860307693481445, "learning_rate": 1.9999459130877607e-05, "loss": 0.1792, "step": 22520 }, { "epoch": 27.177429088714543, "grad_norm": 9.958005905151367, "learning_rate": 1.9999458889622813e-05, "loss": 0.1841, "step": 22530 }, { "epoch": 27.189499094749547, "grad_norm": 12.11418342590332, "learning_rate": 1.9999458648368016e-05, "loss": 0.1873, "step": 22540 }, { "epoch": 27.20156910078455, "grad_norm": 10.441254615783691, "learning_rate": 1.9999458407113222e-05, "loss": 0.1875, "step": 22550 }, { "epoch": 27.213639106819553, "grad_norm": 10.98290729522705, "learning_rate": 1.999945816585843e-05, "loss": 0.1905, "step": 
22560 }, { "epoch": 27.225709112854556, "grad_norm": 11.014912605285645, "learning_rate": 1.9999457924603635e-05, "loss": 0.1934, "step": 22570 }, { "epoch": 27.23777911888956, "grad_norm": 10.924641609191895, "learning_rate": 1.999945768334884e-05, "loss": 0.1885, "step": 22580 }, { "epoch": 27.249849124924562, "grad_norm": 10.475966453552246, "learning_rate": 1.9999457442094047e-05, "loss": 0.1884, "step": 22590 }, { "epoch": 27.261919130959566, "grad_norm": 12.580035209655762, "learning_rate": 1.9999457200839253e-05, "loss": 0.1938, "step": 22600 }, { "epoch": 27.27398913699457, "grad_norm": 11.803628921508789, "learning_rate": 1.999945695958446e-05, "loss": 0.196, "step": 22610 }, { "epoch": 27.286059143029572, "grad_norm": 10.641236305236816, "learning_rate": 1.9999456718329666e-05, "loss": 0.195, "step": 22620 }, { "epoch": 27.298129149064575, "grad_norm": 11.592552185058594, "learning_rate": 1.9999456477074872e-05, "loss": 0.1878, "step": 22630 }, { "epoch": 27.31019915509958, "grad_norm": 11.040590286254883, "learning_rate": 1.9999456235820078e-05, "loss": 0.1939, "step": 22640 }, { "epoch": 27.32226916113458, "grad_norm": 11.91409969329834, "learning_rate": 1.9999455994565284e-05, "loss": 0.1958, "step": 22650 }, { "epoch": 27.334339167169585, "grad_norm": 11.666473388671875, "learning_rate": 1.999945575331049e-05, "loss": 0.1985, "step": 22660 }, { "epoch": 27.346409173204588, "grad_norm": 10.803607940673828, "learning_rate": 1.9999455512055697e-05, "loss": 0.1972, "step": 22670 }, { "epoch": 27.35847917923959, "grad_norm": 11.931965827941895, "learning_rate": 1.9999455270800903e-05, "loss": 0.1928, "step": 22680 }, { "epoch": 27.370549185274594, "grad_norm": 11.987936973571777, "learning_rate": 1.999945502954611e-05, "loss": 0.1988, "step": 22690 }, { "epoch": 27.382619191309594, "grad_norm": 11.012763977050781, "learning_rate": 1.9999454788291315e-05, "loss": 0.1975, "step": 22700 }, { "epoch": 27.394689197344597, "grad_norm": 11.239973068237305, 
"learning_rate": 1.999945454703652e-05, "loss": 0.1922, "step": 22710 }, { "epoch": 27.4067592033796, "grad_norm": 11.550983428955078, "learning_rate": 1.9999454305781728e-05, "loss": 0.2004, "step": 22720 }, { "epoch": 27.418829209414604, "grad_norm": 10.511107444763184, "learning_rate": 1.9999454064526934e-05, "loss": 0.2003, "step": 22730 }, { "epoch": 27.430899215449607, "grad_norm": 11.62814712524414, "learning_rate": 1.999945382327214e-05, "loss": 0.2013, "step": 22740 }, { "epoch": 27.44296922148461, "grad_norm": 11.703084945678711, "learning_rate": 1.9999453582017346e-05, "loss": 0.2022, "step": 22750 }, { "epoch": 27.455039227519613, "grad_norm": 11.102737426757812, "learning_rate": 1.9999453340762553e-05, "loss": 0.1998, "step": 22760 }, { "epoch": 27.467109233554616, "grad_norm": 11.393044471740723, "learning_rate": 1.999945309950776e-05, "loss": 0.1995, "step": 22770 }, { "epoch": 27.47917923958962, "grad_norm": 12.071272850036621, "learning_rate": 1.9999452858252965e-05, "loss": 0.2059, "step": 22780 }, { "epoch": 27.491249245624623, "grad_norm": 11.823110580444336, "learning_rate": 1.9999452616998168e-05, "loss": 0.2049, "step": 22790 }, { "epoch": 27.503319251659626, "grad_norm": 11.56924819946289, "learning_rate": 1.9999452375743374e-05, "loss": 0.2045, "step": 22800 }, { "epoch": 27.51538925769463, "grad_norm": 12.096574783325195, "learning_rate": 1.999945213448858e-05, "loss": 0.2005, "step": 22810 }, { "epoch": 27.527459263729632, "grad_norm": 11.450089454650879, "learning_rate": 1.9999451893233787e-05, "loss": 0.2079, "step": 22820 }, { "epoch": 27.539529269764635, "grad_norm": 12.069808006286621, "learning_rate": 1.9999451651978993e-05, "loss": 0.2097, "step": 22830 }, { "epoch": 27.55159927579964, "grad_norm": 11.403156280517578, "learning_rate": 1.99994514107242e-05, "loss": 0.203, "step": 22840 }, { "epoch": 27.563669281834642, "grad_norm": 11.427160263061523, "learning_rate": 1.9999451169469405e-05, "loss": 0.2056, "step": 22850 }, { 
"epoch": 27.575739287869645, "grad_norm": 11.16104793548584, "learning_rate": 1.999945092821461e-05, "loss": 0.2115, "step": 22860 }, { "epoch": 27.587809293904648, "grad_norm": 12.215459823608398, "learning_rate": 1.9999450686959818e-05, "loss": 0.2137, "step": 22870 }, { "epoch": 27.59987929993965, "grad_norm": 12.263824462890625, "learning_rate": 1.9999450445705024e-05, "loss": 0.216, "step": 22880 }, { "epoch": 27.611949305974655, "grad_norm": 13.03971004486084, "learning_rate": 1.999945020445023e-05, "loss": 0.2141, "step": 22890 }, { "epoch": 27.624019312009658, "grad_norm": 11.145282745361328, "learning_rate": 1.9999449963195436e-05, "loss": 0.2085, "step": 22900 }, { "epoch": 27.636089318044657, "grad_norm": 11.108855247497559, "learning_rate": 1.9999449721940643e-05, "loss": 0.2109, "step": 22910 }, { "epoch": 27.64815932407966, "grad_norm": 11.742668151855469, "learning_rate": 1.999944948068585e-05, "loss": 0.2105, "step": 22920 }, { "epoch": 27.660229330114664, "grad_norm": 12.694267272949219, "learning_rate": 1.9999449239431055e-05, "loss": 0.2133, "step": 22930 }, { "epoch": 27.672299336149667, "grad_norm": 12.04580307006836, "learning_rate": 1.999944899817626e-05, "loss": 0.215, "step": 22940 }, { "epoch": 27.68436934218467, "grad_norm": 11.860087394714355, "learning_rate": 1.9999448756921467e-05, "loss": 0.2214, "step": 22950 }, { "epoch": 27.696439348219673, "grad_norm": 13.478748321533203, "learning_rate": 1.9999448515666674e-05, "loss": 0.2089, "step": 22960 }, { "epoch": 27.708509354254677, "grad_norm": 11.717103004455566, "learning_rate": 1.999944827441188e-05, "loss": 0.2104, "step": 22970 }, { "epoch": 27.72057936028968, "grad_norm": 12.653969764709473, "learning_rate": 1.9999448033157086e-05, "loss": 0.2145, "step": 22980 }, { "epoch": 27.732649366324683, "grad_norm": 11.927763938903809, "learning_rate": 1.9999447791902292e-05, "loss": 0.2143, "step": 22990 }, { "epoch": 27.744719372359686, "grad_norm": 12.024202346801758, "learning_rate": 
1.99994475506475e-05, "loss": 0.2169, "step": 23000 }, { "epoch": 27.744719372359686, "eval_loss": 10.906641960144043, "eval_runtime": 8.135, "eval_samples_per_second": 85.679, "eval_steps_per_second": 10.817, "step": 23000 }, { "epoch": 27.75678937839469, "grad_norm": 11.818063735961914, "learning_rate": 1.9999447309392705e-05, "loss": 0.2113, "step": 23010 }, { "epoch": 27.768859384429692, "grad_norm": 11.939412117004395, "learning_rate": 1.999944706813791e-05, "loss": 0.2203, "step": 23020 }, { "epoch": 27.780929390464696, "grad_norm": 11.882866859436035, "learning_rate": 1.9999446826883117e-05, "loss": 0.2176, "step": 23030 }, { "epoch": 27.7929993964997, "grad_norm": 11.854211807250977, "learning_rate": 1.999944658562832e-05, "loss": 0.2162, "step": 23040 }, { "epoch": 27.805069402534702, "grad_norm": 12.08410930633545, "learning_rate": 1.9999446344373526e-05, "loss": 0.212, "step": 23050 }, { "epoch": 27.817139408569705, "grad_norm": 11.821004867553711, "learning_rate": 1.9999446103118736e-05, "loss": 0.2198, "step": 23060 }, { "epoch": 27.82920941460471, "grad_norm": 13.322479248046875, "learning_rate": 1.9999445861863942e-05, "loss": 0.2171, "step": 23070 }, { "epoch": 27.84127942063971, "grad_norm": 13.09875202178955, "learning_rate": 1.9999445620609148e-05, "loss": 0.2215, "step": 23080 }, { "epoch": 27.853349426674715, "grad_norm": 12.955581665039062, "learning_rate": 1.9999445379354354e-05, "loss": 0.2221, "step": 23090 }, { "epoch": 27.865419432709718, "grad_norm": 12.133014678955078, "learning_rate": 1.999944513809956e-05, "loss": 0.2233, "step": 23100 }, { "epoch": 27.877489438744718, "grad_norm": 10.88289737701416, "learning_rate": 1.9999444896844767e-05, "loss": 0.2224, "step": 23110 }, { "epoch": 27.88955944477972, "grad_norm": 12.310105323791504, "learning_rate": 1.9999444655589973e-05, "loss": 0.2189, "step": 23120 }, { "epoch": 27.901629450814724, "grad_norm": 11.615823745727539, "learning_rate": 1.999944441433518e-05, "loss": 0.2301, "step": 
23130 }, { "epoch": 27.913699456849727, "grad_norm": 11.423310279846191, "learning_rate": 1.9999444173080385e-05, "loss": 0.2181, "step": 23140 }, { "epoch": 27.92576946288473, "grad_norm": 12.007137298583984, "learning_rate": 1.9999443931825592e-05, "loss": 0.2191, "step": 23150 }, { "epoch": 27.937839468919734, "grad_norm": 11.957732200622559, "learning_rate": 1.9999443690570798e-05, "loss": 0.2203, "step": 23160 }, { "epoch": 27.949909474954737, "grad_norm": 12.2783842086792, "learning_rate": 1.9999443449316004e-05, "loss": 0.2243, "step": 23170 }, { "epoch": 27.96197948098974, "grad_norm": 11.609381675720215, "learning_rate": 1.999944320806121e-05, "loss": 0.2241, "step": 23180 }, { "epoch": 27.974049487024743, "grad_norm": 11.96996021270752, "learning_rate": 1.9999442966806417e-05, "loss": 0.2222, "step": 23190 }, { "epoch": 27.986119493059746, "grad_norm": 12.320900917053223, "learning_rate": 1.999944272555162e-05, "loss": 0.2283, "step": 23200 }, { "epoch": 27.99818949909475, "grad_norm": 12.64803409576416, "learning_rate": 1.9999442484296826e-05, "loss": 0.2227, "step": 23210 }, { "epoch": 28.009656004828003, "grad_norm": 10.020003318786621, "learning_rate": 1.9999442243042032e-05, "loss": 0.1654, "step": 23220 }, { "epoch": 28.021726010863006, "grad_norm": 9.956634521484375, "learning_rate": 1.9999442001787238e-05, "loss": 0.1535, "step": 23230 }, { "epoch": 28.03379601689801, "grad_norm": 10.451099395751953, "learning_rate": 1.9999441760532444e-05, "loss": 0.1589, "step": 23240 }, { "epoch": 28.045866022933012, "grad_norm": 9.36735725402832, "learning_rate": 1.999944151927765e-05, "loss": 0.1591, "step": 23250 }, { "epoch": 28.057936028968015, "grad_norm": 9.795157432556152, "learning_rate": 1.9999441278022857e-05, "loss": 0.1598, "step": 23260 }, { "epoch": 28.07000603500302, "grad_norm": 9.721526145935059, "learning_rate": 1.9999441036768063e-05, "loss": 0.1639, "step": 23270 }, { "epoch": 28.08207604103802, "grad_norm": 10.672928810119629, 
"learning_rate": 1.999944079551327e-05, "loss": 0.1658, "step": 23280 }, { "epoch": 28.094146047073025, "grad_norm": 9.864944458007812, "learning_rate": 1.9999440554258475e-05, "loss": 0.1648, "step": 23290 }, { "epoch": 28.106216053108028, "grad_norm": 10.254487991333008, "learning_rate": 1.999944031300368e-05, "loss": 0.1665, "step": 23300 }, { "epoch": 28.11828605914303, "grad_norm": 11.159327507019043, "learning_rate": 1.9999440071748888e-05, "loss": 0.1678, "step": 23310 }, { "epoch": 28.13035606517803, "grad_norm": 9.979256629943848, "learning_rate": 1.9999439830494094e-05, "loss": 0.1714, "step": 23320 }, { "epoch": 28.142426071213034, "grad_norm": 10.815245628356934, "learning_rate": 1.99994395892393e-05, "loss": 0.1695, "step": 23330 }, { "epoch": 28.154496077248037, "grad_norm": 10.931547164916992, "learning_rate": 1.9999439347984506e-05, "loss": 0.1735, "step": 23340 }, { "epoch": 28.16656608328304, "grad_norm": 10.429627418518066, "learning_rate": 1.9999439106729713e-05, "loss": 0.1797, "step": 23350 }, { "epoch": 28.178636089318044, "grad_norm": 10.502449035644531, "learning_rate": 1.999943886547492e-05, "loss": 0.1731, "step": 23360 }, { "epoch": 28.190706095353047, "grad_norm": 10.457746505737305, "learning_rate": 1.9999438624220125e-05, "loss": 0.1752, "step": 23370 }, { "epoch": 28.20277610138805, "grad_norm": 10.308084487915039, "learning_rate": 1.999943838296533e-05, "loss": 0.1729, "step": 23380 }, { "epoch": 28.214846107423053, "grad_norm": 10.114986419677734, "learning_rate": 1.9999438141710537e-05, "loss": 0.1749, "step": 23390 }, { "epoch": 28.226916113458056, "grad_norm": 11.489965438842773, "learning_rate": 1.9999437900455744e-05, "loss": 0.1758, "step": 23400 }, { "epoch": 28.23898611949306, "grad_norm": 11.15743637084961, "learning_rate": 1.999943765920095e-05, "loss": 0.1798, "step": 23410 }, { "epoch": 28.251056125528063, "grad_norm": 11.367542266845703, "learning_rate": 1.9999437417946156e-05, "loss": 0.1805, "step": 23420 }, { 
"epoch": 28.263126131563066, "grad_norm": 10.423439979553223, "learning_rate": 1.9999437176691362e-05, "loss": 0.1789, "step": 23430 }, { "epoch": 28.27519613759807, "grad_norm": 10.025616645812988, "learning_rate": 1.999943693543657e-05, "loss": 0.1769, "step": 23440 }, { "epoch": 28.287266143633072, "grad_norm": 10.259586334228516, "learning_rate": 1.999943669418177e-05, "loss": 0.177, "step": 23450 }, { "epoch": 28.299336149668076, "grad_norm": 10.79153060913086, "learning_rate": 1.9999436452926978e-05, "loss": 0.1781, "step": 23460 }, { "epoch": 28.31140615570308, "grad_norm": 11.20706558227539, "learning_rate": 1.9999436211672184e-05, "loss": 0.1799, "step": 23470 }, { "epoch": 28.323476161738082, "grad_norm": 10.652043342590332, "learning_rate": 1.999943597041739e-05, "loss": 0.1839, "step": 23480 }, { "epoch": 28.335546167773085, "grad_norm": 9.882771492004395, "learning_rate": 1.9999435729162596e-05, "loss": 0.1823, "step": 23490 }, { "epoch": 28.34761617380809, "grad_norm": 11.237621307373047, "learning_rate": 1.9999435487907802e-05, "loss": 0.1871, "step": 23500 }, { "epoch": 28.34761617380809, "eval_loss": 10.935832977294922, "eval_runtime": 8.1374, "eval_samples_per_second": 85.654, "eval_steps_per_second": 10.814, "step": 23500 }, { "epoch": 28.35968617984309, "grad_norm": 11.29205322265625, "learning_rate": 1.999943524665301e-05, "loss": 0.1853, "step": 23510 }, { "epoch": 28.371756185878095, "grad_norm": 10.050454139709473, "learning_rate": 1.9999435005398215e-05, "loss": 0.1854, "step": 23520 }, { "epoch": 28.383826191913094, "grad_norm": 12.630990028381348, "learning_rate": 1.999943476414342e-05, "loss": 0.1878, "step": 23530 }, { "epoch": 28.395896197948097, "grad_norm": 10.890552520751953, "learning_rate": 1.9999434522888627e-05, "loss": 0.1861, "step": 23540 }, { "epoch": 28.4079662039831, "grad_norm": 11.630450248718262, "learning_rate": 1.9999434281633834e-05, "loss": 0.1919, "step": 23550 }, { "epoch": 28.420036210018104, "grad_norm": 
10.482986450195312, "learning_rate": 1.999943404037904e-05, "loss": 0.1924, "step": 23560 }, { "epoch": 28.432106216053107, "grad_norm": 11.062761306762695, "learning_rate": 1.9999433799124246e-05, "loss": 0.1905, "step": 23570 }, { "epoch": 28.44417622208811, "grad_norm": 11.204758644104004, "learning_rate": 1.9999433557869452e-05, "loss": 0.1897, "step": 23580 }, { "epoch": 28.456246228123113, "grad_norm": 10.9208345413208, "learning_rate": 1.999943331661466e-05, "loss": 0.1874, "step": 23590 }, { "epoch": 28.468316234158117, "grad_norm": 11.508627891540527, "learning_rate": 1.9999433075359868e-05, "loss": 0.1903, "step": 23600 }, { "epoch": 28.48038624019312, "grad_norm": 11.632482528686523, "learning_rate": 1.999943283410507e-05, "loss": 0.1868, "step": 23610 }, { "epoch": 28.492456246228123, "grad_norm": 11.14676570892334, "learning_rate": 1.9999432592850277e-05, "loss": 0.1898, "step": 23620 }, { "epoch": 28.504526252263126, "grad_norm": 11.772725105285645, "learning_rate": 1.9999432351595483e-05, "loss": 0.1935, "step": 23630 }, { "epoch": 28.51659625829813, "grad_norm": 11.242461204528809, "learning_rate": 1.999943211034069e-05, "loss": 0.19, "step": 23640 }, { "epoch": 28.528666264333133, "grad_norm": 11.44407844543457, "learning_rate": 1.9999431869085896e-05, "loss": 0.1901, "step": 23650 }, { "epoch": 28.540736270368136, "grad_norm": 11.546648025512695, "learning_rate": 1.9999431627831102e-05, "loss": 0.1945, "step": 23660 }, { "epoch": 28.55280627640314, "grad_norm": 11.494682312011719, "learning_rate": 1.9999431386576308e-05, "loss": 0.1932, "step": 23670 }, { "epoch": 28.564876282438142, "grad_norm": 11.401710510253906, "learning_rate": 1.9999431145321514e-05, "loss": 0.1928, "step": 23680 }, { "epoch": 28.576946288473145, "grad_norm": 11.534285545349121, "learning_rate": 1.999943090406672e-05, "loss": 0.1974, "step": 23690 }, { "epoch": 28.58901629450815, "grad_norm": 10.951064109802246, "learning_rate": 1.9999430662811927e-05, "loss": 0.1944, 
"step": 23700 }, { "epoch": 28.60108630054315, "grad_norm": 11.957467079162598, "learning_rate": 1.9999430421557133e-05, "loss": 0.1972, "step": 23710 }, { "epoch": 28.613156306578155, "grad_norm": 11.185625076293945, "learning_rate": 1.999943018030234e-05, "loss": 0.196, "step": 23720 }, { "epoch": 28.625226312613158, "grad_norm": 11.054397583007812, "learning_rate": 1.9999429939047545e-05, "loss": 0.1942, "step": 23730 }, { "epoch": 28.637296318648158, "grad_norm": 10.825431823730469, "learning_rate": 1.999942969779275e-05, "loss": 0.1961, "step": 23740 }, { "epoch": 28.64936632468316, "grad_norm": 11.866022109985352, "learning_rate": 1.9999429456537958e-05, "loss": 0.1968, "step": 23750 }, { "epoch": 28.661436330718164, "grad_norm": 12.447956085205078, "learning_rate": 1.9999429215283164e-05, "loss": 0.2001, "step": 23760 }, { "epoch": 28.673506336753167, "grad_norm": 11.653702735900879, "learning_rate": 1.999942897402837e-05, "loss": 0.1978, "step": 23770 }, { "epoch": 28.68557634278817, "grad_norm": 12.184391021728516, "learning_rate": 1.9999428732773576e-05, "loss": 0.1963, "step": 23780 }, { "epoch": 28.697646348823174, "grad_norm": 10.85653018951416, "learning_rate": 1.9999428491518783e-05, "loss": 0.1964, "step": 23790 }, { "epoch": 28.709716354858177, "grad_norm": 10.907907485961914, "learning_rate": 1.999942825026399e-05, "loss": 0.2037, "step": 23800 }, { "epoch": 28.72178636089318, "grad_norm": 12.104214668273926, "learning_rate": 1.9999428009009195e-05, "loss": 0.201, "step": 23810 }, { "epoch": 28.733856366928183, "grad_norm": 11.679441452026367, "learning_rate": 1.99994277677544e-05, "loss": 0.2021, "step": 23820 }, { "epoch": 28.745926372963186, "grad_norm": 11.05268669128418, "learning_rate": 1.9999427526499608e-05, "loss": 0.2008, "step": 23830 }, { "epoch": 28.75799637899819, "grad_norm": 11.793630599975586, "learning_rate": 1.9999427285244814e-05, "loss": 0.2018, "step": 23840 }, { "epoch": 28.770066385033193, "grad_norm": 11.380136489868164, 
"learning_rate": 1.999942704399002e-05, "loss": 0.2067, "step": 23850 }, { "epoch": 28.782136391068196, "grad_norm": 11.926124572753906, "learning_rate": 1.9999426802735223e-05, "loss": 0.2034, "step": 23860 }, { "epoch": 28.7942063971032, "grad_norm": 11.641789436340332, "learning_rate": 1.999942656148043e-05, "loss": 0.2078, "step": 23870 }, { "epoch": 28.806276403138202, "grad_norm": 12.03778076171875, "learning_rate": 1.9999426320225635e-05, "loss": 0.2085, "step": 23880 }, { "epoch": 28.818346409173206, "grad_norm": 11.722253799438477, "learning_rate": 1.999942607897084e-05, "loss": 0.2081, "step": 23890 }, { "epoch": 28.83041641520821, "grad_norm": 13.641056060791016, "learning_rate": 1.9999425837716048e-05, "loss": 0.2085, "step": 23900 }, { "epoch": 28.842486421243212, "grad_norm": 11.776001930236816, "learning_rate": 1.9999425596461254e-05, "loss": 0.2042, "step": 23910 }, { "epoch": 28.854556427278215, "grad_norm": 11.783409118652344, "learning_rate": 1.999942535520646e-05, "loss": 0.2087, "step": 23920 }, { "epoch": 28.86662643331322, "grad_norm": 11.38039493560791, "learning_rate": 1.9999425113951666e-05, "loss": 0.2052, "step": 23930 }, { "epoch": 28.878696439348218, "grad_norm": 11.031106948852539, "learning_rate": 1.9999424872696873e-05, "loss": 0.2055, "step": 23940 }, { "epoch": 28.89076644538322, "grad_norm": 11.766709327697754, "learning_rate": 1.999942463144208e-05, "loss": 0.2086, "step": 23950 }, { "epoch": 28.902836451418224, "grad_norm": 11.953241348266602, "learning_rate": 1.9999424390187285e-05, "loss": 0.2081, "step": 23960 }, { "epoch": 28.914906457453228, "grad_norm": 12.260711669921875, "learning_rate": 1.999942414893249e-05, "loss": 0.2067, "step": 23970 }, { "epoch": 28.92697646348823, "grad_norm": 11.860315322875977, "learning_rate": 1.9999423907677697e-05, "loss": 0.2184, "step": 23980 }, { "epoch": 28.939046469523234, "grad_norm": 11.378873825073242, "learning_rate": 1.9999423666422904e-05, "loss": 0.2145, "step": 23990 }, { 
"epoch": 28.951116475558237, "grad_norm": 11.446940422058105, "learning_rate": 1.999942342516811e-05, "loss": 0.2132, "step": 24000 }, { "epoch": 28.951116475558237, "eval_loss": 10.980401039123535, "eval_runtime": 8.1386, "eval_samples_per_second": 85.642, "eval_steps_per_second": 10.813, "step": 24000 }, { "epoch": 28.96318648159324, "grad_norm": 12.492013931274414, "learning_rate": 1.9999423183913316e-05, "loss": 0.2107, "step": 24010 }, { "epoch": 28.975256487628243, "grad_norm": 11.965349197387695, "learning_rate": 1.9999422942658522e-05, "loss": 0.2181, "step": 24020 }, { "epoch": 28.987326493663247, "grad_norm": 12.11598014831543, "learning_rate": 1.999942270140373e-05, "loss": 0.2093, "step": 24030 }, { "epoch": 28.99939649969825, "grad_norm": 12.006636619567871, "learning_rate": 1.9999422460148935e-05, "loss": 0.2169, "step": 24040 }, { "epoch": 29.010863005431503, "grad_norm": 9.018631935119629, "learning_rate": 1.999942221889414e-05, "loss": 0.1518, "step": 24050 }, { "epoch": 29.022933011466506, "grad_norm": 10.14444637298584, "learning_rate": 1.9999421977639347e-05, "loss": 0.1459, "step": 24060 }, { "epoch": 29.03500301750151, "grad_norm": 9.1492919921875, "learning_rate": 1.9999421736384553e-05, "loss": 0.1513, "step": 24070 }, { "epoch": 29.047073023536512, "grad_norm": 10.287590980529785, "learning_rate": 1.999942149512976e-05, "loss": 0.1534, "step": 24080 }, { "epoch": 29.059143029571516, "grad_norm": 9.035078048706055, "learning_rate": 1.9999421253874966e-05, "loss": 0.1567, "step": 24090 }, { "epoch": 29.07121303560652, "grad_norm": 9.18061351776123, "learning_rate": 1.9999421012620172e-05, "loss": 0.1523, "step": 24100 }, { "epoch": 29.083283041641522, "grad_norm": 10.505483627319336, "learning_rate": 1.9999420771365375e-05, "loss": 0.1528, "step": 24110 }, { "epoch": 29.095353047676525, "grad_norm": 10.854731559753418, "learning_rate": 1.999942053011058e-05, "loss": 0.1561, "step": 24120 }, { "epoch": 29.10742305371153, "grad_norm": 
10.091670989990234, "learning_rate": 1.9999420288855787e-05, "loss": 0.1595, "step": 24130 }, { "epoch": 29.11949305974653, "grad_norm": 9.963739395141602, "learning_rate": 1.9999420047600997e-05, "loss": 0.1566, "step": 24140 }, { "epoch": 29.13156306578153, "grad_norm": 10.145078659057617, "learning_rate": 1.9999419806346203e-05, "loss": 0.1632, "step": 24150 }, { "epoch": 29.143633071816534, "grad_norm": 9.640326499938965, "learning_rate": 1.999941956509141e-05, "loss": 0.1631, "step": 24160 }, { "epoch": 29.155703077851538, "grad_norm": 10.715333938598633, "learning_rate": 1.9999419323836616e-05, "loss": 0.164, "step": 24170 }, { "epoch": 29.16777308388654, "grad_norm": 10.34982681274414, "learning_rate": 1.9999419082581822e-05, "loss": 0.1656, "step": 24180 }, { "epoch": 29.179843089921544, "grad_norm": 10.241085052490234, "learning_rate": 1.9999418841327028e-05, "loss": 0.1712, "step": 24190 }, { "epoch": 29.191913095956547, "grad_norm": 9.869625091552734, "learning_rate": 1.9999418600072234e-05, "loss": 0.1689, "step": 24200 }, { "epoch": 29.20398310199155, "grad_norm": 9.555562973022461, "learning_rate": 1.999941835881744e-05, "loss": 0.1664, "step": 24210 }, { "epoch": 29.216053108026554, "grad_norm": 10.343679428100586, "learning_rate": 1.9999418117562647e-05, "loss": 0.1694, "step": 24220 }, { "epoch": 29.228123114061557, "grad_norm": 10.749216079711914, "learning_rate": 1.9999417876307853e-05, "loss": 0.167, "step": 24230 }, { "epoch": 29.24019312009656, "grad_norm": 10.86333179473877, "learning_rate": 1.999941763505306e-05, "loss": 0.17, "step": 24240 }, { "epoch": 29.252263126131563, "grad_norm": 10.671010971069336, "learning_rate": 1.9999417393798265e-05, "loss": 0.1696, "step": 24250 }, { "epoch": 29.264333132166566, "grad_norm": 10.888466835021973, "learning_rate": 1.999941715254347e-05, "loss": 0.1702, "step": 24260 }, { "epoch": 29.27640313820157, "grad_norm": 10.72258186340332, "learning_rate": 1.9999416911288678e-05, "loss": 0.1675, "step": 
24270 }, { "epoch": 29.288473144236573, "grad_norm": 10.515636444091797, "learning_rate": 1.999941667003388e-05, "loss": 0.1752, "step": 24280 }, { "epoch": 29.300543150271576, "grad_norm": 10.538119316101074, "learning_rate": 1.9999416428779087e-05, "loss": 0.1765, "step": 24290 }, { "epoch": 29.31261315630658, "grad_norm": 10.34943675994873, "learning_rate": 1.9999416187524293e-05, "loss": 0.1738, "step": 24300 }, { "epoch": 29.324683162341582, "grad_norm": 11.256940841674805, "learning_rate": 1.99994159462695e-05, "loss": 0.1756, "step": 24310 }, { "epoch": 29.336753168376585, "grad_norm": 11.45192813873291, "learning_rate": 1.9999415705014705e-05, "loss": 0.1698, "step": 24320 }, { "epoch": 29.34882317441159, "grad_norm": 10.285582542419434, "learning_rate": 1.999941546375991e-05, "loss": 0.1719, "step": 24330 }, { "epoch": 29.360893180446592, "grad_norm": 11.785667419433594, "learning_rate": 1.9999415222505118e-05, "loss": 0.1716, "step": 24340 }, { "epoch": 29.372963186481595, "grad_norm": 11.486804962158203, "learning_rate": 1.9999414981250324e-05, "loss": 0.1719, "step": 24350 }, { "epoch": 29.385033192516595, "grad_norm": 11.712807655334473, "learning_rate": 1.999941473999553e-05, "loss": 0.1782, "step": 24360 }, { "epoch": 29.397103198551598, "grad_norm": 11.032100677490234, "learning_rate": 1.9999414498740736e-05, "loss": 0.1798, "step": 24370 }, { "epoch": 29.4091732045866, "grad_norm": 10.580870628356934, "learning_rate": 1.9999414257485943e-05, "loss": 0.1809, "step": 24380 }, { "epoch": 29.421243210621604, "grad_norm": 11.291820526123047, "learning_rate": 1.999941401623115e-05, "loss": 0.1778, "step": 24390 }, { "epoch": 29.433313216656607, "grad_norm": 10.522809982299805, "learning_rate": 1.9999413774976355e-05, "loss": 0.1783, "step": 24400 }, { "epoch": 29.44538322269161, "grad_norm": 11.072992324829102, "learning_rate": 1.999941353372156e-05, "loss": 0.1823, "step": 24410 }, { "epoch": 29.457453228726614, "grad_norm": 10.825872421264648, 
"learning_rate": 1.9999413292466768e-05, "loss": 0.1845, "step": 24420 }, { "epoch": 29.469523234761617, "grad_norm": 11.032371520996094, "learning_rate": 1.9999413051211974e-05, "loss": 0.1836, "step": 24430 }, { "epoch": 29.48159324079662, "grad_norm": 10.224273681640625, "learning_rate": 1.999941280995718e-05, "loss": 0.178, "step": 24440 }, { "epoch": 29.493663246831623, "grad_norm": 11.186830520629883, "learning_rate": 1.9999412568702386e-05, "loss": 0.1847, "step": 24450 }, { "epoch": 29.505733252866627, "grad_norm": 10.460297584533691, "learning_rate": 1.9999412327447592e-05, "loss": 0.1807, "step": 24460 }, { "epoch": 29.51780325890163, "grad_norm": 11.108163833618164, "learning_rate": 1.99994120861928e-05, "loss": 0.1805, "step": 24470 }, { "epoch": 29.529873264936633, "grad_norm": 11.30748462677002, "learning_rate": 1.9999411844938005e-05, "loss": 0.1889, "step": 24480 }, { "epoch": 29.541943270971636, "grad_norm": 10.44050121307373, "learning_rate": 1.999941160368321e-05, "loss": 0.1851, "step": 24490 }, { "epoch": 29.55401327700664, "grad_norm": 10.783740043640137, "learning_rate": 1.9999411362428417e-05, "loss": 0.1861, "step": 24500 }, { "epoch": 29.55401327700664, "eval_loss": 11.021925926208496, "eval_runtime": 8.1388, "eval_samples_per_second": 85.639, "eval_steps_per_second": 10.812, "step": 24500 }, { "epoch": 29.566083283041642, "grad_norm": 10.426368713378906, "learning_rate": 1.9999411121173623e-05, "loss": 0.1798, "step": 24510 }, { "epoch": 29.578153289076646, "grad_norm": 10.093259811401367, "learning_rate": 1.999941087991883e-05, "loss": 0.1776, "step": 24520 }, { "epoch": 29.59022329511165, "grad_norm": 11.949729919433594, "learning_rate": 1.9999410638664032e-05, "loss": 0.186, "step": 24530 }, { "epoch": 29.602293301146652, "grad_norm": 10.888249397277832, "learning_rate": 1.999941039740924e-05, "loss": 0.1845, "step": 24540 }, { "epoch": 29.614363307181655, "grad_norm": 11.534008979797363, "learning_rate": 1.9999410156154445e-05, 
"loss": 0.1861, "step": 24550 }, { "epoch": 29.62643331321666, "grad_norm": 11.45606803894043, "learning_rate": 1.999940991489965e-05, "loss": 0.1839, "step": 24560 }, { "epoch": 29.638503319251658, "grad_norm": 11.28010368347168, "learning_rate": 1.9999409673644857e-05, "loss": 0.1834, "step": 24570 }, { "epoch": 29.65057332528666, "grad_norm": 11.616080284118652, "learning_rate": 1.9999409432390064e-05, "loss": 0.1876, "step": 24580 }, { "epoch": 29.662643331321664, "grad_norm": 10.6399564743042, "learning_rate": 1.999940919113527e-05, "loss": 0.1879, "step": 24590 }, { "epoch": 29.674713337356668, "grad_norm": 10.903552055358887, "learning_rate": 1.9999408949880476e-05, "loss": 0.1835, "step": 24600 }, { "epoch": 29.68678334339167, "grad_norm": 10.510683059692383, "learning_rate": 1.9999408708625682e-05, "loss": 0.1858, "step": 24610 }, { "epoch": 29.698853349426674, "grad_norm": 10.858545303344727, "learning_rate": 1.999940846737089e-05, "loss": 0.1893, "step": 24620 }, { "epoch": 29.710923355461677, "grad_norm": 10.919438362121582, "learning_rate": 1.9999408226116095e-05, "loss": 0.188, "step": 24630 }, { "epoch": 29.72299336149668, "grad_norm": 10.941174507141113, "learning_rate": 1.99994079848613e-05, "loss": 0.19, "step": 24640 }, { "epoch": 29.735063367531684, "grad_norm": 11.067841529846191, "learning_rate": 1.9999407743606507e-05, "loss": 0.1914, "step": 24650 }, { "epoch": 29.747133373566687, "grad_norm": 12.060553550720215, "learning_rate": 1.9999407502351713e-05, "loss": 0.1956, "step": 24660 }, { "epoch": 29.75920337960169, "grad_norm": 11.530546188354492, "learning_rate": 1.999940726109692e-05, "loss": 0.1923, "step": 24670 }, { "epoch": 29.771273385636693, "grad_norm": 10.972773551940918, "learning_rate": 1.999940701984213e-05, "loss": 0.1942, "step": 24680 }, { "epoch": 29.783343391671696, "grad_norm": 12.042237281799316, "learning_rate": 1.9999406778587332e-05, "loss": 0.1996, "step": 24690 }, { "epoch": 29.7954133977067, "grad_norm": 
11.40119743347168, "learning_rate": 1.9999406537332538e-05, "loss": 0.1943, "step": 24700 }, { "epoch": 29.807483403741703, "grad_norm": 9.694221496582031, "learning_rate": 1.9999406296077744e-05, "loss": 0.1991, "step": 24710 }, { "epoch": 29.819553409776706, "grad_norm": 11.318115234375, "learning_rate": 1.999940605482295e-05, "loss": 0.1987, "step": 24720 }, { "epoch": 29.83162341581171, "grad_norm": 10.389748573303223, "learning_rate": 1.9999405813568157e-05, "loss": 0.192, "step": 24730 }, { "epoch": 29.843693421846712, "grad_norm": 11.371533393859863, "learning_rate": 1.9999405572313363e-05, "loss": 0.1923, "step": 24740 }, { "epoch": 29.855763427881715, "grad_norm": 10.485976219177246, "learning_rate": 1.999940533105857e-05, "loss": 0.1917, "step": 24750 }, { "epoch": 29.86783343391672, "grad_norm": 12.720316886901855, "learning_rate": 1.9999405089803775e-05, "loss": 0.1971, "step": 24760 }, { "epoch": 29.87990343995172, "grad_norm": 11.740938186645508, "learning_rate": 1.999940484854898e-05, "loss": 0.1966, "step": 24770 }, { "epoch": 29.89197344598672, "grad_norm": 12.620857238769531, "learning_rate": 1.9999404607294188e-05, "loss": 0.1914, "step": 24780 }, { "epoch": 29.904043452021725, "grad_norm": 12.072465896606445, "learning_rate": 1.9999404366039394e-05, "loss": 0.204, "step": 24790 }, { "epoch": 29.916113458056728, "grad_norm": 11.970772743225098, "learning_rate": 1.99994041247846e-05, "loss": 0.1994, "step": 24800 }, { "epoch": 29.92818346409173, "grad_norm": 11.768716812133789, "learning_rate": 1.9999403883529807e-05, "loss": 0.1972, "step": 24810 }, { "epoch": 29.940253470126734, "grad_norm": 11.2447509765625, "learning_rate": 1.9999403642275013e-05, "loss": 0.2002, "step": 24820 }, { "epoch": 29.952323476161737, "grad_norm": 12.015101432800293, "learning_rate": 1.999940340102022e-05, "loss": 0.1986, "step": 24830 }, { "epoch": 29.96439348219674, "grad_norm": 11.633752822875977, "learning_rate": 1.9999403159765425e-05, "loss": 0.1936, "step": 
24840 }, { "epoch": 29.976463488231744, "grad_norm": 11.797868728637695, "learning_rate": 1.999940291851063e-05, "loss": 0.1949, "step": 24850 }, { "epoch": 29.988533494266747, "grad_norm": 11.555895805358887, "learning_rate": 1.9999402677255838e-05, "loss": 0.1953, "step": 24860 }, { "epoch": 30.0, "grad_norm": 18.36157989501953, "learning_rate": 1.9999402436001044e-05, "loss": 0.1978, "step": 24870 }, { "epoch": 30.012070006035003, "grad_norm": 9.484288215637207, "learning_rate": 1.999940219474625e-05, "loss": 0.1313, "step": 24880 }, { "epoch": 30.024140012070006, "grad_norm": 9.04455852508545, "learning_rate": 1.9999401953491456e-05, "loss": 0.1347, "step": 24890 }, { "epoch": 30.03621001810501, "grad_norm": 8.84843921661377, "learning_rate": 1.9999401712236662e-05, "loss": 0.1442, "step": 24900 }, { "epoch": 30.048280024140013, "grad_norm": 9.006903648376465, "learning_rate": 1.999940147098187e-05, "loss": 0.1427, "step": 24910 }, { "epoch": 30.060350030175016, "grad_norm": 10.148063659667969, "learning_rate": 1.9999401229727075e-05, "loss": 0.1465, "step": 24920 }, { "epoch": 30.07242003621002, "grad_norm": 9.83035945892334, "learning_rate": 1.999940098847228e-05, "loss": 0.1473, "step": 24930 }, { "epoch": 30.084490042245022, "grad_norm": 10.542352676391602, "learning_rate": 1.9999400747217484e-05, "loss": 0.1481, "step": 24940 }, { "epoch": 30.096560048280026, "grad_norm": 9.477688789367676, "learning_rate": 1.999940050596269e-05, "loss": 0.1454, "step": 24950 }, { "epoch": 30.10863005431503, "grad_norm": 9.69674301147461, "learning_rate": 1.9999400264707896e-05, "loss": 0.149, "step": 24960 }, { "epoch": 30.120700060350032, "grad_norm": 9.813857078552246, "learning_rate": 1.9999400023453103e-05, "loss": 0.1574, "step": 24970 }, { "epoch": 30.13277006638503, "grad_norm": 11.212791442871094, "learning_rate": 1.999939978219831e-05, "loss": 0.1551, "step": 24980 }, { "epoch": 30.144840072420035, "grad_norm": 9.550793647766113, "learning_rate": 
1.9999399540943515e-05, "loss": 0.1553, "step": 24990 }, { "epoch": 30.156910078455038, "grad_norm": 10.435635566711426, "learning_rate": 1.999939929968872e-05, "loss": 0.1552, "step": 25000 }, { "epoch": 30.156910078455038, "eval_loss": 11.060230255126953, "eval_runtime": 8.1261, "eval_samples_per_second": 85.773, "eval_steps_per_second": 10.829, "step": 25000 }, { "epoch": 30.16898008449004, "grad_norm": 10.259568214416504, "learning_rate": 1.9999399058433927e-05, "loss": 0.1578, "step": 25010 }, { "epoch": 30.181050090525044, "grad_norm": 9.662225723266602, "learning_rate": 1.9999398817179134e-05, "loss": 0.1586, "step": 25020 }, { "epoch": 30.193120096560047, "grad_norm": 9.7044038772583, "learning_rate": 1.999939857592434e-05, "loss": 0.1578, "step": 25030 }, { "epoch": 30.20519010259505, "grad_norm": 10.872008323669434, "learning_rate": 1.9999398334669546e-05, "loss": 0.1575, "step": 25040 }, { "epoch": 30.217260108630054, "grad_norm": 9.663793563842773, "learning_rate": 1.9999398093414752e-05, "loss": 0.1569, "step": 25050 }, { "epoch": 30.229330114665057, "grad_norm": 10.278862953186035, "learning_rate": 1.999939785215996e-05, "loss": 0.1569, "step": 25060 }, { "epoch": 30.24140012070006, "grad_norm": 11.129411697387695, "learning_rate": 1.9999397610905165e-05, "loss": 0.1571, "step": 25070 }, { "epoch": 30.253470126735063, "grad_norm": 10.031952857971191, "learning_rate": 1.999939736965037e-05, "loss": 0.1591, "step": 25080 }, { "epoch": 30.265540132770067, "grad_norm": 9.7171049118042, "learning_rate": 1.9999397128395577e-05, "loss": 0.1623, "step": 25090 }, { "epoch": 30.27761013880507, "grad_norm": 10.997227668762207, "learning_rate": 1.9999396887140783e-05, "loss": 0.163, "step": 25100 }, { "epoch": 30.289680144840073, "grad_norm": 10.489875793457031, "learning_rate": 1.999939664588599e-05, "loss": 0.1654, "step": 25110 }, { "epoch": 30.301750150875076, "grad_norm": 10.222273826599121, "learning_rate": 1.9999396404631196e-05, "loss": 0.1636, "step": 
25120 }, { "epoch": 30.31382015691008, "grad_norm": 9.876115798950195, "learning_rate": 1.9999396163376402e-05, "loss": 0.167, "step": 25130 }, { "epoch": 30.325890162945083, "grad_norm": 10.499923706054688, "learning_rate": 1.9999395922121608e-05, "loss": 0.1641, "step": 25140 }, { "epoch": 30.337960168980086, "grad_norm": 11.045660972595215, "learning_rate": 1.9999395680866814e-05, "loss": 0.1663, "step": 25150 }, { "epoch": 30.35003017501509, "grad_norm": 10.239863395690918, "learning_rate": 1.999939543961202e-05, "loss": 0.1644, "step": 25160 }, { "epoch": 30.362100181050092, "grad_norm": 10.799995422363281, "learning_rate": 1.9999395198357227e-05, "loss": 0.1624, "step": 25170 }, { "epoch": 30.37417018708509, "grad_norm": 11.07650375366211, "learning_rate": 1.9999394957102433e-05, "loss": 0.164, "step": 25180 }, { "epoch": 30.386240193120095, "grad_norm": 10.551032066345215, "learning_rate": 1.9999394715847636e-05, "loss": 0.1659, "step": 25190 }, { "epoch": 30.398310199155098, "grad_norm": 10.859048843383789, "learning_rate": 1.9999394474592842e-05, "loss": 0.1689, "step": 25200 }, { "epoch": 30.4103802051901, "grad_norm": 10.403471946716309, "learning_rate": 1.999939423333805e-05, "loss": 0.1681, "step": 25210 }, { "epoch": 30.422450211225105, "grad_norm": 11.488990783691406, "learning_rate": 1.9999393992083258e-05, "loss": 0.1686, "step": 25220 }, { "epoch": 30.434520217260108, "grad_norm": 9.619407653808594, "learning_rate": 1.9999393750828464e-05, "loss": 0.1664, "step": 25230 }, { "epoch": 30.44659022329511, "grad_norm": 10.994078636169434, "learning_rate": 1.999939350957367e-05, "loss": 0.1729, "step": 25240 }, { "epoch": 30.458660229330114, "grad_norm": 10.885587692260742, "learning_rate": 1.9999393268318877e-05, "loss": 0.1693, "step": 25250 }, { "epoch": 30.470730235365117, "grad_norm": 11.05837345123291, "learning_rate": 1.9999393027064083e-05, "loss": 0.1725, "step": 25260 }, { "epoch": 30.48280024140012, "grad_norm": 11.756367683410645, 
"learning_rate": 1.999939278580929e-05, "loss": 0.172, "step": 25270 }, { "epoch": 30.494870247435124, "grad_norm": 10.182501792907715, "learning_rate": 1.9999392544554495e-05, "loss": 0.1717, "step": 25280 }, { "epoch": 30.506940253470127, "grad_norm": 10.459653854370117, "learning_rate": 1.99993923032997e-05, "loss": 0.1691, "step": 25290 }, { "epoch": 30.51901025950513, "grad_norm": 10.647300720214844, "learning_rate": 1.9999392062044908e-05, "loss": 0.1702, "step": 25300 }, { "epoch": 30.531080265540133, "grad_norm": 10.178346633911133, "learning_rate": 1.9999391820790114e-05, "loss": 0.1755, "step": 25310 }, { "epoch": 30.543150271575136, "grad_norm": 9.877023696899414, "learning_rate": 1.999939157953532e-05, "loss": 0.1696, "step": 25320 }, { "epoch": 30.55522027761014, "grad_norm": 10.132230758666992, "learning_rate": 1.9999391338280526e-05, "loss": 0.1751, "step": 25330 }, { "epoch": 30.567290283645143, "grad_norm": 10.732073783874512, "learning_rate": 1.9999391097025733e-05, "loss": 0.177, "step": 25340 }, { "epoch": 30.579360289680146, "grad_norm": 10.545641899108887, "learning_rate": 1.999939085577094e-05, "loss": 0.1715, "step": 25350 }, { "epoch": 30.59143029571515, "grad_norm": 10.974493980407715, "learning_rate": 1.999939061451614e-05, "loss": 0.1718, "step": 25360 }, { "epoch": 30.603500301750152, "grad_norm": 10.182547569274902, "learning_rate": 1.9999390373261348e-05, "loss": 0.1683, "step": 25370 }, { "epoch": 30.615570307785156, "grad_norm": 9.840641021728516, "learning_rate": 1.9999390132006554e-05, "loss": 0.1698, "step": 25380 }, { "epoch": 30.62764031382016, "grad_norm": 11.735215187072754, "learning_rate": 1.999938989075176e-05, "loss": 0.1814, "step": 25390 }, { "epoch": 30.63971031985516, "grad_norm": 11.663591384887695, "learning_rate": 1.9999389649496966e-05, "loss": 0.1779, "step": 25400 }, { "epoch": 30.65178032589016, "grad_norm": 9.98086166381836, "learning_rate": 1.9999389408242173e-05, "loss": 0.1776, "step": 25410 }, { "epoch": 
30.663850331925165, "grad_norm": 11.113421440124512, "learning_rate": 1.999938916698738e-05, "loss": 0.1778, "step": 25420 }, { "epoch": 30.675920337960168, "grad_norm": 11.250824928283691, "learning_rate": 1.9999388925732585e-05, "loss": 0.1823, "step": 25430 }, { "epoch": 30.68799034399517, "grad_norm": 11.133387565612793, "learning_rate": 1.999938868447779e-05, "loss": 0.184, "step": 25440 }, { "epoch": 30.700060350030174, "grad_norm": 10.359094619750977, "learning_rate": 1.9999388443222998e-05, "loss": 0.1792, "step": 25450 }, { "epoch": 30.712130356065177, "grad_norm": 11.349752426147461, "learning_rate": 1.9999388201968204e-05, "loss": 0.1807, "step": 25460 }, { "epoch": 30.72420036210018, "grad_norm": 12.193495750427246, "learning_rate": 1.999938796071341e-05, "loss": 0.1803, "step": 25470 }, { "epoch": 30.736270368135184, "grad_norm": 10.71377182006836, "learning_rate": 1.9999387719458616e-05, "loss": 0.1784, "step": 25480 }, { "epoch": 30.748340374170187, "grad_norm": 10.867632865905762, "learning_rate": 1.9999387478203822e-05, "loss": 0.1796, "step": 25490 }, { "epoch": 30.76041038020519, "grad_norm": 10.502401351928711, "learning_rate": 1.999938723694903e-05, "loss": 0.176, "step": 25500 }, { "epoch": 30.76041038020519, "eval_loss": 11.116083145141602, "eval_runtime": 8.1282, "eval_samples_per_second": 85.751, "eval_steps_per_second": 10.827, "step": 25500 }, { "epoch": 30.772480386240193, "grad_norm": 10.810869216918945, "learning_rate": 1.9999386995694235e-05, "loss": 0.1777, "step": 25510 }, { "epoch": 30.784550392275197, "grad_norm": 10.430785179138184, "learning_rate": 1.999938675443944e-05, "loss": 0.1765, "step": 25520 }, { "epoch": 30.7966203983102, "grad_norm": 10.662517547607422, "learning_rate": 1.9999386513184647e-05, "loss": 0.1841, "step": 25530 }, { "epoch": 30.808690404345203, "grad_norm": 10.260457038879395, "learning_rate": 1.9999386271929853e-05, "loss": 0.178, "step": 25540 }, { "epoch": 30.820760410380206, "grad_norm": 
10.346382141113281, "learning_rate": 1.999938603067506e-05, "loss": 0.1811, "step": 25550 }, { "epoch": 30.83283041641521, "grad_norm": 10.362434387207031, "learning_rate": 1.9999385789420266e-05, "loss": 0.1828, "step": 25560 }, { "epoch": 30.844900422450213, "grad_norm": 10.645103454589844, "learning_rate": 1.9999385548165472e-05, "loss": 0.1824, "step": 25570 }, { "epoch": 30.856970428485216, "grad_norm": 12.029854774475098, "learning_rate": 1.999938530691068e-05, "loss": 0.1785, "step": 25580 }, { "epoch": 30.86904043452022, "grad_norm": 10.400871276855469, "learning_rate": 1.9999385065655885e-05, "loss": 0.1808, "step": 25590 }, { "epoch": 30.88111044055522, "grad_norm": 10.445680618286133, "learning_rate": 1.999938482440109e-05, "loss": 0.1815, "step": 25600 }, { "epoch": 30.893180446590222, "grad_norm": 10.7264986038208, "learning_rate": 1.9999384583146294e-05, "loss": 0.1799, "step": 25610 }, { "epoch": 30.905250452625225, "grad_norm": 11.47745132446289, "learning_rate": 1.99993843418915e-05, "loss": 0.1827, "step": 25620 }, { "epoch": 30.917320458660228, "grad_norm": 11.636698722839355, "learning_rate": 1.9999384100636706e-05, "loss": 0.187, "step": 25630 }, { "epoch": 30.92939046469523, "grad_norm": 11.625661849975586, "learning_rate": 1.9999383859381912e-05, "loss": 0.1833, "step": 25640 }, { "epoch": 30.941460470730235, "grad_norm": 11.07909107208252, "learning_rate": 1.999938361812712e-05, "loss": 0.1905, "step": 25650 }, { "epoch": 30.953530476765238, "grad_norm": 10.838208198547363, "learning_rate": 1.9999383376872325e-05, "loss": 0.1937, "step": 25660 }, { "epoch": 30.96560048280024, "grad_norm": 10.911153793334961, "learning_rate": 1.999938313561753e-05, "loss": 0.1904, "step": 25670 }, { "epoch": 30.977670488835244, "grad_norm": 10.579143524169922, "learning_rate": 1.9999382894362737e-05, "loss": 0.1846, "step": 25680 }, { "epoch": 30.989740494870247, "grad_norm": 11.078817367553711, "learning_rate": 1.9999382653107943e-05, "loss": 0.1859, "step": 
25690 }, { "epoch": 31.0012070006035, "grad_norm": 9.152548789978027, "learning_rate": 1.999938241185315e-05, "loss": 0.1817, "step": 25700 }, { "epoch": 31.013277006638504, "grad_norm": 9.638137817382812, "learning_rate": 1.9999382170598356e-05, "loss": 0.1279, "step": 25710 }, { "epoch": 31.025347012673507, "grad_norm": 8.945830345153809, "learning_rate": 1.9999381929343562e-05, "loss": 0.1319, "step": 25720 }, { "epoch": 31.03741701870851, "grad_norm": 9.146814346313477, "learning_rate": 1.9999381688088768e-05, "loss": 0.1338, "step": 25730 }, { "epoch": 31.049487024743513, "grad_norm": 9.326952934265137, "learning_rate": 1.9999381446833974e-05, "loss": 0.1348, "step": 25740 }, { "epoch": 31.061557030778516, "grad_norm": 8.957891464233398, "learning_rate": 1.999938120557918e-05, "loss": 0.1326, "step": 25750 }, { "epoch": 31.07362703681352, "grad_norm": 9.5245361328125, "learning_rate": 1.999938096432439e-05, "loss": 0.1408, "step": 25760 }, { "epoch": 31.085697042848523, "grad_norm": 10.062725067138672, "learning_rate": 1.9999380723069593e-05, "loss": 0.139, "step": 25770 }, { "epoch": 31.097767048883526, "grad_norm": 9.808100700378418, "learning_rate": 1.99993804818148e-05, "loss": 0.142, "step": 25780 }, { "epoch": 31.10983705491853, "grad_norm": 9.84362506866455, "learning_rate": 1.9999380240560005e-05, "loss": 0.1456, "step": 25790 }, { "epoch": 31.121907060953532, "grad_norm": 10.83470344543457, "learning_rate": 1.999937999930521e-05, "loss": 0.148, "step": 25800 }, { "epoch": 31.133977066988532, "grad_norm": 10.221667289733887, "learning_rate": 1.9999379758050418e-05, "loss": 0.1462, "step": 25810 }, { "epoch": 31.146047073023535, "grad_norm": 9.701325416564941, "learning_rate": 1.9999379516795624e-05, "loss": 0.1462, "step": 25820 }, { "epoch": 31.158117079058538, "grad_norm": 9.852583885192871, "learning_rate": 1.999937927554083e-05, "loss": 0.1458, "step": 25830 }, { "epoch": 31.17018708509354, "grad_norm": 10.442230224609375, "learning_rate": 
1.9999379034286037e-05, "loss": 0.1489, "step": 25840 }, { "epoch": 31.182257091128545, "grad_norm": 9.717171669006348, "learning_rate": 1.9999378793031243e-05, "loss": 0.1506, "step": 25850 }, { "epoch": 31.194327097163548, "grad_norm": 9.506509780883789, "learning_rate": 1.999937855177645e-05, "loss": 0.1512, "step": 25860 }, { "epoch": 31.20639710319855, "grad_norm": 10.346840858459473, "learning_rate": 1.9999378310521655e-05, "loss": 0.1487, "step": 25870 }, { "epoch": 31.218467109233554, "grad_norm": 10.699027061462402, "learning_rate": 1.999937806926686e-05, "loss": 0.1475, "step": 25880 }, { "epoch": 31.230537115268557, "grad_norm": 10.203125, "learning_rate": 1.9999377828012068e-05, "loss": 0.1487, "step": 25890 }, { "epoch": 31.24260712130356, "grad_norm": 9.990378379821777, "learning_rate": 1.9999377586757274e-05, "loss": 0.1487, "step": 25900 }, { "epoch": 31.254677127338564, "grad_norm": 11.253348350524902, "learning_rate": 1.999937734550248e-05, "loss": 0.1543, "step": 25910 }, { "epoch": 31.266747133373567, "grad_norm": 10.15051555633545, "learning_rate": 1.9999377104247686e-05, "loss": 0.154, "step": 25920 }, { "epoch": 31.27881713940857, "grad_norm": 9.649221420288086, "learning_rate": 1.9999376862992892e-05, "loss": 0.1528, "step": 25930 }, { "epoch": 31.290887145443573, "grad_norm": 9.265660285949707, "learning_rate": 1.99993766217381e-05, "loss": 0.1539, "step": 25940 }, { "epoch": 31.302957151478576, "grad_norm": 9.740531921386719, "learning_rate": 1.9999376380483305e-05, "loss": 0.1507, "step": 25950 }, { "epoch": 31.31502715751358, "grad_norm": 10.919938087463379, "learning_rate": 1.999937613922851e-05, "loss": 0.1487, "step": 25960 }, { "epoch": 31.327097163548583, "grad_norm": 10.794705390930176, "learning_rate": 1.9999375897973717e-05, "loss": 0.1592, "step": 25970 }, { "epoch": 31.339167169583586, "grad_norm": 10.4498929977417, "learning_rate": 1.9999375656718924e-05, "loss": 0.1601, "step": 25980 }, { "epoch": 31.35123717561859, 
"grad_norm": 11.009072303771973, "learning_rate": 1.999937541546413e-05, "loss": 0.1575, "step": 25990 }, { "epoch": 31.363307181653592, "grad_norm": 11.084282875061035, "learning_rate": 1.9999375174209336e-05, "loss": 0.1566, "step": 26000 }, { "epoch": 31.363307181653592, "eval_loss": 11.116002082824707, "eval_runtime": 8.1287, "eval_samples_per_second": 85.746, "eval_steps_per_second": 10.826, "step": 26000 }, { "epoch": 31.375377187688592, "grad_norm": 10.007634162902832, "learning_rate": 1.9999374932954542e-05, "loss": 0.1546, "step": 26010 }, { "epoch": 31.387447193723595, "grad_norm": 10.166543006896973, "learning_rate": 1.9999374691699745e-05, "loss": 0.1548, "step": 26020 }, { "epoch": 31.3995171997586, "grad_norm": 10.239744186401367, "learning_rate": 1.999937445044495e-05, "loss": 0.162, "step": 26030 }, { "epoch": 31.4115872057936, "grad_norm": 10.251249313354492, "learning_rate": 1.9999374209190157e-05, "loss": 0.1655, "step": 26040 }, { "epoch": 31.423657211828605, "grad_norm": 9.373196601867676, "learning_rate": 1.9999373967935364e-05, "loss": 0.1607, "step": 26050 }, { "epoch": 31.435727217863608, "grad_norm": 9.735694885253906, "learning_rate": 1.999937372668057e-05, "loss": 0.1642, "step": 26060 }, { "epoch": 31.44779722389861, "grad_norm": 9.892690658569336, "learning_rate": 1.9999373485425776e-05, "loss": 0.1606, "step": 26070 }, { "epoch": 31.459867229933614, "grad_norm": 10.38939094543457, "learning_rate": 1.9999373244170982e-05, "loss": 0.1617, "step": 26080 }, { "epoch": 31.471937235968618, "grad_norm": 10.640109062194824, "learning_rate": 1.999937300291619e-05, "loss": 0.1589, "step": 26090 }, { "epoch": 31.48400724200362, "grad_norm": 9.90523624420166, "learning_rate": 1.9999372761661395e-05, "loss": 0.1596, "step": 26100 }, { "epoch": 31.496077248038624, "grad_norm": 10.890387535095215, "learning_rate": 1.99993725204066e-05, "loss": 0.165, "step": 26110 }, { "epoch": 31.508147254073627, "grad_norm": 10.303903579711914, "learning_rate": 
1.9999372279151807e-05, "loss": 0.1577, "step": 26120 }, { "epoch": 31.52021726010863, "grad_norm": 9.80852222442627, "learning_rate": 1.9999372037897013e-05, "loss": 0.1593, "step": 26130 }, { "epoch": 31.532287266143634, "grad_norm": 10.040470123291016, "learning_rate": 1.999937179664222e-05, "loss": 0.1618, "step": 26140 }, { "epoch": 31.544357272178637, "grad_norm": 9.804975509643555, "learning_rate": 1.9999371555387426e-05, "loss": 0.1716, "step": 26150 }, { "epoch": 31.55642727821364, "grad_norm": 10.282870292663574, "learning_rate": 1.9999371314132632e-05, "loss": 0.1623, "step": 26160 }, { "epoch": 31.568497284248643, "grad_norm": 9.145492553710938, "learning_rate": 1.9999371072877838e-05, "loss": 0.1587, "step": 26170 }, { "epoch": 31.580567290283646, "grad_norm": 9.994990348815918, "learning_rate": 1.9999370831623044e-05, "loss": 0.1654, "step": 26180 }, { "epoch": 31.59263729631865, "grad_norm": 10.621403694152832, "learning_rate": 1.999937059036825e-05, "loss": 0.1624, "step": 26190 }, { "epoch": 31.604707302353653, "grad_norm": 10.145188331604004, "learning_rate": 1.9999370349113457e-05, "loss": 0.1666, "step": 26200 }, { "epoch": 31.616777308388656, "grad_norm": 11.583301544189453, "learning_rate": 1.9999370107858663e-05, "loss": 0.1685, "step": 26210 }, { "epoch": 31.62884731442366, "grad_norm": 10.851933479309082, "learning_rate": 1.999936986660387e-05, "loss": 0.167, "step": 26220 }, { "epoch": 31.64091732045866, "grad_norm": 11.196303367614746, "learning_rate": 1.9999369625349076e-05, "loss": 0.1723, "step": 26230 }, { "epoch": 31.652987326493662, "grad_norm": 9.935582160949707, "learning_rate": 1.9999369384094282e-05, "loss": 0.1683, "step": 26240 }, { "epoch": 31.665057332528665, "grad_norm": 10.899296760559082, "learning_rate": 1.9999369142839488e-05, "loss": 0.1679, "step": 26250 }, { "epoch": 31.67712733856367, "grad_norm": 10.813987731933594, "learning_rate": 1.9999368901584694e-05, "loss": 0.1664, "step": 26260 }, { "epoch": 
31.68919734459867, "grad_norm": 10.278332710266113, "learning_rate": 1.9999368660329897e-05, "loss": 0.1653, "step": 26270 }, { "epoch": 31.701267350633675, "grad_norm": 10.68166732788086, "learning_rate": 1.9999368419075103e-05, "loss": 0.1685, "step": 26280 }, { "epoch": 31.713337356668678, "grad_norm": 10.557352066040039, "learning_rate": 1.999936817782031e-05, "loss": 0.1787, "step": 26290 }, { "epoch": 31.72540736270368, "grad_norm": 11.611833572387695, "learning_rate": 1.999936793656552e-05, "loss": 0.1732, "step": 26300 }, { "epoch": 31.737477368738684, "grad_norm": 11.234491348266602, "learning_rate": 1.9999367695310725e-05, "loss": 0.171, "step": 26310 }, { "epoch": 31.749547374773687, "grad_norm": 9.939167022705078, "learning_rate": 1.999936745405593e-05, "loss": 0.1715, "step": 26320 }, { "epoch": 31.76161738080869, "grad_norm": 11.4032621383667, "learning_rate": 1.9999367212801138e-05, "loss": 0.1724, "step": 26330 }, { "epoch": 31.773687386843694, "grad_norm": 10.827996253967285, "learning_rate": 1.9999366971546344e-05, "loss": 0.1731, "step": 26340 }, { "epoch": 31.785757392878697, "grad_norm": 10.877534866333008, "learning_rate": 1.999936673029155e-05, "loss": 0.1758, "step": 26350 }, { "epoch": 31.7978273989137, "grad_norm": 11.06076717376709, "learning_rate": 1.9999366489036756e-05, "loss": 0.1756, "step": 26360 }, { "epoch": 31.809897404948703, "grad_norm": 10.38868236541748, "learning_rate": 1.9999366247781963e-05, "loss": 0.172, "step": 26370 }, { "epoch": 31.821967410983707, "grad_norm": 10.732746124267578, "learning_rate": 1.999936600652717e-05, "loss": 0.1756, "step": 26380 }, { "epoch": 31.83403741701871, "grad_norm": 10.62848949432373, "learning_rate": 1.9999365765272375e-05, "loss": 0.1729, "step": 26390 }, { "epoch": 31.846107423053713, "grad_norm": 10.191451072692871, "learning_rate": 1.999936552401758e-05, "loss": 0.1759, "step": 26400 }, { "epoch": 31.858177429088716, "grad_norm": 9.865903854370117, "learning_rate": 
1.9999365282762787e-05, "loss": 0.1702, "step": 26410 }, { "epoch": 31.87024743512372, "grad_norm": 11.410205841064453, "learning_rate": 1.9999365041507994e-05, "loss": 0.1761, "step": 26420 }, { "epoch": 31.88231744115872, "grad_norm": 10.633051872253418, "learning_rate": 1.9999364800253196e-05, "loss": 0.1727, "step": 26430 }, { "epoch": 31.894387447193722, "grad_norm": 11.340578079223633, "learning_rate": 1.9999364558998403e-05, "loss": 0.178, "step": 26440 }, { "epoch": 31.906457453228725, "grad_norm": 10.900120735168457, "learning_rate": 1.999936431774361e-05, "loss": 0.1785, "step": 26450 }, { "epoch": 31.91852745926373, "grad_norm": 11.101557731628418, "learning_rate": 1.9999364076488815e-05, "loss": 0.1798, "step": 26460 }, { "epoch": 31.93059746529873, "grad_norm": 10.7327241897583, "learning_rate": 1.999936383523402e-05, "loss": 0.1815, "step": 26470 }, { "epoch": 31.942667471333735, "grad_norm": 9.885331153869629, "learning_rate": 1.9999363593979228e-05, "loss": 0.1755, "step": 26480 }, { "epoch": 31.954737477368738, "grad_norm": 10.906889915466309, "learning_rate": 1.9999363352724434e-05, "loss": 0.1781, "step": 26490 }, { "epoch": 31.96680748340374, "grad_norm": 10.081951141357422, "learning_rate": 1.999936311146964e-05, "loss": 0.1743, "step": 26500 }, { "epoch": 31.96680748340374, "eval_loss": 11.162881851196289, "eval_runtime": 8.1423, "eval_samples_per_second": 85.603, "eval_steps_per_second": 10.808, "step": 26500 }, { "epoch": 31.978877489438744, "grad_norm": 11.354560852050781, "learning_rate": 1.9999362870214846e-05, "loss": 0.176, "step": 26510 }, { "epoch": 31.990947495473748, "grad_norm": 11.698299407958984, "learning_rate": 1.9999362628960052e-05, "loss": 0.1809, "step": 26520 }, { "epoch": 32.002414001207, "grad_norm": 9.451432228088379, "learning_rate": 1.999936238770526e-05, "loss": 0.1685, "step": 26530 }, { "epoch": 32.014484007242004, "grad_norm": 9.228812217712402, "learning_rate": 1.9999362146450465e-05, "loss": 0.123, "step": 26540 
}, { "epoch": 32.02655401327701, "grad_norm": 8.034507751464844, "learning_rate": 1.999936190519567e-05, "loss": 0.1264, "step": 26550 }, { "epoch": 32.03862401931201, "grad_norm": 8.74199390411377, "learning_rate": 1.9999361663940877e-05, "loss": 0.1233, "step": 26560 }, { "epoch": 32.05069402534701, "grad_norm": 9.759060859680176, "learning_rate": 1.9999361422686083e-05, "loss": 0.1295, "step": 26570 }, { "epoch": 32.06276403138202, "grad_norm": 9.83885383605957, "learning_rate": 1.999936118143129e-05, "loss": 0.1312, "step": 26580 }, { "epoch": 32.07483403741702, "grad_norm": 9.28884506225586, "learning_rate": 1.9999360940176496e-05, "loss": 0.1338, "step": 26590 }, { "epoch": 32.08690404345202, "grad_norm": 10.487255096435547, "learning_rate": 1.9999360698921702e-05, "loss": 0.1361, "step": 26600 }, { "epoch": 32.098974049487026, "grad_norm": 9.206151962280273, "learning_rate": 1.999936045766691e-05, "loss": 0.1318, "step": 26610 }, { "epoch": 32.11104405552203, "grad_norm": 10.112557411193848, "learning_rate": 1.9999360216412115e-05, "loss": 0.1379, "step": 26620 }, { "epoch": 32.12311406155703, "grad_norm": 9.156517028808594, "learning_rate": 1.999935997515732e-05, "loss": 0.1359, "step": 26630 }, { "epoch": 32.135184067592036, "grad_norm": 9.149964332580566, "learning_rate": 1.9999359733902527e-05, "loss": 0.1353, "step": 26640 }, { "epoch": 32.14725407362704, "grad_norm": 8.83856201171875, "learning_rate": 1.9999359492647733e-05, "loss": 0.137, "step": 26650 }, { "epoch": 32.15932407966204, "grad_norm": 10.110918045043945, "learning_rate": 1.999935925139294e-05, "loss": 0.1375, "step": 26660 }, { "epoch": 32.171394085697045, "grad_norm": 9.451704025268555, "learning_rate": 1.9999359010138146e-05, "loss": 0.1383, "step": 26670 }, { "epoch": 32.18346409173205, "grad_norm": 9.052194595336914, "learning_rate": 1.999935876888335e-05, "loss": 0.1424, "step": 26680 }, { "epoch": 32.19553409776705, "grad_norm": 10.317416191101074, "learning_rate": 
1.9999358527628555e-05, "loss": 0.1391, "step": 26690 }, { "epoch": 32.207604103802055, "grad_norm": 9.268503189086914, "learning_rate": 1.999935828637376e-05, "loss": 0.1419, "step": 26700 }, { "epoch": 32.21967410983706, "grad_norm": 10.51187515258789, "learning_rate": 1.9999358045118967e-05, "loss": 0.1485, "step": 26710 }, { "epoch": 32.23174411587206, "grad_norm": 9.539616584777832, "learning_rate": 1.9999357803864173e-05, "loss": 0.1531, "step": 26720 }, { "epoch": 32.243814121907064, "grad_norm": 8.923384666442871, "learning_rate": 1.999935756260938e-05, "loss": 0.1429, "step": 26730 }, { "epoch": 32.25588412794206, "grad_norm": 10.001574516296387, "learning_rate": 1.9999357321354586e-05, "loss": 0.1439, "step": 26740 }, { "epoch": 32.267954133977064, "grad_norm": 10.000737190246582, "learning_rate": 1.9999357080099792e-05, "loss": 0.1506, "step": 26750 }, { "epoch": 32.28002414001207, "grad_norm": 10.24661636352539, "learning_rate": 1.9999356838844998e-05, "loss": 0.1462, "step": 26760 }, { "epoch": 32.29209414604707, "grad_norm": 10.027256965637207, "learning_rate": 1.9999356597590204e-05, "loss": 0.1466, "step": 26770 }, { "epoch": 32.30416415208207, "grad_norm": 9.2122220993042, "learning_rate": 1.999935635633541e-05, "loss": 0.1466, "step": 26780 }, { "epoch": 32.316234158117076, "grad_norm": 9.72177505493164, "learning_rate": 1.9999356115080617e-05, "loss": 0.1472, "step": 26790 }, { "epoch": 32.32830416415208, "grad_norm": 10.144412994384766, "learning_rate": 1.9999355873825823e-05, "loss": 0.1477, "step": 26800 }, { "epoch": 32.34037417018708, "grad_norm": 10.282918930053711, "learning_rate": 1.999935563257103e-05, "loss": 0.1561, "step": 26810 }, { "epoch": 32.352444176222086, "grad_norm": 10.473298072814941, "learning_rate": 1.9999355391316235e-05, "loss": 0.1444, "step": 26820 }, { "epoch": 32.36451418225709, "grad_norm": 10.21805477142334, "learning_rate": 1.9999355150061442e-05, "loss": 0.1558, "step": 26830 }, { "epoch": 32.37658418829209, 
"grad_norm": 10.145358085632324, "learning_rate": 1.9999354908806648e-05, "loss": 0.1509, "step": 26840 }, { "epoch": 32.388654194327096, "grad_norm": 9.555004119873047, "learning_rate": 1.9999354667551854e-05, "loss": 0.1439, "step": 26850 }, { "epoch": 32.4007242003621, "grad_norm": 9.869184494018555, "learning_rate": 1.999935442629706e-05, "loss": 0.1553, "step": 26860 }, { "epoch": 32.4127942063971, "grad_norm": 10.01133918762207, "learning_rate": 1.9999354185042267e-05, "loss": 0.1514, "step": 26870 }, { "epoch": 32.424864212432105, "grad_norm": 9.602773666381836, "learning_rate": 1.9999353943787473e-05, "loss": 0.1495, "step": 26880 }, { "epoch": 32.43693421846711, "grad_norm": 10.033738136291504, "learning_rate": 1.999935370253268e-05, "loss": 0.1481, "step": 26890 }, { "epoch": 32.44900422450211, "grad_norm": 10.042325019836426, "learning_rate": 1.9999353461277885e-05, "loss": 0.1523, "step": 26900 }, { "epoch": 32.461074230537115, "grad_norm": 9.35616397857666, "learning_rate": 1.999935322002309e-05, "loss": 0.1516, "step": 26910 }, { "epoch": 32.47314423657212, "grad_norm": 9.78777027130127, "learning_rate": 1.9999352978768298e-05, "loss": 0.1516, "step": 26920 }, { "epoch": 32.48521424260712, "grad_norm": 9.359261512756348, "learning_rate": 1.9999352737513504e-05, "loss": 0.1528, "step": 26930 }, { "epoch": 32.497284248642124, "grad_norm": 9.647866249084473, "learning_rate": 1.999935249625871e-05, "loss": 0.154, "step": 26940 }, { "epoch": 32.50935425467713, "grad_norm": 10.195488929748535, "learning_rate": 1.9999352255003916e-05, "loss": 0.1554, "step": 26950 }, { "epoch": 32.52142426071213, "grad_norm": 9.80721664428711, "learning_rate": 1.9999352013749122e-05, "loss": 0.1529, "step": 26960 }, { "epoch": 32.533494266747134, "grad_norm": 9.898003578186035, "learning_rate": 1.999935177249433e-05, "loss": 0.1534, "step": 26970 }, { "epoch": 32.54556427278214, "grad_norm": 10.485968589782715, "learning_rate": 1.9999351531239535e-05, "loss": 0.1562, "step": 
26980 }, { "epoch": 32.55763427881714, "grad_norm": 10.7553071975708, "learning_rate": 1.999935128998474e-05, "loss": 0.1552, "step": 26990 }, { "epoch": 32.56970428485214, "grad_norm": 10.42313289642334, "learning_rate": 1.9999351048729947e-05, "loss": 0.1591, "step": 27000 }, { "epoch": 32.56970428485214, "eval_loss": 11.214214324951172, "eval_runtime": 8.1417, "eval_samples_per_second": 85.609, "eval_steps_per_second": 10.809, "step": 27000 }, { "epoch": 32.58177429088715, "grad_norm": 10.47486400604248, "learning_rate": 1.9999350807475154e-05, "loss": 0.1635, "step": 27010 }, { "epoch": 32.59384429692215, "grad_norm": 10.664046287536621, "learning_rate": 1.999935056622036e-05, "loss": 0.1623, "step": 27020 }, { "epoch": 32.60591430295715, "grad_norm": 10.20971393585205, "learning_rate": 1.9999350324965566e-05, "loss": 0.159, "step": 27030 }, { "epoch": 32.617984308992156, "grad_norm": 9.398658752441406, "learning_rate": 1.9999350083710772e-05, "loss": 0.1588, "step": 27040 }, { "epoch": 32.63005431502716, "grad_norm": 9.565375328063965, "learning_rate": 1.999934984245598e-05, "loss": 0.1612, "step": 27050 }, { "epoch": 32.64212432106216, "grad_norm": 10.47481918334961, "learning_rate": 1.9999349601201185e-05, "loss": 0.1588, "step": 27060 }, { "epoch": 32.654194327097166, "grad_norm": 9.569131851196289, "learning_rate": 1.999934935994639e-05, "loss": 0.1602, "step": 27070 }, { "epoch": 32.66626433313217, "grad_norm": 10.973284721374512, "learning_rate": 1.9999349118691597e-05, "loss": 0.1623, "step": 27080 }, { "epoch": 32.67833433916717, "grad_norm": 10.390109062194824, "learning_rate": 1.9999348877436803e-05, "loss": 0.1619, "step": 27090 }, { "epoch": 32.690404345202175, "grad_norm": 10.614888191223145, "learning_rate": 1.9999348636182006e-05, "loss": 0.1604, "step": 27100 }, { "epoch": 32.70247435123718, "grad_norm": 10.280113220214844, "learning_rate": 1.9999348394927212e-05, "loss": 0.1582, "step": 27110 }, { "epoch": 32.71454435727218, "grad_norm": 
10.23862075805664, "learning_rate": 1.999934815367242e-05, "loss": 0.1645, "step": 27120 }, { "epoch": 32.726614363307185, "grad_norm": 10.98208236694336, "learning_rate": 1.9999347912417625e-05, "loss": 0.1601, "step": 27130 }, { "epoch": 32.73868436934219, "grad_norm": 9.6322660446167, "learning_rate": 1.999934767116283e-05, "loss": 0.1612, "step": 27140 }, { "epoch": 32.750754375377184, "grad_norm": 10.405722618103027, "learning_rate": 1.9999347429908037e-05, "loss": 0.1673, "step": 27150 }, { "epoch": 32.76282438141219, "grad_norm": 9.925395011901855, "learning_rate": 1.9999347188653243e-05, "loss": 0.1623, "step": 27160 }, { "epoch": 32.77489438744719, "grad_norm": 10.705119132995605, "learning_rate": 1.999934694739845e-05, "loss": 0.1664, "step": 27170 }, { "epoch": 32.786964393482194, "grad_norm": 10.58224868774414, "learning_rate": 1.9999346706143656e-05, "loss": 0.1669, "step": 27180 }, { "epoch": 32.7990343995172, "grad_norm": 11.107314109802246, "learning_rate": 1.9999346464888862e-05, "loss": 0.1663, "step": 27190 }, { "epoch": 32.8111044055522, "grad_norm": 11.005951881408691, "learning_rate": 1.9999346223634068e-05, "loss": 0.1638, "step": 27200 }, { "epoch": 32.8231744115872, "grad_norm": 10.530645370483398, "learning_rate": 1.9999345982379274e-05, "loss": 0.1668, "step": 27210 }, { "epoch": 32.83524441762221, "grad_norm": 10.134138107299805, "learning_rate": 1.999934574112448e-05, "loss": 0.1588, "step": 27220 }, { "epoch": 32.84731442365721, "grad_norm": 10.034008979797363, "learning_rate": 1.9999345499869687e-05, "loss": 0.1699, "step": 27230 }, { "epoch": 32.85938442969221, "grad_norm": 10.759100914001465, "learning_rate": 1.9999345258614893e-05, "loss": 0.1688, "step": 27240 }, { "epoch": 32.871454435727216, "grad_norm": 11.091179847717285, "learning_rate": 1.99993450173601e-05, "loss": 0.1674, "step": 27250 }, { "epoch": 32.88352444176222, "grad_norm": 11.172391891479492, "learning_rate": 1.9999344776105306e-05, "loss": 0.1675, "step": 27260 }, 
{ "epoch": 32.89559444779722, "grad_norm": 9.813253402709961, "learning_rate": 1.9999344534850512e-05, "loss": 0.1657, "step": 27270 }, { "epoch": 32.907664453832226, "grad_norm": 10.845745086669922, "learning_rate": 1.9999344293595718e-05, "loss": 0.1694, "step": 27280 }, { "epoch": 32.91973445986723, "grad_norm": 10.973840713500977, "learning_rate": 1.9999344052340924e-05, "loss": 0.1681, "step": 27290 }, { "epoch": 32.93180446590223, "grad_norm": 11.186603546142578, "learning_rate": 1.999934381108613e-05, "loss": 0.1666, "step": 27300 }, { "epoch": 32.943874471937235, "grad_norm": 10.630810737609863, "learning_rate": 1.9999343569831337e-05, "loss": 0.1691, "step": 27310 }, { "epoch": 32.95594447797224, "grad_norm": 11.36808967590332, "learning_rate": 1.9999343328576543e-05, "loss": 0.1716, "step": 27320 }, { "epoch": 32.96801448400724, "grad_norm": 10.04823112487793, "learning_rate": 1.999934308732175e-05, "loss": 0.1721, "step": 27330 }, { "epoch": 32.980084490042245, "grad_norm": 10.140503883361816, "learning_rate": 1.9999342846066955e-05, "loss": 0.1666, "step": 27340 }, { "epoch": 32.99215449607725, "grad_norm": 10.533440589904785, "learning_rate": 1.9999342604812158e-05, "loss": 0.1724, "step": 27350 }, { "epoch": 33.0036210018105, "grad_norm": 7.731122970581055, "learning_rate": 1.9999342363557364e-05, "loss": 0.1501, "step": 27360 }, { "epoch": 33.015691007845504, "grad_norm": 8.854116439819336, "learning_rate": 1.999934212230257e-05, "loss": 0.1127, "step": 27370 }, { "epoch": 33.02776101388051, "grad_norm": 8.249544143676758, "learning_rate": 1.9999341881047777e-05, "loss": 0.1166, "step": 27380 }, { "epoch": 33.03983101991551, "grad_norm": 8.136947631835938, "learning_rate": 1.9999341639792986e-05, "loss": 0.1224, "step": 27390 }, { "epoch": 33.051901025950514, "grad_norm": 8.515998840332031, "learning_rate": 1.9999341398538193e-05, "loss": 0.1311, "step": 27400 }, { "epoch": 33.06397103198552, "grad_norm": 8.761889457702637, "learning_rate": 
1.99993411572834e-05, "loss": 0.122, "step": 27410 }, { "epoch": 33.07604103802052, "grad_norm": 8.893444061279297, "learning_rate": 1.9999340916028605e-05, "loss": 0.1292, "step": 27420 }, { "epoch": 33.08811104405552, "grad_norm": 9.825721740722656, "learning_rate": 1.999934067477381e-05, "loss": 0.1293, "step": 27430 }, { "epoch": 33.10018105009053, "grad_norm": 9.348423957824707, "learning_rate": 1.9999340433519017e-05, "loss": 0.1272, "step": 27440 }, { "epoch": 33.11225105612553, "grad_norm": 9.343605041503906, "learning_rate": 1.9999340192264224e-05, "loss": 0.1307, "step": 27450 }, { "epoch": 33.12432106216053, "grad_norm": 9.170119285583496, "learning_rate": 1.999933995100943e-05, "loss": 0.1357, "step": 27460 }, { "epoch": 33.136391068195536, "grad_norm": 9.557656288146973, "learning_rate": 1.9999339709754636e-05, "loss": 0.1298, "step": 27470 }, { "epoch": 33.14846107423054, "grad_norm": 8.547588348388672, "learning_rate": 1.9999339468499842e-05, "loss": 0.1355, "step": 27480 }, { "epoch": 33.16053108026554, "grad_norm": 8.926509857177734, "learning_rate": 1.999933922724505e-05, "loss": 0.1343, "step": 27490 }, { "epoch": 33.172601086300546, "grad_norm": 9.464871406555176, "learning_rate": 1.9999338985990255e-05, "loss": 0.1329, "step": 27500 }, { "epoch": 33.172601086300546, "eval_loss": 11.229613304138184, "eval_runtime": 8.1471, "eval_samples_per_second": 85.552, "eval_steps_per_second": 10.801, "step": 27500 }, { "epoch": 33.18467109233555, "grad_norm": 9.170391082763672, "learning_rate": 1.9999338744735458e-05, "loss": 0.1297, "step": 27510 }, { "epoch": 33.19674109837055, "grad_norm": 9.456043243408203, "learning_rate": 1.9999338503480664e-05, "loss": 0.1383, "step": 27520 }, { "epoch": 33.208811104405555, "grad_norm": 8.866593360900879, "learning_rate": 1.999933826222587e-05, "loss": 0.138, "step": 27530 }, { "epoch": 33.22088111044056, "grad_norm": 9.414373397827148, "learning_rate": 1.9999338020971076e-05, "loss": 0.1358, "step": 27540 }, { 
"epoch": 33.23295111647556, "grad_norm": 9.257960319519043, "learning_rate": 1.9999337779716282e-05, "loss": 0.1394, "step": 27550 }, { "epoch": 33.245021122510565, "grad_norm": 9.19938850402832, "learning_rate": 1.999933753846149e-05, "loss": 0.1411, "step": 27560 }, { "epoch": 33.25709112854556, "grad_norm": 9.676847457885742, "learning_rate": 1.9999337297206695e-05, "loss": 0.1402, "step": 27570 }, { "epoch": 33.269161134580564, "grad_norm": 10.126440048217773, "learning_rate": 1.99993370559519e-05, "loss": 0.1402, "step": 27580 }, { "epoch": 33.28123114061557, "grad_norm": 9.000177383422852, "learning_rate": 1.9999336814697107e-05, "loss": 0.1359, "step": 27590 }, { "epoch": 33.29330114665057, "grad_norm": 9.473075866699219, "learning_rate": 1.9999336573442314e-05, "loss": 0.1365, "step": 27600 }, { "epoch": 33.305371152685574, "grad_norm": 10.044913291931152, "learning_rate": 1.999933633218752e-05, "loss": 0.1394, "step": 27610 }, { "epoch": 33.31744115872058, "grad_norm": 9.543787002563477, "learning_rate": 1.9999336090932726e-05, "loss": 0.1411, "step": 27620 }, { "epoch": 33.32951116475558, "grad_norm": 10.211370468139648, "learning_rate": 1.9999335849677932e-05, "loss": 0.1472, "step": 27630 }, { "epoch": 33.34158117079058, "grad_norm": 9.980547904968262, "learning_rate": 1.999933560842314e-05, "loss": 0.143, "step": 27640 }, { "epoch": 33.353651176825586, "grad_norm": 9.135948181152344, "learning_rate": 1.9999335367168345e-05, "loss": 0.1412, "step": 27650 }, { "epoch": 33.36572118286059, "grad_norm": 9.702611923217773, "learning_rate": 1.999933512591355e-05, "loss": 0.1423, "step": 27660 }, { "epoch": 33.37779118889559, "grad_norm": 9.78563117980957, "learning_rate": 1.9999334884658757e-05, "loss": 0.1473, "step": 27670 }, { "epoch": 33.389861194930596, "grad_norm": 9.949811935424805, "learning_rate": 1.9999334643403963e-05, "loss": 0.1453, "step": 27680 }, { "epoch": 33.4019312009656, "grad_norm": 10.307674407958984, "learning_rate": 
1.999933440214917e-05, "loss": 0.1458, "step": 27690 }, { "epoch": 33.4140012070006, "grad_norm": 10.879150390625, "learning_rate": 1.9999334160894376e-05, "loss": 0.1452, "step": 27700 }, { "epoch": 33.426071213035605, "grad_norm": 9.48952579498291, "learning_rate": 1.9999333919639582e-05, "loss": 0.1448, "step": 27710 }, { "epoch": 33.43814121907061, "grad_norm": 10.041439056396484, "learning_rate": 1.9999333678384788e-05, "loss": 0.1449, "step": 27720 }, { "epoch": 33.45021122510561, "grad_norm": 9.10004997253418, "learning_rate": 1.9999333437129994e-05, "loss": 0.1515, "step": 27730 }, { "epoch": 33.462281231140615, "grad_norm": 9.604711532592773, "learning_rate": 1.99993331958752e-05, "loss": 0.1524, "step": 27740 }, { "epoch": 33.47435123717562, "grad_norm": 10.336776733398438, "learning_rate": 1.9999332954620407e-05, "loss": 0.1523, "step": 27750 }, { "epoch": 33.48642124321062, "grad_norm": 9.645914077758789, "learning_rate": 1.999933271336561e-05, "loss": 0.1511, "step": 27760 }, { "epoch": 33.498491249245625, "grad_norm": 9.700021743774414, "learning_rate": 1.9999332472110816e-05, "loss": 0.1528, "step": 27770 }, { "epoch": 33.51056125528063, "grad_norm": 10.450911521911621, "learning_rate": 1.9999332230856022e-05, "loss": 0.1519, "step": 27780 }, { "epoch": 33.52263126131563, "grad_norm": 9.908650398254395, "learning_rate": 1.9999331989601228e-05, "loss": 0.1475, "step": 27790 }, { "epoch": 33.534701267350634, "grad_norm": 10.13331413269043, "learning_rate": 1.9999331748346434e-05, "loss": 0.1488, "step": 27800 }, { "epoch": 33.54677127338564, "grad_norm": 10.23560905456543, "learning_rate": 1.999933150709164e-05, "loss": 0.1558, "step": 27810 }, { "epoch": 33.55884127942064, "grad_norm": 9.565673828125, "learning_rate": 1.9999331265836847e-05, "loss": 0.1475, "step": 27820 }, { "epoch": 33.570911285455644, "grad_norm": 10.0045747756958, "learning_rate": 1.9999331024582053e-05, "loss": 0.1488, "step": 27830 }, { "epoch": 33.58298129149065, "grad_norm": 
9.239933967590332, "learning_rate": 1.999933078332726e-05, "loss": 0.1546, "step": 27840 }, { "epoch": 33.59505129752565, "grad_norm": 10.214375495910645, "learning_rate": 1.9999330542072466e-05, "loss": 0.1528, "step": 27850 }, { "epoch": 33.60712130356065, "grad_norm": 10.044506072998047, "learning_rate": 1.9999330300817672e-05, "loss": 0.1487, "step": 27860 }, { "epoch": 33.61919130959566, "grad_norm": 10.565363883972168, "learning_rate": 1.9999330059562878e-05, "loss": 0.1494, "step": 27870 }, { "epoch": 33.63126131563066, "grad_norm": 9.957137107849121, "learning_rate": 1.9999329818308084e-05, "loss": 0.1461, "step": 27880 }, { "epoch": 33.64333132166566, "grad_norm": 9.489255905151367, "learning_rate": 1.999932957705329e-05, "loss": 0.1466, "step": 27890 }, { "epoch": 33.655401327700666, "grad_norm": 9.621824264526367, "learning_rate": 1.9999329335798497e-05, "loss": 0.1555, "step": 27900 }, { "epoch": 33.66747133373567, "grad_norm": 10.057159423828125, "learning_rate": 1.9999329094543703e-05, "loss": 0.1511, "step": 27910 }, { "epoch": 33.67954133977067, "grad_norm": 9.558164596557617, "learning_rate": 1.999932885328891e-05, "loss": 0.1527, "step": 27920 }, { "epoch": 33.691611345805676, "grad_norm": 9.934770584106445, "learning_rate": 1.9999328612034115e-05, "loss": 0.1472, "step": 27930 }, { "epoch": 33.70368135184068, "grad_norm": 10.514352798461914, "learning_rate": 1.999932837077932e-05, "loss": 0.1538, "step": 27940 }, { "epoch": 33.71575135787568, "grad_norm": 9.970117568969727, "learning_rate": 1.9999328129524528e-05, "loss": 0.1549, "step": 27950 }, { "epoch": 33.727821363910685, "grad_norm": 10.395722389221191, "learning_rate": 1.9999327888269734e-05, "loss": 0.1565, "step": 27960 }, { "epoch": 33.73989136994569, "grad_norm": 9.833978652954102, "learning_rate": 1.999932764701494e-05, "loss": 0.1595, "step": 27970 }, { "epoch": 33.751961375980684, "grad_norm": 9.45128059387207, "learning_rate": 1.9999327405760146e-05, "loss": 0.1588, "step": 27980 
}, { "epoch": 33.76403138201569, "grad_norm": 10.442251205444336, "learning_rate": 1.9999327164505353e-05, "loss": 0.1602, "step": 27990 }, { "epoch": 33.77610138805069, "grad_norm": 9.542750358581543, "learning_rate": 1.999932692325056e-05, "loss": 0.1535, "step": 28000 }, { "epoch": 33.77610138805069, "eval_loss": 11.256145477294922, "eval_runtime": 8.1467, "eval_samples_per_second": 85.556, "eval_steps_per_second": 10.802, "step": 28000 }, { "epoch": 33.788171394085694, "grad_norm": 10.274908065795898, "learning_rate": 1.9999326681995765e-05, "loss": 0.1541, "step": 28010 }, { "epoch": 33.8002414001207, "grad_norm": 10.641603469848633, "learning_rate": 1.999932644074097e-05, "loss": 0.1582, "step": 28020 }, { "epoch": 33.8123114061557, "grad_norm": 9.371525764465332, "learning_rate": 1.9999326199486177e-05, "loss": 0.1516, "step": 28030 }, { "epoch": 33.824381412190704, "grad_norm": 10.389409065246582, "learning_rate": 1.9999325958231384e-05, "loss": 0.1563, "step": 28040 }, { "epoch": 33.83645141822571, "grad_norm": 9.322901725769043, "learning_rate": 1.999932571697659e-05, "loss": 0.151, "step": 28050 }, { "epoch": 33.84852142426071, "grad_norm": 11.065936088562012, "learning_rate": 1.9999325475721796e-05, "loss": 0.1586, "step": 28060 }, { "epoch": 33.86059143029571, "grad_norm": 9.502508163452148, "learning_rate": 1.9999325234467002e-05, "loss": 0.1611, "step": 28070 }, { "epoch": 33.872661436330716, "grad_norm": 10.398035049438477, "learning_rate": 1.999932499321221e-05, "loss": 0.1567, "step": 28080 }, { "epoch": 33.88473144236572, "grad_norm": 9.999931335449219, "learning_rate": 1.9999324751957415e-05, "loss": 0.159, "step": 28090 }, { "epoch": 33.89680144840072, "grad_norm": 10.769134521484375, "learning_rate": 1.999932451070262e-05, "loss": 0.1593, "step": 28100 }, { "epoch": 33.908871454435726, "grad_norm": 10.24414348602295, "learning_rate": 1.9999324269447827e-05, "loss": 0.1608, "step": 28110 }, { "epoch": 33.92094146047073, "grad_norm": 
10.981132507324219, "learning_rate": 1.9999324028193033e-05, "loss": 0.164, "step": 28120 }, { "epoch": 33.93301146650573, "grad_norm": 10.612542152404785, "learning_rate": 1.999932378693824e-05, "loss": 0.1637, "step": 28130 }, { "epoch": 33.945081472540735, "grad_norm": 11.130583763122559, "learning_rate": 1.9999323545683446e-05, "loss": 0.1598, "step": 28140 }, { "epoch": 33.95715147857574, "grad_norm": 10.172077178955078, "learning_rate": 1.9999323304428652e-05, "loss": 0.164, "step": 28150 }, { "epoch": 33.96922148461074, "grad_norm": 10.356639862060547, "learning_rate": 1.9999323063173858e-05, "loss": 0.1642, "step": 28160 }, { "epoch": 33.981291490645745, "grad_norm": 10.44476318359375, "learning_rate": 1.9999322821919064e-05, "loss": 0.1651, "step": 28170 }, { "epoch": 33.99336149668075, "grad_norm": 10.919609069824219, "learning_rate": 1.9999322580664267e-05, "loss": 0.162, "step": 28180 }, { "epoch": 34.004828002414, "grad_norm": 7.881608963012695, "learning_rate": 1.9999322339409473e-05, "loss": 0.1396, "step": 28190 }, { "epoch": 34.016898008449004, "grad_norm": 8.84095573425293, "learning_rate": 1.999932209815468e-05, "loss": 0.1098, "step": 28200 }, { "epoch": 34.02896801448401, "grad_norm": 8.152153015136719, "learning_rate": 1.9999321856899886e-05, "loss": 0.1126, "step": 28210 }, { "epoch": 34.04103802051901, "grad_norm": 8.582411766052246, "learning_rate": 1.9999321615645092e-05, "loss": 0.1161, "step": 28220 }, { "epoch": 34.053108026554014, "grad_norm": 8.614612579345703, "learning_rate": 1.9999321374390298e-05, "loss": 0.1235, "step": 28230 }, { "epoch": 34.06517803258902, "grad_norm": 8.856881141662598, "learning_rate": 1.9999321133135505e-05, "loss": 0.1196, "step": 28240 }, { "epoch": 34.07724803862402, "grad_norm": 8.742995262145996, "learning_rate": 1.999932089188071e-05, "loss": 0.1211, "step": 28250 }, { "epoch": 34.089318044659024, "grad_norm": 9.13664722442627, "learning_rate": 1.9999320650625917e-05, "loss": 0.1216, "step": 28260 }, { 
"epoch": 34.10138805069403, "grad_norm": 8.20250415802002, "learning_rate": 1.9999320409371123e-05, "loss": 0.1206, "step": 28270 }, { "epoch": 34.11345805672903, "grad_norm": 8.441181182861328, "learning_rate": 1.999932016811633e-05, "loss": 0.1233, "step": 28280 }, { "epoch": 34.12552806276403, "grad_norm": 8.924670219421387, "learning_rate": 1.9999319926861536e-05, "loss": 0.1253, "step": 28290 }, { "epoch": 34.137598068799036, "grad_norm": 9.876238822937012, "learning_rate": 1.9999319685606742e-05, "loss": 0.1321, "step": 28300 }, { "epoch": 34.14966807483404, "grad_norm": 9.00535774230957, "learning_rate": 1.9999319444351948e-05, "loss": 0.1282, "step": 28310 }, { "epoch": 34.16173808086904, "grad_norm": 8.95825481414795, "learning_rate": 1.9999319203097154e-05, "loss": 0.1355, "step": 28320 }, { "epoch": 34.173808086904046, "grad_norm": 9.294838905334473, "learning_rate": 1.999931896184236e-05, "loss": 0.134, "step": 28330 }, { "epoch": 34.18587809293905, "grad_norm": 9.228461265563965, "learning_rate": 1.9999318720587567e-05, "loss": 0.1284, "step": 28340 }, { "epoch": 34.19794809897405, "grad_norm": 8.977957725524902, "learning_rate": 1.9999318479332773e-05, "loss": 0.1349, "step": 28350 }, { "epoch": 34.210018105009055, "grad_norm": 9.1202392578125, "learning_rate": 1.999931823807798e-05, "loss": 0.1315, "step": 28360 }, { "epoch": 34.22208811104406, "grad_norm": 9.245802879333496, "learning_rate": 1.9999317996823185e-05, "loss": 0.1324, "step": 28370 }, { "epoch": 34.23415811707906, "grad_norm": 9.0361909866333, "learning_rate": 1.999931775556839e-05, "loss": 0.1353, "step": 28380 }, { "epoch": 34.246228123114065, "grad_norm": 9.358994483947754, "learning_rate": 1.9999317514313598e-05, "loss": 0.1327, "step": 28390 }, { "epoch": 34.25829812914906, "grad_norm": 8.76244068145752, "learning_rate": 1.9999317273058804e-05, "loss": 0.1332, "step": 28400 }, { "epoch": 34.270368135184064, "grad_norm": 9.082619667053223, "learning_rate": 1.999931703180401e-05, 
"loss": 0.1328, "step": 28410 }, { "epoch": 34.28243814121907, "grad_norm": 9.048134803771973, "learning_rate": 1.9999316790549216e-05, "loss": 0.1365, "step": 28420 }, { "epoch": 34.29450814725407, "grad_norm": 9.205294609069824, "learning_rate": 1.999931654929442e-05, "loss": 0.1334, "step": 28430 }, { "epoch": 34.306578153289074, "grad_norm": 9.339761734008789, "learning_rate": 1.9999316308039625e-05, "loss": 0.1366, "step": 28440 }, { "epoch": 34.31864815932408, "grad_norm": 9.37782096862793, "learning_rate": 1.999931606678483e-05, "loss": 0.1344, "step": 28450 }, { "epoch": 34.33071816535908, "grad_norm": 9.586116790771484, "learning_rate": 1.9999315825530038e-05, "loss": 0.1342, "step": 28460 }, { "epoch": 34.34278817139408, "grad_norm": 9.545084953308105, "learning_rate": 1.9999315584275247e-05, "loss": 0.1379, "step": 28470 }, { "epoch": 34.35485817742909, "grad_norm": 10.005705833435059, "learning_rate": 1.9999315343020454e-05, "loss": 0.1377, "step": 28480 }, { "epoch": 34.36692818346409, "grad_norm": 10.288095474243164, "learning_rate": 1.999931510176566e-05, "loss": 0.1385, "step": 28490 }, { "epoch": 34.37899818949909, "grad_norm": 9.163973808288574, "learning_rate": 1.9999314860510866e-05, "loss": 0.1365, "step": 28500 }, { "epoch": 34.37899818949909, "eval_loss": 11.287013053894043, "eval_runtime": 8.1764, "eval_samples_per_second": 85.245, "eval_steps_per_second": 10.763, "step": 28500 }, { "epoch": 34.391068195534096, "grad_norm": 9.348191261291504, "learning_rate": 1.9999314619256072e-05, "loss": 0.1458, "step": 28510 }, { "epoch": 34.4031382015691, "grad_norm": 9.576292991638184, "learning_rate": 1.999931437800128e-05, "loss": 0.1405, "step": 28520 }, { "epoch": 34.4152082076041, "grad_norm": 9.457953453063965, "learning_rate": 1.9999314136746485e-05, "loss": 0.1378, "step": 28530 }, { "epoch": 34.427278213639106, "grad_norm": 9.476733207702637, "learning_rate": 1.999931389549169e-05, "loss": 0.1392, "step": 28540 }, { "epoch": 34.43934821967411, 
"grad_norm": 9.428503036499023, "learning_rate": 1.9999313654236897e-05, "loss": 0.1351, "step": 28550 }, { "epoch": 34.45141822570911, "grad_norm": 9.35505199432373, "learning_rate": 1.9999313412982103e-05, "loss": 0.1435, "step": 28560 }, { "epoch": 34.463488231744115, "grad_norm": 9.611515998840332, "learning_rate": 1.999931317172731e-05, "loss": 0.1414, "step": 28570 }, { "epoch": 34.47555823777912, "grad_norm": 9.001901626586914, "learning_rate": 1.9999312930472516e-05, "loss": 0.137, "step": 28580 }, { "epoch": 34.48762824381412, "grad_norm": 9.226225852966309, "learning_rate": 1.999931268921772e-05, "loss": 0.1399, "step": 28590 }, { "epoch": 34.499698249849125, "grad_norm": 9.876724243164062, "learning_rate": 1.9999312447962925e-05, "loss": 0.1345, "step": 28600 }, { "epoch": 34.51176825588413, "grad_norm": 9.366320610046387, "learning_rate": 1.999931220670813e-05, "loss": 0.1395, "step": 28610 }, { "epoch": 34.52383826191913, "grad_norm": 9.28364372253418, "learning_rate": 1.9999311965453337e-05, "loss": 0.1402, "step": 28620 }, { "epoch": 34.535908267954134, "grad_norm": 10.10129451751709, "learning_rate": 1.9999311724198544e-05, "loss": 0.1443, "step": 28630 }, { "epoch": 34.54797827398914, "grad_norm": 9.437978744506836, "learning_rate": 1.999931148294375e-05, "loss": 0.1425, "step": 28640 }, { "epoch": 34.56004828002414, "grad_norm": 10.117679595947266, "learning_rate": 1.9999311241688956e-05, "loss": 0.1452, "step": 28650 }, { "epoch": 34.572118286059144, "grad_norm": 9.202507972717285, "learning_rate": 1.9999311000434162e-05, "loss": 0.1414, "step": 28660 }, { "epoch": 34.58418829209415, "grad_norm": 9.690150260925293, "learning_rate": 1.999931075917937e-05, "loss": 0.1419, "step": 28670 }, { "epoch": 34.59625829812915, "grad_norm": 10.125447273254395, "learning_rate": 1.9999310517924575e-05, "loss": 0.1492, "step": 28680 }, { "epoch": 34.608328304164154, "grad_norm": 10.176777839660645, "learning_rate": 1.999931027666978e-05, "loss": 0.1431, "step": 
28690 }, { "epoch": 34.62039831019916, "grad_norm": 10.488736152648926, "learning_rate": 1.9999310035414987e-05, "loss": 0.1486, "step": 28700 }, { "epoch": 34.63246831623416, "grad_norm": 10.518794059753418, "learning_rate": 1.9999309794160193e-05, "loss": 0.145, "step": 28710 }, { "epoch": 34.64453832226916, "grad_norm": 9.352953910827637, "learning_rate": 1.99993095529054e-05, "loss": 0.1488, "step": 28720 }, { "epoch": 34.656608328304166, "grad_norm": 10.326713562011719, "learning_rate": 1.9999309311650606e-05, "loss": 0.1463, "step": 28730 }, { "epoch": 34.66867833433917, "grad_norm": 9.685356140136719, "learning_rate": 1.9999309070395812e-05, "loss": 0.1471, "step": 28740 }, { "epoch": 34.68074834037417, "grad_norm": 9.861787796020508, "learning_rate": 1.9999308829141018e-05, "loss": 0.1488, "step": 28750 }, { "epoch": 34.692818346409176, "grad_norm": 9.997923851013184, "learning_rate": 1.9999308587886224e-05, "loss": 0.1501, "step": 28760 }, { "epoch": 34.70488835244418, "grad_norm": 10.122523307800293, "learning_rate": 1.999930834663143e-05, "loss": 0.1479, "step": 28770 }, { "epoch": 34.71695835847918, "grad_norm": 9.41877269744873, "learning_rate": 1.9999308105376637e-05, "loss": 0.1495, "step": 28780 }, { "epoch": 34.729028364514186, "grad_norm": 10.206345558166504, "learning_rate": 1.9999307864121843e-05, "loss": 0.1501, "step": 28790 }, { "epoch": 34.74109837054919, "grad_norm": 9.55711555480957, "learning_rate": 1.999930762286705e-05, "loss": 0.1478, "step": 28800 }, { "epoch": 34.753168376584185, "grad_norm": 11.292353630065918, "learning_rate": 1.9999307381612255e-05, "loss": 0.1527, "step": 28810 }, { "epoch": 34.76523838261919, "grad_norm": 10.180397033691406, "learning_rate": 1.999930714035746e-05, "loss": 0.1532, "step": 28820 }, { "epoch": 34.77730838865419, "grad_norm": 9.52293872833252, "learning_rate": 1.9999306899102668e-05, "loss": 0.1511, "step": 28830 }, { "epoch": 34.789378394689194, "grad_norm": 9.510708808898926, "learning_rate": 
1.999930665784787e-05, "loss": 0.1484, "step": 28840 }, { "epoch": 34.8014484007242, "grad_norm": 9.971491813659668, "learning_rate": 1.9999306416593077e-05, "loss": 0.1495, "step": 28850 }, { "epoch": 34.8135184067592, "grad_norm": 9.302696228027344, "learning_rate": 1.9999306175338283e-05, "loss": 0.1467, "step": 28860 }, { "epoch": 34.825588412794204, "grad_norm": 10.543562889099121, "learning_rate": 1.999930593408349e-05, "loss": 0.1518, "step": 28870 }, { "epoch": 34.83765841882921, "grad_norm": 10.589448928833008, "learning_rate": 1.9999305692828696e-05, "loss": 0.1528, "step": 28880 }, { "epoch": 34.84972842486421, "grad_norm": 10.19006633758545, "learning_rate": 1.9999305451573902e-05, "loss": 0.1542, "step": 28890 }, { "epoch": 34.86179843089921, "grad_norm": 9.715826034545898, "learning_rate": 1.9999305210319108e-05, "loss": 0.1517, "step": 28900 }, { "epoch": 34.87386843693422, "grad_norm": 9.768033027648926, "learning_rate": 1.9999304969064314e-05, "loss": 0.1547, "step": 28910 }, { "epoch": 34.88593844296922, "grad_norm": 10.81748104095459, "learning_rate": 1.999930472780952e-05, "loss": 0.1575, "step": 28920 }, { "epoch": 34.89800844900422, "grad_norm": 10.300612449645996, "learning_rate": 1.9999304486554727e-05, "loss": 0.151, "step": 28930 }, { "epoch": 34.910078455039226, "grad_norm": 10.371293067932129, "learning_rate": 1.9999304245299933e-05, "loss": 0.1558, "step": 28940 }, { "epoch": 34.92214846107423, "grad_norm": 9.034223556518555, "learning_rate": 1.999930400404514e-05, "loss": 0.1562, "step": 28950 }, { "epoch": 34.93421846710923, "grad_norm": 10.212347030639648, "learning_rate": 1.9999303762790345e-05, "loss": 0.1529, "step": 28960 }, { "epoch": 34.946288473144236, "grad_norm": 10.644030570983887, "learning_rate": 1.999930352153555e-05, "loss": 0.1603, "step": 28970 }, { "epoch": 34.95835847917924, "grad_norm": 9.798059463500977, "learning_rate": 1.9999303280280758e-05, "loss": 0.1564, "step": 28980 }, { "epoch": 34.97042848521424, 
"grad_norm": 10.215011596679688, "learning_rate": 1.9999303039025964e-05, "loss": 0.152, "step": 28990 }, { "epoch": 34.982498491249245, "grad_norm": 9.716732025146484, "learning_rate": 1.999930279777117e-05, "loss": 0.155, "step": 29000 }, { "epoch": 34.982498491249245, "eval_loss": 11.33104133605957, "eval_runtime": 8.1753, "eval_samples_per_second": 85.257, "eval_steps_per_second": 10.764, "step": 29000 }, { "epoch": 34.99456849728425, "grad_norm": 11.031145095825195, "learning_rate": 1.9999302556516376e-05, "loss": 0.161, "step": 29010 }, { "epoch": 35.0060350030175, "grad_norm": 9.102777481079102, "learning_rate": 1.9999302315261583e-05, "loss": 0.1314, "step": 29020 }, { "epoch": 35.018105009052505, "grad_norm": 8.965258598327637, "learning_rate": 1.999930207400679e-05, "loss": 0.11, "step": 29030 }, { "epoch": 35.03017501508751, "grad_norm": 8.537543296813965, "learning_rate": 1.9999301832751995e-05, "loss": 0.1061, "step": 29040 }, { "epoch": 35.04224502112251, "grad_norm": 8.302045822143555, "learning_rate": 1.99993015914972e-05, "loss": 0.1111, "step": 29050 }, { "epoch": 35.054315027157514, "grad_norm": 8.963780403137207, "learning_rate": 1.9999301350242407e-05, "loss": 0.1135, "step": 29060 }, { "epoch": 35.06638503319252, "grad_norm": 8.530878067016602, "learning_rate": 1.9999301108987614e-05, "loss": 0.1143, "step": 29070 }, { "epoch": 35.07845503922752, "grad_norm": 9.114927291870117, "learning_rate": 1.999930086773282e-05, "loss": 0.1167, "step": 29080 }, { "epoch": 35.090525045262524, "grad_norm": 8.468751907348633, "learning_rate": 1.9999300626478026e-05, "loss": 0.1182, "step": 29090 }, { "epoch": 35.10259505129753, "grad_norm": 8.571985244750977, "learning_rate": 1.9999300385223232e-05, "loss": 0.1171, "step": 29100 }, { "epoch": 35.11466505733253, "grad_norm": 9.078299522399902, "learning_rate": 1.999930014396844e-05, "loss": 0.1209, "step": 29110 }, { "epoch": 35.12673506336753, "grad_norm": 8.322518348693848, "learning_rate": 
1.9999299902713645e-05, "loss": 0.1164, "step": 29120 }, { "epoch": 35.13880506940254, "grad_norm": 8.925374984741211, "learning_rate": 1.999929966145885e-05, "loss": 0.1213, "step": 29130 }, { "epoch": 35.15087507543754, "grad_norm": 8.395106315612793, "learning_rate": 1.9999299420204057e-05, "loss": 0.1205, "step": 29140 }, { "epoch": 35.16294508147254, "grad_norm": 8.454405784606934, "learning_rate": 1.9999299178949263e-05, "loss": 0.1212, "step": 29150 }, { "epoch": 35.175015087507546, "grad_norm": 8.937405586242676, "learning_rate": 1.999929893769447e-05, "loss": 0.1234, "step": 29160 }, { "epoch": 35.18708509354255, "grad_norm": 9.230315208435059, "learning_rate": 1.9999298696439676e-05, "loss": 0.1247, "step": 29170 }, { "epoch": 35.19915509957755, "grad_norm": 8.744706153869629, "learning_rate": 1.9999298455184882e-05, "loss": 0.1242, "step": 29180 }, { "epoch": 35.211225105612556, "grad_norm": 8.619563102722168, "learning_rate": 1.9999298213930088e-05, "loss": 0.1282, "step": 29190 }, { "epoch": 35.22329511164756, "grad_norm": 9.715214729309082, "learning_rate": 1.9999297972675294e-05, "loss": 0.1306, "step": 29200 }, { "epoch": 35.23536511768256, "grad_norm": 10.234919548034668, "learning_rate": 1.99992977314205e-05, "loss": 0.1289, "step": 29210 }, { "epoch": 35.247435123717565, "grad_norm": 9.236870765686035, "learning_rate": 1.9999297490165707e-05, "loss": 0.1328, "step": 29220 }, { "epoch": 35.25950512975256, "grad_norm": 8.606081008911133, "learning_rate": 1.9999297248910913e-05, "loss": 0.131, "step": 29230 }, { "epoch": 35.271575135787565, "grad_norm": 8.464536666870117, "learning_rate": 1.999929700765612e-05, "loss": 0.1251, "step": 29240 }, { "epoch": 35.28364514182257, "grad_norm": 8.524144172668457, "learning_rate": 1.9999296766401322e-05, "loss": 0.1303, "step": 29250 }, { "epoch": 35.29571514785757, "grad_norm": 9.072505950927734, "learning_rate": 1.999929652514653e-05, "loss": 0.1301, "step": 29260 }, { "epoch": 35.307785153892574, 
"grad_norm": 9.631491661071777, "learning_rate": 1.9999296283891735e-05, "loss": 0.1302, "step": 29270 }, { "epoch": 35.31985515992758, "grad_norm": 9.176788330078125, "learning_rate": 1.999929604263694e-05, "loss": 0.1308, "step": 29280 }, { "epoch": 35.33192516596258, "grad_norm": 9.018299102783203, "learning_rate": 1.9999295801382147e-05, "loss": 0.1318, "step": 29290 }, { "epoch": 35.343995171997584, "grad_norm": 9.897069931030273, "learning_rate": 1.9999295560127353e-05, "loss": 0.1323, "step": 29300 }, { "epoch": 35.35606517803259, "grad_norm": 8.919736862182617, "learning_rate": 1.999929531887256e-05, "loss": 0.1268, "step": 29310 }, { "epoch": 35.36813518406759, "grad_norm": 9.366412162780762, "learning_rate": 1.9999295077617766e-05, "loss": 0.1342, "step": 29320 }, { "epoch": 35.38020519010259, "grad_norm": 9.501954078674316, "learning_rate": 1.9999294836362972e-05, "loss": 0.1339, "step": 29330 }, { "epoch": 35.3922751961376, "grad_norm": 9.659928321838379, "learning_rate": 1.9999294595108178e-05, "loss": 0.136, "step": 29340 }, { "epoch": 35.4043452021726, "grad_norm": 9.482196807861328, "learning_rate": 1.9999294353853384e-05, "loss": 0.1332, "step": 29350 }, { "epoch": 35.4164152082076, "grad_norm": 9.274578094482422, "learning_rate": 1.999929411259859e-05, "loss": 0.132, "step": 29360 }, { "epoch": 35.428485214242606, "grad_norm": 9.490325927734375, "learning_rate": 1.9999293871343797e-05, "loss": 0.1344, "step": 29370 }, { "epoch": 35.44055522027761, "grad_norm": 8.718451499938965, "learning_rate": 1.9999293630089003e-05, "loss": 0.1279, "step": 29380 }, { "epoch": 35.45262522631261, "grad_norm": 9.628144264221191, "learning_rate": 1.999929338883421e-05, "loss": 0.1289, "step": 29390 }, { "epoch": 35.464695232347616, "grad_norm": 8.303135871887207, "learning_rate": 1.9999293147579415e-05, "loss": 0.1395, "step": 29400 }, { "epoch": 35.47676523838262, "grad_norm": 9.506521224975586, "learning_rate": 1.999929290632462e-05, "loss": 0.1382, "step": 29410 
}, { "epoch": 35.48883524441762, "grad_norm": 9.199543952941895, "learning_rate": 1.9999292665069828e-05, "loss": 0.1356, "step": 29420 }, { "epoch": 35.500905250452625, "grad_norm": 9.289144515991211, "learning_rate": 1.9999292423815034e-05, "loss": 0.1373, "step": 29430 }, { "epoch": 35.51297525648763, "grad_norm": 9.397150993347168, "learning_rate": 1.999929218256024e-05, "loss": 0.1333, "step": 29440 }, { "epoch": 35.52504526252263, "grad_norm": 9.298787117004395, "learning_rate": 1.9999291941305446e-05, "loss": 0.1402, "step": 29450 }, { "epoch": 35.537115268557635, "grad_norm": 9.447854995727539, "learning_rate": 1.9999291700050653e-05, "loss": 0.1391, "step": 29460 }, { "epoch": 35.54918527459264, "grad_norm": 9.1713228225708, "learning_rate": 1.999929145879586e-05, "loss": 0.1368, "step": 29470 }, { "epoch": 35.56125528062764, "grad_norm": 9.325959205627441, "learning_rate": 1.9999291217541065e-05, "loss": 0.1417, "step": 29480 }, { "epoch": 35.573325286662644, "grad_norm": 9.370553970336914, "learning_rate": 1.999929097628627e-05, "loss": 0.1398, "step": 29490 }, { "epoch": 35.58539529269765, "grad_norm": 9.225945472717285, "learning_rate": 1.9999290735031474e-05, "loss": 0.136, "step": 29500 }, { "epoch": 35.58539529269765, "eval_loss": 11.346336364746094, "eval_runtime": 8.1389, "eval_samples_per_second": 85.638, "eval_steps_per_second": 10.812, "step": 29500 }, { "epoch": 35.59746529873265, "grad_norm": 9.87432861328125, "learning_rate": 1.999929049377668e-05, "loss": 0.1394, "step": 29510 }, { "epoch": 35.609535304767654, "grad_norm": 8.80897331237793, "learning_rate": 1.9999290252521887e-05, "loss": 0.1371, "step": 29520 }, { "epoch": 35.62160531080266, "grad_norm": 9.112578392028809, "learning_rate": 1.9999290011267093e-05, "loss": 0.1381, "step": 29530 }, { "epoch": 35.63367531683766, "grad_norm": 9.709457397460938, "learning_rate": 1.99992897700123e-05, "loss": 0.1378, "step": 29540 }, { "epoch": 35.64574532287266, "grad_norm": 9.650135040283203, 
"learning_rate": 1.999928952875751e-05, "loss": 0.1391, "step": 29550 }, { "epoch": 35.65781532890767, "grad_norm": 8.385799407958984, "learning_rate": 1.9999289287502715e-05, "loss": 0.1415, "step": 29560 }, { "epoch": 35.66988533494267, "grad_norm": 9.784273147583008, "learning_rate": 1.999928904624792e-05, "loss": 0.142, "step": 29570 }, { "epoch": 35.68195534097767, "grad_norm": 10.408834457397461, "learning_rate": 1.9999288804993127e-05, "loss": 0.1444, "step": 29580 }, { "epoch": 35.694025347012676, "grad_norm": 9.600106239318848, "learning_rate": 1.9999288563738333e-05, "loss": 0.1458, "step": 29590 }, { "epoch": 35.70609535304768, "grad_norm": 9.361586570739746, "learning_rate": 1.999928832248354e-05, "loss": 0.1457, "step": 29600 }, { "epoch": 35.71816535908268, "grad_norm": 9.63106918334961, "learning_rate": 1.9999288081228746e-05, "loss": 0.1433, "step": 29610 }, { "epoch": 35.730235365117686, "grad_norm": 10.052619934082031, "learning_rate": 1.9999287839973952e-05, "loss": 0.1436, "step": 29620 }, { "epoch": 35.74230537115269, "grad_norm": 9.57651424407959, "learning_rate": 1.9999287598719158e-05, "loss": 0.1435, "step": 29630 }, { "epoch": 35.754375377187685, "grad_norm": 9.938274383544922, "learning_rate": 1.9999287357464365e-05, "loss": 0.1492, "step": 29640 }, { "epoch": 35.76644538322269, "grad_norm": 9.800298690795898, "learning_rate": 1.999928711620957e-05, "loss": 0.1452, "step": 29650 }, { "epoch": 35.77851538925769, "grad_norm": 10.621685028076172, "learning_rate": 1.9999286874954777e-05, "loss": 0.1462, "step": 29660 }, { "epoch": 35.790585395292695, "grad_norm": 9.822294235229492, "learning_rate": 1.999928663369998e-05, "loss": 0.1459, "step": 29670 }, { "epoch": 35.8026554013277, "grad_norm": 10.086284637451172, "learning_rate": 1.9999286392445186e-05, "loss": 0.1432, "step": 29680 }, { "epoch": 35.8147254073627, "grad_norm": 10.196678161621094, "learning_rate": 1.9999286151190392e-05, "loss": 0.1447, "step": 29690 }, { "epoch": 
35.826795413397704, "grad_norm": 9.7380952835083, "learning_rate": 1.99992859099356e-05, "loss": 0.143, "step": 29700 }, { "epoch": 35.83886541943271, "grad_norm": 9.59929084777832, "learning_rate": 1.9999285668680805e-05, "loss": 0.1465, "step": 29710 }, { "epoch": 35.85093542546771, "grad_norm": 10.239871978759766, "learning_rate": 1.999928542742601e-05, "loss": 0.148, "step": 29720 }, { "epoch": 35.863005431502714, "grad_norm": 9.954971313476562, "learning_rate": 1.9999285186171217e-05, "loss": 0.1479, "step": 29730 }, { "epoch": 35.87507543753772, "grad_norm": 9.672239303588867, "learning_rate": 1.9999284944916423e-05, "loss": 0.1464, "step": 29740 }, { "epoch": 35.88714544357272, "grad_norm": 9.317548751831055, "learning_rate": 1.999928470366163e-05, "loss": 0.1459, "step": 29750 }, { "epoch": 35.89921544960772, "grad_norm": 9.820948600769043, "learning_rate": 1.9999284462406836e-05, "loss": 0.1445, "step": 29760 }, { "epoch": 35.91128545564273, "grad_norm": 10.401371955871582, "learning_rate": 1.9999284221152042e-05, "loss": 0.15, "step": 29770 }, { "epoch": 35.92335546167773, "grad_norm": 10.365264892578125, "learning_rate": 1.9999283979897248e-05, "loss": 0.1463, "step": 29780 }, { "epoch": 35.93542546771273, "grad_norm": 10.367101669311523, "learning_rate": 1.9999283738642454e-05, "loss": 0.1513, "step": 29790 }, { "epoch": 35.947495473747736, "grad_norm": 10.045818328857422, "learning_rate": 1.999928349738766e-05, "loss": 0.1485, "step": 29800 }, { "epoch": 35.95956547978274, "grad_norm": 10.025796890258789, "learning_rate": 1.9999283256132867e-05, "loss": 0.1489, "step": 29810 }, { "epoch": 35.97163548581774, "grad_norm": 10.495179176330566, "learning_rate": 1.9999283014878073e-05, "loss": 0.151, "step": 29820 }, { "epoch": 35.983705491852746, "grad_norm": 9.429216384887695, "learning_rate": 1.999928277362328e-05, "loss": 0.1498, "step": 29830 }, { "epoch": 35.99577549788775, "grad_norm": 10.397522926330566, "learning_rate": 1.9999282532368485e-05, 
"loss": 0.1475, "step": 29840 }, { "epoch": 36.007242003621, "grad_norm": 7.8629560470581055, "learning_rate": 1.999928229111369e-05, "loss": 0.1241, "step": 29850 }, { "epoch": 36.019312009656005, "grad_norm": 8.023923873901367, "learning_rate": 1.9999282049858898e-05, "loss": 0.1087, "step": 29860 }, { "epoch": 36.03138201569101, "grad_norm": 8.240487098693848, "learning_rate": 1.9999281808604104e-05, "loss": 0.108, "step": 29870 }, { "epoch": 36.04345202172601, "grad_norm": 8.394013404846191, "learning_rate": 1.999928156734931e-05, "loss": 0.1092, "step": 29880 }, { "epoch": 36.055522027761015, "grad_norm": 7.7342848777771, "learning_rate": 1.9999281326094517e-05, "loss": 0.1143, "step": 29890 }, { "epoch": 36.06759203379602, "grad_norm": 8.624968528747559, "learning_rate": 1.9999281084839723e-05, "loss": 0.1124, "step": 29900 }, { "epoch": 36.07966203983102, "grad_norm": 9.117629051208496, "learning_rate": 1.999928084358493e-05, "loss": 0.1146, "step": 29910 }, { "epoch": 36.091732045866024, "grad_norm": 8.539716720581055, "learning_rate": 1.9999280602330132e-05, "loss": 0.1105, "step": 29920 }, { "epoch": 36.10380205190103, "grad_norm": 8.98807430267334, "learning_rate": 1.9999280361075338e-05, "loss": 0.1173, "step": 29930 }, { "epoch": 36.11587205793603, "grad_norm": 9.141290664672852, "learning_rate": 1.9999280119820544e-05, "loss": 0.1149, "step": 29940 }, { "epoch": 36.127942063971034, "grad_norm": 8.324451446533203, "learning_rate": 1.999927987856575e-05, "loss": 0.1165, "step": 29950 }, { "epoch": 36.14001207000604, "grad_norm": 9.281306266784668, "learning_rate": 1.9999279637310957e-05, "loss": 0.1182, "step": 29960 }, { "epoch": 36.15208207604104, "grad_norm": 8.46061897277832, "learning_rate": 1.9999279396056163e-05, "loss": 0.1175, "step": 29970 }, { "epoch": 36.16415208207604, "grad_norm": 8.680758476257324, "learning_rate": 1.999927915480137e-05, "loss": 0.1218, "step": 29980 }, { "epoch": 36.17622208811105, "grad_norm": 7.956368923187256, 
"learning_rate": 1.9999278913546575e-05, "loss": 0.1168, "step": 29990 }, { "epoch": 36.18829209414605, "grad_norm": 10.293691635131836, "learning_rate": 1.999927867229178e-05, "loss": 0.1232, "step": 30000 }, { "epoch": 36.18829209414605, "eval_loss": 11.364558219909668, "eval_runtime": 8.138, "eval_samples_per_second": 85.648, "eval_steps_per_second": 10.813, "step": 30000 }, { "epoch": 36.20036210018105, "grad_norm": 9.237372398376465, "learning_rate": 1.9999278431036988e-05, "loss": 0.1224, "step": 30010 }, { "epoch": 36.212432106216056, "grad_norm": 8.906211853027344, "learning_rate": 1.9999278189782194e-05, "loss": 0.1208, "step": 30020 }, { "epoch": 36.22450211225106, "grad_norm": 8.892001152038574, "learning_rate": 1.99992779485274e-05, "loss": 0.1271, "step": 30030 }, { "epoch": 36.23657211828606, "grad_norm": 8.404146194458008, "learning_rate": 1.9999277707272606e-05, "loss": 0.1265, "step": 30040 }, { "epoch": 36.248642124321066, "grad_norm": 8.975317001342773, "learning_rate": 1.9999277466017813e-05, "loss": 0.1231, "step": 30050 }, { "epoch": 36.26071213035606, "grad_norm": 8.801883697509766, "learning_rate": 1.999927722476302e-05, "loss": 0.127, "step": 30060 }, { "epoch": 36.272782136391065, "grad_norm": 9.322874069213867, "learning_rate": 1.9999276983508225e-05, "loss": 0.1234, "step": 30070 }, { "epoch": 36.28485214242607, "grad_norm": 9.01101303100586, "learning_rate": 1.999927674225343e-05, "loss": 0.1276, "step": 30080 }, { "epoch": 36.29692214846107, "grad_norm": 9.786666870117188, "learning_rate": 1.9999276500998637e-05, "loss": 0.1277, "step": 30090 }, { "epoch": 36.308992154496075, "grad_norm": 9.532572746276855, "learning_rate": 1.9999276259743844e-05, "loss": 0.1249, "step": 30100 }, { "epoch": 36.32106216053108, "grad_norm": 9.202963829040527, "learning_rate": 1.999927601848905e-05, "loss": 0.1282, "step": 30110 }, { "epoch": 36.33313216656608, "grad_norm": 9.997757911682129, "learning_rate": 1.9999275777234256e-05, "loss": 0.1233, 
"step": 30120 }, { "epoch": 36.345202172601084, "grad_norm": 9.110601425170898, "learning_rate": 1.9999275535979462e-05, "loss": 0.1269, "step": 30130 }, { "epoch": 36.35727217863609, "grad_norm": 9.809715270996094, "learning_rate": 1.999927529472467e-05, "loss": 0.1235, "step": 30140 }, { "epoch": 36.36934218467109, "grad_norm": 8.262893676757812, "learning_rate": 1.9999275053469875e-05, "loss": 0.1306, "step": 30150 }, { "epoch": 36.381412190706094, "grad_norm": 8.671040534973145, "learning_rate": 1.999927481221508e-05, "loss": 0.1288, "step": 30160 }, { "epoch": 36.3934821967411, "grad_norm": 9.564306259155273, "learning_rate": 1.9999274570960287e-05, "loss": 0.1315, "step": 30170 }, { "epoch": 36.4055522027761, "grad_norm": 8.683401107788086, "learning_rate": 1.9999274329705493e-05, "loss": 0.1308, "step": 30180 }, { "epoch": 36.4176222088111, "grad_norm": 9.871010780334473, "learning_rate": 1.99992740884507e-05, "loss": 0.13, "step": 30190 }, { "epoch": 36.429692214846106, "grad_norm": 9.824578285217285, "learning_rate": 1.9999273847195906e-05, "loss": 0.1358, "step": 30200 }, { "epoch": 36.44176222088111, "grad_norm": 9.746700286865234, "learning_rate": 1.9999273605941112e-05, "loss": 0.1309, "step": 30210 }, { "epoch": 36.45383222691611, "grad_norm": 10.10595703125, "learning_rate": 1.9999273364686318e-05, "loss": 0.1336, "step": 30220 }, { "epoch": 36.465902232951116, "grad_norm": 8.847012519836426, "learning_rate": 1.9999273123431524e-05, "loss": 0.1346, "step": 30230 }, { "epoch": 36.47797223898612, "grad_norm": 9.691594123840332, "learning_rate": 1.999927288217673e-05, "loss": 0.1311, "step": 30240 }, { "epoch": 36.49004224502112, "grad_norm": 9.29859447479248, "learning_rate": 1.9999272640921937e-05, "loss": 0.1317, "step": 30250 }, { "epoch": 36.502112251056126, "grad_norm": 8.915749549865723, "learning_rate": 1.9999272399667143e-05, "loss": 0.1316, "step": 30260 }, { "epoch": 36.51418225709113, "grad_norm": 9.240715980529785, "learning_rate": 
1.999927215841235e-05, "loss": 0.1288, "step": 30270 }, { "epoch": 36.52625226312613, "grad_norm": 8.645984649658203, "learning_rate": 1.9999271917157556e-05, "loss": 0.1263, "step": 30280 }, { "epoch": 36.538322269161135, "grad_norm": 9.21142578125, "learning_rate": 1.9999271675902762e-05, "loss": 0.1322, "step": 30290 }, { "epoch": 36.55039227519614, "grad_norm": 9.306364059448242, "learning_rate": 1.9999271434647968e-05, "loss": 0.1333, "step": 30300 }, { "epoch": 36.56246228123114, "grad_norm": 9.211089134216309, "learning_rate": 1.9999271193393174e-05, "loss": 0.1313, "step": 30310 }, { "epoch": 36.574532287266145, "grad_norm": 10.332683563232422, "learning_rate": 1.999927095213838e-05, "loss": 0.1378, "step": 30320 }, { "epoch": 36.58660229330115, "grad_norm": 9.208658218383789, "learning_rate": 1.9999270710883583e-05, "loss": 0.1322, "step": 30330 }, { "epoch": 36.59867229933615, "grad_norm": 9.629827499389648, "learning_rate": 1.999927046962879e-05, "loss": 0.1327, "step": 30340 }, { "epoch": 36.610742305371154, "grad_norm": 9.144984245300293, "learning_rate": 1.9999270228373996e-05, "loss": 0.1318, "step": 30350 }, { "epoch": 36.62281231140616, "grad_norm": 8.836241722106934, "learning_rate": 1.9999269987119202e-05, "loss": 0.1322, "step": 30360 }, { "epoch": 36.63488231744116, "grad_norm": 9.631068229675293, "learning_rate": 1.9999269745864408e-05, "loss": 0.1363, "step": 30370 }, { "epoch": 36.646952323476164, "grad_norm": 9.774215698242188, "learning_rate": 1.9999269504609614e-05, "loss": 0.1349, "step": 30380 }, { "epoch": 36.65902232951117, "grad_norm": 8.905024528503418, "learning_rate": 1.999926926335482e-05, "loss": 0.1402, "step": 30390 }, { "epoch": 36.67109233554617, "grad_norm": 9.56110954284668, "learning_rate": 1.9999269022100027e-05, "loss": 0.1387, "step": 30400 }, { "epoch": 36.68316234158117, "grad_norm": 9.786452293395996, "learning_rate": 1.9999268780845233e-05, "loss": 0.1405, "step": 30410 }, { "epoch": 36.69523234761618, "grad_norm": 
9.062438011169434, "learning_rate": 1.999926853959044e-05, "loss": 0.1355, "step": 30420 }, { "epoch": 36.70730235365118, "grad_norm": 10.933148384094238, "learning_rate": 1.9999268298335645e-05, "loss": 0.1382, "step": 30430 }, { "epoch": 36.71937235968618, "grad_norm": 9.422237396240234, "learning_rate": 1.999926805708085e-05, "loss": 0.1371, "step": 30440 }, { "epoch": 36.731442365721186, "grad_norm": 9.365809440612793, "learning_rate": 1.9999267815826058e-05, "loss": 0.138, "step": 30450 }, { "epoch": 36.74351237175619, "grad_norm": 9.571290969848633, "learning_rate": 1.9999267574571264e-05, "loss": 0.1361, "step": 30460 }, { "epoch": 36.755582377791185, "grad_norm": 9.274819374084473, "learning_rate": 1.999926733331647e-05, "loss": 0.1372, "step": 30470 }, { "epoch": 36.76765238382619, "grad_norm": 9.297545433044434, "learning_rate": 1.9999267092061676e-05, "loss": 0.1366, "step": 30480 }, { "epoch": 36.77972238986119, "grad_norm": 9.004526138305664, "learning_rate": 1.9999266850806883e-05, "loss": 0.1363, "step": 30490 }, { "epoch": 36.791792395896195, "grad_norm": 9.008610725402832, "learning_rate": 1.999926660955209e-05, "loss": 0.1414, "step": 30500 }, { "epoch": 36.791792395896195, "eval_loss": 11.410135269165039, "eval_runtime": 8.1297, "eval_samples_per_second": 85.735, "eval_steps_per_second": 10.824, "step": 30500 }, { "epoch": 36.8038624019312, "grad_norm": 9.570773124694824, "learning_rate": 1.9999266368297295e-05, "loss": 0.1442, "step": 30510 }, { "epoch": 36.8159324079662, "grad_norm": 9.762811660766602, "learning_rate": 1.99992661270425e-05, "loss": 0.1392, "step": 30520 }, { "epoch": 36.828002414001205, "grad_norm": 9.555707931518555, "learning_rate": 1.9999265885787708e-05, "loss": 0.1388, "step": 30530 }, { "epoch": 36.84007242003621, "grad_norm": 9.60499382019043, "learning_rate": 1.9999265644532914e-05, "loss": 0.1398, "step": 30540 }, { "epoch": 36.85214242607121, "grad_norm": 8.80374813079834, "learning_rate": 1.999926540327812e-05, 
"loss": 0.1454, "step": 30550 }, { "epoch": 36.864212432106214, "grad_norm": 11.0659818649292, "learning_rate": 1.9999265162023326e-05, "loss": 0.1356, "step": 30560 }, { "epoch": 36.87628243814122, "grad_norm": 10.198934555053711, "learning_rate": 1.9999264920768532e-05, "loss": 0.1456, "step": 30570 }, { "epoch": 36.88835244417622, "grad_norm": 9.033019065856934, "learning_rate": 1.9999264679513735e-05, "loss": 0.1382, "step": 30580 }, { "epoch": 36.900422450211224, "grad_norm": 9.771589279174805, "learning_rate": 1.999926443825894e-05, "loss": 0.1401, "step": 30590 }, { "epoch": 36.91249245624623, "grad_norm": 9.469162940979004, "learning_rate": 1.9999264197004148e-05, "loss": 0.1397, "step": 30600 }, { "epoch": 36.92456246228123, "grad_norm": 10.10308837890625, "learning_rate": 1.9999263955749354e-05, "loss": 0.145, "step": 30610 }, { "epoch": 36.93663246831623, "grad_norm": 9.249743461608887, "learning_rate": 1.999926371449456e-05, "loss": 0.1478, "step": 30620 }, { "epoch": 36.948702474351236, "grad_norm": 10.681233406066895, "learning_rate": 1.999926347323977e-05, "loss": 0.1461, "step": 30630 }, { "epoch": 36.96077248038624, "grad_norm": 10.167328834533691, "learning_rate": 1.9999263231984976e-05, "loss": 0.1434, "step": 30640 }, { "epoch": 36.97284248642124, "grad_norm": 9.342216491699219, "learning_rate": 1.9999262990730182e-05, "loss": 0.1452, "step": 30650 }, { "epoch": 36.984912492456246, "grad_norm": 9.623711585998535, "learning_rate": 1.999926274947539e-05, "loss": 0.144, "step": 30660 }, { "epoch": 36.99698249849125, "grad_norm": 9.673982620239258, "learning_rate": 1.9999262508220595e-05, "loss": 0.1425, "step": 30670 }, { "epoch": 37.0084490042245, "grad_norm": 7.830747604370117, "learning_rate": 1.99992622669658e-05, "loss": 0.1068, "step": 30680 }, { "epoch": 37.020519010259505, "grad_norm": 7.713870048522949, "learning_rate": 1.9999262025711007e-05, "loss": 0.1005, "step": 30690 }, { "epoch": 37.03258901629451, "grad_norm": 7.8909220695495605, 
"learning_rate": 1.9999261784456213e-05, "loss": 0.0998, "step": 30700 }, { "epoch": 37.04465902232951, "grad_norm": 8.425085067749023, "learning_rate": 1.999926154320142e-05, "loss": 0.1069, "step": 30710 }, { "epoch": 37.056729028364515, "grad_norm": 8.441025733947754, "learning_rate": 1.9999261301946626e-05, "loss": 0.1053, "step": 30720 }, { "epoch": 37.06879903439952, "grad_norm": 8.270679473876953, "learning_rate": 1.9999261060691832e-05, "loss": 0.1111, "step": 30730 }, { "epoch": 37.08086904043452, "grad_norm": 9.201637268066406, "learning_rate": 1.9999260819437035e-05, "loss": 0.1139, "step": 30740 }, { "epoch": 37.092939046469525, "grad_norm": 8.419515609741211, "learning_rate": 1.999926057818224e-05, "loss": 0.1151, "step": 30750 }, { "epoch": 37.10500905250453, "grad_norm": 7.9073662757873535, "learning_rate": 1.9999260336927447e-05, "loss": 0.1124, "step": 30760 }, { "epoch": 37.11707905853953, "grad_norm": 8.387486457824707, "learning_rate": 1.9999260095672653e-05, "loss": 0.1156, "step": 30770 }, { "epoch": 37.129149064574534, "grad_norm": 9.527688026428223, "learning_rate": 1.999925985441786e-05, "loss": 0.1159, "step": 30780 }, { "epoch": 37.14121907060954, "grad_norm": 8.396798133850098, "learning_rate": 1.9999259613163066e-05, "loss": 0.1151, "step": 30790 }, { "epoch": 37.15328907664454, "grad_norm": 8.3779296875, "learning_rate": 1.9999259371908272e-05, "loss": 0.1157, "step": 30800 }, { "epoch": 37.165359082679544, "grad_norm": 8.214798927307129, "learning_rate": 1.9999259130653478e-05, "loss": 0.1136, "step": 30810 }, { "epoch": 37.17742908871455, "grad_norm": 8.152816772460938, "learning_rate": 1.9999258889398684e-05, "loss": 0.1126, "step": 30820 }, { "epoch": 37.18949909474955, "grad_norm": 8.921475410461426, "learning_rate": 1.999925864814389e-05, "loss": 0.1157, "step": 30830 }, { "epoch": 37.20156910078455, "grad_norm": 8.656089782714844, "learning_rate": 1.9999258406889097e-05, "loss": 0.1162, "step": 30840 }, { "epoch": 
37.213639106819556, "grad_norm": 8.898589134216309, "learning_rate": 1.9999258165634303e-05, "loss": 0.118, "step": 30850 }, { "epoch": 37.22570911285456, "grad_norm": 10.0120267868042, "learning_rate": 1.999925792437951e-05, "loss": 0.1187, "step": 30860 }, { "epoch": 37.23777911888956, "grad_norm": 9.220602035522461, "learning_rate": 1.9999257683124715e-05, "loss": 0.12, "step": 30870 }, { "epoch": 37.249849124924566, "grad_norm": 8.645119667053223, "learning_rate": 1.999925744186992e-05, "loss": 0.1223, "step": 30880 }, { "epoch": 37.26191913095956, "grad_norm": 9.269572257995605, "learning_rate": 1.9999257200615128e-05, "loss": 0.1225, "step": 30890 }, { "epoch": 37.273989136994565, "grad_norm": 8.866302490234375, "learning_rate": 1.9999256959360334e-05, "loss": 0.1183, "step": 30900 }, { "epoch": 37.28605914302957, "grad_norm": 9.262826919555664, "learning_rate": 1.999925671810554e-05, "loss": 0.1176, "step": 30910 }, { "epoch": 37.29812914906457, "grad_norm": 8.473628044128418, "learning_rate": 1.9999256476850747e-05, "loss": 0.1232, "step": 30920 }, { "epoch": 37.310199155099575, "grad_norm": 8.983037948608398, "learning_rate": 1.9999256235595953e-05, "loss": 0.1247, "step": 30930 }, { "epoch": 37.32226916113458, "grad_norm": 9.66059684753418, "learning_rate": 1.999925599434116e-05, "loss": 0.1193, "step": 30940 }, { "epoch": 37.33433916716958, "grad_norm": 9.229266166687012, "learning_rate": 1.9999255753086365e-05, "loss": 0.125, "step": 30950 }, { "epoch": 37.346409173204584, "grad_norm": 9.23241138458252, "learning_rate": 1.999925551183157e-05, "loss": 0.1249, "step": 30960 }, { "epoch": 37.35847917923959, "grad_norm": 8.589261054992676, "learning_rate": 1.9999255270576778e-05, "loss": 0.1224, "step": 30970 }, { "epoch": 37.37054918527459, "grad_norm": 8.832204818725586, "learning_rate": 1.9999255029321984e-05, "loss": 0.1216, "step": 30980 }, { "epoch": 37.382619191309594, "grad_norm": 8.70802116394043, "learning_rate": 1.9999254788067187e-05, "loss": 
0.1242, "step": 30990 }, { "epoch": 37.3946891973446, "grad_norm": 8.862825393676758, "learning_rate": 1.9999254546812393e-05, "loss": 0.1245, "step": 31000 }, { "epoch": 37.3946891973446, "eval_loss": 11.43028450012207, "eval_runtime": 8.1441, "eval_samples_per_second": 85.584, "eval_steps_per_second": 10.805, "step": 31000 }, { "epoch": 37.4067592033796, "grad_norm": 10.268559455871582, "learning_rate": 1.99992543055576e-05, "loss": 0.129, "step": 31010 }, { "epoch": 37.418829209414604, "grad_norm": 9.126133918762207, "learning_rate": 1.9999254064302805e-05, "loss": 0.1276, "step": 31020 }, { "epoch": 37.43089921544961, "grad_norm": 9.094511985778809, "learning_rate": 1.999925382304801e-05, "loss": 0.1265, "step": 31030 }, { "epoch": 37.44296922148461, "grad_norm": 8.457136154174805, "learning_rate": 1.9999253581793218e-05, "loss": 0.126, "step": 31040 }, { "epoch": 37.45503922751961, "grad_norm": 9.606029510498047, "learning_rate": 1.9999253340538424e-05, "loss": 0.1308, "step": 31050 }, { "epoch": 37.467109233554616, "grad_norm": 8.61480712890625, "learning_rate": 1.999925309928363e-05, "loss": 0.1254, "step": 31060 }, { "epoch": 37.47917923958962, "grad_norm": 8.073375701904297, "learning_rate": 1.9999252858028836e-05, "loss": 0.1282, "step": 31070 }, { "epoch": 37.49124924562462, "grad_norm": 8.420039176940918, "learning_rate": 1.9999252616774043e-05, "loss": 0.1283, "step": 31080 }, { "epoch": 37.503319251659626, "grad_norm": 9.155001640319824, "learning_rate": 1.999925237551925e-05, "loss": 0.1272, "step": 31090 }, { "epoch": 37.51538925769463, "grad_norm": 10.294750213623047, "learning_rate": 1.9999252134264455e-05, "loss": 0.1301, "step": 31100 }, { "epoch": 37.52745926372963, "grad_norm": 8.558188438415527, "learning_rate": 1.999925189300966e-05, "loss": 0.1283, "step": 31110 }, { "epoch": 37.539529269764635, "grad_norm": 9.681791305541992, "learning_rate": 1.9999251651754867e-05, "loss": 0.1362, "step": 31120 }, { "epoch": 37.55159927579964, 
"grad_norm": 8.562723159790039, "learning_rate": 1.9999251410500074e-05, "loss": 0.1286, "step": 31130 }, { "epoch": 37.56366928183464, "grad_norm": 9.26304817199707, "learning_rate": 1.999925116924528e-05, "loss": 0.1314, "step": 31140 }, { "epoch": 37.575739287869645, "grad_norm": 9.199991226196289, "learning_rate": 1.9999250927990486e-05, "loss": 0.1308, "step": 31150 }, { "epoch": 37.58780929390465, "grad_norm": 8.205242156982422, "learning_rate": 1.9999250686735692e-05, "loss": 0.1269, "step": 31160 }, { "epoch": 37.59987929993965, "grad_norm": 9.088810920715332, "learning_rate": 1.99992504454809e-05, "loss": 0.1328, "step": 31170 }, { "epoch": 37.611949305974655, "grad_norm": 9.481836318969727, "learning_rate": 1.9999250204226105e-05, "loss": 0.1298, "step": 31180 }, { "epoch": 37.62401931200966, "grad_norm": 8.660112380981445, "learning_rate": 1.999924996297131e-05, "loss": 0.1326, "step": 31190 }, { "epoch": 37.63608931804466, "grad_norm": 10.22089958190918, "learning_rate": 1.9999249721716517e-05, "loss": 0.1303, "step": 31200 }, { "epoch": 37.648159324079664, "grad_norm": 8.571793556213379, "learning_rate": 1.9999249480461723e-05, "loss": 0.135, "step": 31210 }, { "epoch": 37.66022933011467, "grad_norm": 9.574103355407715, "learning_rate": 1.999924923920693e-05, "loss": 0.1365, "step": 31220 }, { "epoch": 37.67229933614967, "grad_norm": 9.116832733154297, "learning_rate": 1.9999248997952136e-05, "loss": 0.1348, "step": 31230 }, { "epoch": 37.684369342184674, "grad_norm": 9.277179718017578, "learning_rate": 1.9999248756697342e-05, "loss": 0.1289, "step": 31240 }, { "epoch": 37.69643934821968, "grad_norm": 9.374176979064941, "learning_rate": 1.9999248515442548e-05, "loss": 0.1322, "step": 31250 }, { "epoch": 37.70850935425468, "grad_norm": 9.232290267944336, "learning_rate": 1.9999248274187754e-05, "loss": 0.1337, "step": 31260 }, { "epoch": 37.72057936028968, "grad_norm": 8.206911087036133, "learning_rate": 1.999924803293296e-05, "loss": 0.132, "step": 
31270 }, { "epoch": 37.73264936632469, "grad_norm": 8.915336608886719, "learning_rate": 1.9999247791678167e-05, "loss": 0.1271, "step": 31280 }, { "epoch": 37.74471937235969, "grad_norm": 8.32841968536377, "learning_rate": 1.9999247550423373e-05, "loss": 0.1364, "step": 31290 }, { "epoch": 37.756789378394686, "grad_norm": 9.948968887329102, "learning_rate": 1.999924730916858e-05, "loss": 0.1321, "step": 31300 }, { "epoch": 37.76885938442969, "grad_norm": 9.052291870117188, "learning_rate": 1.9999247067913786e-05, "loss": 0.1321, "step": 31310 }, { "epoch": 37.78092939046469, "grad_norm": 8.913715362548828, "learning_rate": 1.9999246826658992e-05, "loss": 0.1329, "step": 31320 }, { "epoch": 37.792999396499695, "grad_norm": 9.484868049621582, "learning_rate": 1.9999246585404198e-05, "loss": 0.133, "step": 31330 }, { "epoch": 37.8050694025347, "grad_norm": 9.23119831085205, "learning_rate": 1.9999246344149404e-05, "loss": 0.1373, "step": 31340 }, { "epoch": 37.8171394085697, "grad_norm": 9.093085289001465, "learning_rate": 1.999924610289461e-05, "loss": 0.1356, "step": 31350 }, { "epoch": 37.829209414604705, "grad_norm": 10.377218246459961, "learning_rate": 1.9999245861639817e-05, "loss": 0.1374, "step": 31360 }, { "epoch": 37.84127942063971, "grad_norm": 8.808316230773926, "learning_rate": 1.9999245620385023e-05, "loss": 0.1361, "step": 31370 }, { "epoch": 37.85334942667471, "grad_norm": 9.155333518981934, "learning_rate": 1.999924537913023e-05, "loss": 0.1337, "step": 31380 }, { "epoch": 37.865419432709714, "grad_norm": 9.626872062683105, "learning_rate": 1.9999245137875435e-05, "loss": 0.1368, "step": 31390 }, { "epoch": 37.87748943874472, "grad_norm": 8.670689582824707, "learning_rate": 1.999924489662064e-05, "loss": 0.1338, "step": 31400 }, { "epoch": 37.88955944477972, "grad_norm": 9.888708114624023, "learning_rate": 1.9999244655365844e-05, "loss": 0.1359, "step": 31410 }, { "epoch": 37.901629450814724, "grad_norm": 9.479559898376465, "learning_rate": 
1.999924441411105e-05, "loss": 0.1372, "step": 31420 }, { "epoch": 37.91369945684973, "grad_norm": 10.67675495147705, "learning_rate": 1.9999244172856257e-05, "loss": 0.1387, "step": 31430 }, { "epoch": 37.92576946288473, "grad_norm": 9.151412963867188, "learning_rate": 1.9999243931601463e-05, "loss": 0.1398, "step": 31440 }, { "epoch": 37.937839468919734, "grad_norm": 9.099031448364258, "learning_rate": 1.999924369034667e-05, "loss": 0.1411, "step": 31450 }, { "epoch": 37.94990947495474, "grad_norm": 9.38343620300293, "learning_rate": 1.9999243449091875e-05, "loss": 0.1406, "step": 31460 }, { "epoch": 37.96197948098974, "grad_norm": 9.818683624267578, "learning_rate": 1.999924320783708e-05, "loss": 0.1404, "step": 31470 }, { "epoch": 37.97404948702474, "grad_norm": 10.315268516540527, "learning_rate": 1.9999242966582288e-05, "loss": 0.1388, "step": 31480 }, { "epoch": 37.986119493059746, "grad_norm": 9.573949813842773, "learning_rate": 1.9999242725327494e-05, "loss": 0.1395, "step": 31490 }, { "epoch": 37.99818949909475, "grad_norm": 9.0252103805542, "learning_rate": 1.99992424840727e-05, "loss": 0.1396, "step": 31500 }, { "epoch": 37.99818949909475, "eval_loss": 11.460521697998047, "eval_runtime": 8.1344, "eval_samples_per_second": 85.686, "eval_steps_per_second": 10.818, "step": 31500 }, { "epoch": 38.009656004828, "grad_norm": 7.730249881744385, "learning_rate": 1.9999242242817906e-05, "loss": 0.1041, "step": 31510 }, { "epoch": 38.021726010863006, "grad_norm": 7.954694747924805, "learning_rate": 1.9999242001563113e-05, "loss": 0.0983, "step": 31520 }, { "epoch": 38.03379601689801, "grad_norm": 8.150473594665527, "learning_rate": 1.999924176030832e-05, "loss": 0.1013, "step": 31530 }, { "epoch": 38.04586602293301, "grad_norm": 8.502238273620605, "learning_rate": 1.9999241519053525e-05, "loss": 0.1029, "step": 31540 }, { "epoch": 38.057936028968015, "grad_norm": 7.934248924255371, "learning_rate": 1.999924127779873e-05, "loss": 0.1037, "step": 31550 }, { 
"epoch": 38.07000603500302, "grad_norm": 8.616371154785156, "learning_rate": 1.9999241036543938e-05, "loss": 0.106, "step": 31560 }, { "epoch": 38.08207604103802, "grad_norm": 8.200566291809082, "learning_rate": 1.9999240795289144e-05, "loss": 0.1088, "step": 31570 }, { "epoch": 38.094146047073025, "grad_norm": 8.829747200012207, "learning_rate": 1.999924055403435e-05, "loss": 0.1088, "step": 31580 }, { "epoch": 38.10621605310803, "grad_norm": 8.203644752502441, "learning_rate": 1.9999240312779556e-05, "loss": 0.1088, "step": 31590 }, { "epoch": 38.11828605914303, "grad_norm": 7.89533805847168, "learning_rate": 1.9999240071524762e-05, "loss": 0.1109, "step": 31600 }, { "epoch": 38.130356065178034, "grad_norm": 9.06258487701416, "learning_rate": 1.999923983026997e-05, "loss": 0.111, "step": 31610 }, { "epoch": 38.14242607121304, "grad_norm": 8.102995872497559, "learning_rate": 1.9999239589015175e-05, "loss": 0.1116, "step": 31620 }, { "epoch": 38.15449607724804, "grad_norm": 7.809831619262695, "learning_rate": 1.999923934776038e-05, "loss": 0.1109, "step": 31630 }, { "epoch": 38.166566083283044, "grad_norm": 9.157510757446289, "learning_rate": 1.9999239106505587e-05, "loss": 0.1101, "step": 31640 }, { "epoch": 38.17863608931805, "grad_norm": 8.88426399230957, "learning_rate": 1.9999238865250793e-05, "loss": 0.1134, "step": 31650 }, { "epoch": 38.19070609535305, "grad_norm": 9.0433349609375, "learning_rate": 1.9999238623995996e-05, "loss": 0.1207, "step": 31660 }, { "epoch": 38.202776101388054, "grad_norm": 7.476964473724365, "learning_rate": 1.9999238382741203e-05, "loss": 0.113, "step": 31670 }, { "epoch": 38.21484610742306, "grad_norm": 8.253313064575195, "learning_rate": 1.999923814148641e-05, "loss": 0.1131, "step": 31680 }, { "epoch": 38.22691611345806, "grad_norm": 8.436237335205078, "learning_rate": 1.9999237900231615e-05, "loss": 0.1127, "step": 31690 }, { "epoch": 38.23898611949306, "grad_norm": 8.966449737548828, "learning_rate": 1.999923765897682e-05, 
"loss": 0.1162, "step": 31700 }, { "epoch": 38.251056125528066, "grad_norm": 8.780954360961914, "learning_rate": 1.999923741772203e-05, "loss": 0.1146, "step": 31710 }, { "epoch": 38.26312613156306, "grad_norm": 8.626849174499512, "learning_rate": 1.9999237176467237e-05, "loss": 0.1152, "step": 31720 }, { "epoch": 38.275196137598066, "grad_norm": 8.193466186523438, "learning_rate": 1.9999236935212443e-05, "loss": 0.1159, "step": 31730 }, { "epoch": 38.28726614363307, "grad_norm": 8.685432434082031, "learning_rate": 1.999923669395765e-05, "loss": 0.1144, "step": 31740 }, { "epoch": 38.29933614966807, "grad_norm": 9.222712516784668, "learning_rate": 1.9999236452702856e-05, "loss": 0.1178, "step": 31750 }, { "epoch": 38.311406155703075, "grad_norm": 8.521329879760742, "learning_rate": 1.9999236211448062e-05, "loss": 0.1172, "step": 31760 }, { "epoch": 38.32347616173808, "grad_norm": 8.584490776062012, "learning_rate": 1.9999235970193268e-05, "loss": 0.1177, "step": 31770 }, { "epoch": 38.33554616777308, "grad_norm": 9.169955253601074, "learning_rate": 1.9999235728938474e-05, "loss": 0.1183, "step": 31780 }, { "epoch": 38.347616173808085, "grad_norm": 8.688033103942871, "learning_rate": 1.999923548768368e-05, "loss": 0.1187, "step": 31790 }, { "epoch": 38.35968617984309, "grad_norm": 8.027390480041504, "learning_rate": 1.9999235246428887e-05, "loss": 0.1194, "step": 31800 }, { "epoch": 38.37175618587809, "grad_norm": 8.484380722045898, "learning_rate": 1.9999235005174093e-05, "loss": 0.1217, "step": 31810 }, { "epoch": 38.383826191913094, "grad_norm": 9.186264038085938, "learning_rate": 1.9999234763919296e-05, "loss": 0.1218, "step": 31820 }, { "epoch": 38.3958961979481, "grad_norm": 8.571508407592773, "learning_rate": 1.9999234522664502e-05, "loss": 0.118, "step": 31830 }, { "epoch": 38.4079662039831, "grad_norm": 8.793351173400879, "learning_rate": 1.9999234281409708e-05, "loss": 0.1192, "step": 31840 }, { "epoch": 38.420036210018104, "grad_norm": 9.697976112365723, 
"learning_rate": 1.9999234040154914e-05, "loss": 0.1184, "step": 31850 }, { "epoch": 38.43210621605311, "grad_norm": 8.6011962890625, "learning_rate": 1.999923379890012e-05, "loss": 0.1251, "step": 31860 }, { "epoch": 38.44417622208811, "grad_norm": 9.281229019165039, "learning_rate": 1.9999233557645327e-05, "loss": 0.124, "step": 31870 }, { "epoch": 38.45624622812311, "grad_norm": 8.831761360168457, "learning_rate": 1.9999233316390533e-05, "loss": 0.1251, "step": 31880 }, { "epoch": 38.46831623415812, "grad_norm": 8.926401138305664, "learning_rate": 1.999923307513574e-05, "loss": 0.1181, "step": 31890 }, { "epoch": 38.48038624019312, "grad_norm": 8.237926483154297, "learning_rate": 1.9999232833880945e-05, "loss": 0.1194, "step": 31900 }, { "epoch": 38.49245624622812, "grad_norm": 8.024523735046387, "learning_rate": 1.999923259262615e-05, "loss": 0.1236, "step": 31910 }, { "epoch": 38.504526252263126, "grad_norm": 8.544008255004883, "learning_rate": 1.9999232351371358e-05, "loss": 0.1235, "step": 31920 }, { "epoch": 38.51659625829813, "grad_norm": 8.845832824707031, "learning_rate": 1.9999232110116564e-05, "loss": 0.123, "step": 31930 }, { "epoch": 38.52866626433313, "grad_norm": 8.897856712341309, "learning_rate": 1.999923186886177e-05, "loss": 0.1225, "step": 31940 }, { "epoch": 38.540736270368136, "grad_norm": 9.286515235900879, "learning_rate": 1.9999231627606977e-05, "loss": 0.1297, "step": 31950 }, { "epoch": 38.55280627640314, "grad_norm": 8.796181678771973, "learning_rate": 1.9999231386352183e-05, "loss": 0.1252, "step": 31960 }, { "epoch": 38.56487628243814, "grad_norm": 8.918425559997559, "learning_rate": 1.999923114509739e-05, "loss": 0.1247, "step": 31970 }, { "epoch": 38.576946288473145, "grad_norm": 8.473806381225586, "learning_rate": 1.9999230903842595e-05, "loss": 0.1261, "step": 31980 }, { "epoch": 38.58901629450815, "grad_norm": 8.150360107421875, "learning_rate": 1.99992306625878e-05, "loss": 0.1256, "step": 31990 }, { "epoch": 38.60108630054315, 
"grad_norm": 8.848437309265137, "learning_rate": 1.9999230421333008e-05, "loss": 0.1254, "step": 32000 }, { "epoch": 38.60108630054315, "eval_loss": 11.475966453552246, "eval_runtime": 8.1174, "eval_samples_per_second": 85.865, "eval_steps_per_second": 10.841, "step": 32000 }, { "epoch": 38.613156306578155, "grad_norm": 9.642966270446777, "learning_rate": 1.9999230180078214e-05, "loss": 0.1276, "step": 32010 }, { "epoch": 38.62522631261316, "grad_norm": 8.846292495727539, "learning_rate": 1.999922993882342e-05, "loss": 0.1264, "step": 32020 }, { "epoch": 38.63729631864816, "grad_norm": 9.24441146850586, "learning_rate": 1.9999229697568626e-05, "loss": 0.1243, "step": 32030 }, { "epoch": 38.649366324683164, "grad_norm": 10.056927680969238, "learning_rate": 1.9999229456313832e-05, "loss": 0.1297, "step": 32040 }, { "epoch": 38.66143633071817, "grad_norm": 9.14974308013916, "learning_rate": 1.999922921505904e-05, "loss": 0.1292, "step": 32050 }, { "epoch": 38.67350633675317, "grad_norm": 8.761993408203125, "learning_rate": 1.9999228973804245e-05, "loss": 0.1278, "step": 32060 }, { "epoch": 38.685576342788174, "grad_norm": 8.399710655212402, "learning_rate": 1.9999228732549448e-05, "loss": 0.127, "step": 32070 }, { "epoch": 38.69764634882318, "grad_norm": 8.65570068359375, "learning_rate": 1.9999228491294654e-05, "loss": 0.1249, "step": 32080 }, { "epoch": 38.70971635485818, "grad_norm": 8.710443496704102, "learning_rate": 1.999922825003986e-05, "loss": 0.1309, "step": 32090 }, { "epoch": 38.721786360893184, "grad_norm": 10.10100269317627, "learning_rate": 1.9999228008785066e-05, "loss": 0.1313, "step": 32100 }, { "epoch": 38.73385636692819, "grad_norm": 9.846818923950195, "learning_rate": 1.9999227767530273e-05, "loss": 0.1308, "step": 32110 }, { "epoch": 38.74592637296319, "grad_norm": 9.119641304016113, "learning_rate": 1.999922752627548e-05, "loss": 0.1282, "step": 32120 }, { "epoch": 38.757996378998186, "grad_norm": 9.914813995361328, "learning_rate": 
1.9999227285020685e-05, "loss": 0.1325, "step": 32130 }, { "epoch": 38.77006638503319, "grad_norm": 8.880081176757812, "learning_rate": 1.999922704376589e-05, "loss": 0.1298, "step": 32140 }, { "epoch": 38.78213639106819, "grad_norm": 8.870059967041016, "learning_rate": 1.9999226802511097e-05, "loss": 0.1293, "step": 32150 }, { "epoch": 38.794206397103196, "grad_norm": 9.991875648498535, "learning_rate": 1.9999226561256304e-05, "loss": 0.1323, "step": 32160 }, { "epoch": 38.8062764031382, "grad_norm": 8.860307693481445, "learning_rate": 1.999922632000151e-05, "loss": 0.1306, "step": 32170 }, { "epoch": 38.8183464091732, "grad_norm": 9.44069766998291, "learning_rate": 1.9999226078746716e-05, "loss": 0.1316, "step": 32180 }, { "epoch": 38.830416415208205, "grad_norm": 9.274371147155762, "learning_rate": 1.9999225837491922e-05, "loss": 0.1335, "step": 32190 }, { "epoch": 38.84248642124321, "grad_norm": 9.143811225891113, "learning_rate": 1.999922559623713e-05, "loss": 0.1333, "step": 32200 }, { "epoch": 38.85455642727821, "grad_norm": 9.695402145385742, "learning_rate": 1.9999225354982335e-05, "loss": 0.1351, "step": 32210 }, { "epoch": 38.866626433313215, "grad_norm": 9.209427833557129, "learning_rate": 1.999922511372754e-05, "loss": 0.1339, "step": 32220 }, { "epoch": 38.87869643934822, "grad_norm": 9.226417541503906, "learning_rate": 1.9999224872472747e-05, "loss": 0.1361, "step": 32230 }, { "epoch": 38.89076644538322, "grad_norm": 9.64095687866211, "learning_rate": 1.9999224631217953e-05, "loss": 0.1315, "step": 32240 }, { "epoch": 38.902836451418224, "grad_norm": 9.473557472229004, "learning_rate": 1.999922438996316e-05, "loss": 0.1303, "step": 32250 }, { "epoch": 38.91490645745323, "grad_norm": 9.318462371826172, "learning_rate": 1.9999224148708366e-05, "loss": 0.1337, "step": 32260 }, { "epoch": 38.92697646348823, "grad_norm": 9.643866539001465, "learning_rate": 1.9999223907453572e-05, "loss": 0.131, "step": 32270 }, { "epoch": 38.939046469523234, "grad_norm": 
9.13167667388916, "learning_rate": 1.9999223666198778e-05, "loss": 0.1352, "step": 32280 }, { "epoch": 38.95111647555824, "grad_norm": 8.90024471282959, "learning_rate": 1.9999223424943984e-05, "loss": 0.1374, "step": 32290 }, { "epoch": 38.96318648159324, "grad_norm": 8.244853973388672, "learning_rate": 1.999922318368919e-05, "loss": 0.1358, "step": 32300 }, { "epoch": 38.97525648762824, "grad_norm": 8.85802173614502, "learning_rate": 1.9999222942434397e-05, "loss": 0.1333, "step": 32310 }, { "epoch": 38.98732649366325, "grad_norm": 10.577792167663574, "learning_rate": 1.9999222701179603e-05, "loss": 0.1355, "step": 32320 }, { "epoch": 38.99939649969825, "grad_norm": 9.643588066101074, "learning_rate": 1.999922245992481e-05, "loss": 0.1362, "step": 32330 }, { "epoch": 39.0108630054315, "grad_norm": 7.580427646636963, "learning_rate": 1.9999222218670016e-05, "loss": 0.0968, "step": 32340 }, { "epoch": 39.022933011466506, "grad_norm": 8.030133247375488, "learning_rate": 1.9999221977415222e-05, "loss": 0.0973, "step": 32350 }, { "epoch": 39.03500301750151, "grad_norm": 8.434908866882324, "learning_rate": 1.9999221736160428e-05, "loss": 0.0985, "step": 32360 }, { "epoch": 39.04707302353651, "grad_norm": 7.954544544219971, "learning_rate": 1.9999221494905634e-05, "loss": 0.1061, "step": 32370 }, { "epoch": 39.059143029571516, "grad_norm": 7.40944242477417, "learning_rate": 1.999922125365084e-05, "loss": 0.0987, "step": 32380 }, { "epoch": 39.07121303560652, "grad_norm": 8.375916481018066, "learning_rate": 1.9999221012396047e-05, "loss": 0.1041, "step": 32390 }, { "epoch": 39.08328304164152, "grad_norm": 8.047123908996582, "learning_rate": 1.9999220771141253e-05, "loss": 0.1058, "step": 32400 }, { "epoch": 39.095353047676525, "grad_norm": 7.592769145965576, "learning_rate": 1.999922052988646e-05, "loss": 0.0995, "step": 32410 }, { "epoch": 39.10742305371153, "grad_norm": 9.002301216125488, "learning_rate": 1.9999220288631665e-05, "loss": 0.1025, "step": 32420 }, { 
"epoch": 39.11949305974653, "grad_norm": 9.0217866897583, "learning_rate": 1.999922004737687e-05, "loss": 0.1078, "step": 32430 }, { "epoch": 39.131563065781535, "grad_norm": 8.245833396911621, "learning_rate": 1.9999219806122078e-05, "loss": 0.106, "step": 32440 }, { "epoch": 39.14363307181654, "grad_norm": 8.926276206970215, "learning_rate": 1.9999219564867284e-05, "loss": 0.1065, "step": 32450 }, { "epoch": 39.15570307785154, "grad_norm": 9.269146919250488, "learning_rate": 1.999921932361249e-05, "loss": 0.107, "step": 32460 }, { "epoch": 39.167773083886544, "grad_norm": 8.252391815185547, "learning_rate": 1.9999219082357696e-05, "loss": 0.1104, "step": 32470 }, { "epoch": 39.17984308992155, "grad_norm": 8.179865837097168, "learning_rate": 1.9999218841102903e-05, "loss": 0.1096, "step": 32480 }, { "epoch": 39.19191309595655, "grad_norm": 8.77815055847168, "learning_rate": 1.9999218599848105e-05, "loss": 0.1136, "step": 32490 }, { "epoch": 39.203983101991554, "grad_norm": 8.843620300292969, "learning_rate": 1.999921835859331e-05, "loss": 0.1166, "step": 32500 }, { "epoch": 39.203983101991554, "eval_loss": 11.482420921325684, "eval_runtime": 8.1276, "eval_samples_per_second": 85.757, "eval_steps_per_second": 10.827, "step": 32500 }, { "epoch": 39.21605310802656, "grad_norm": 7.692868709564209, "learning_rate": 1.9999218117338518e-05, "loss": 0.1131, "step": 32510 }, { "epoch": 39.22812311406156, "grad_norm": 9.170184135437012, "learning_rate": 1.9999217876083724e-05, "loss": 0.1159, "step": 32520 }, { "epoch": 39.24019312009656, "grad_norm": 8.401741027832031, "learning_rate": 1.999921763482893e-05, "loss": 0.1108, "step": 32530 }, { "epoch": 39.25226312613156, "grad_norm": 8.295746803283691, "learning_rate": 1.9999217393574136e-05, "loss": 0.1099, "step": 32540 }, { "epoch": 39.26433313216656, "grad_norm": 8.644732475280762, "learning_rate": 1.9999217152319343e-05, "loss": 0.1112, "step": 32550 }, { "epoch": 39.276403138201566, "grad_norm": 8.968096733093262, 
"learning_rate": 1.999921691106455e-05, "loss": 0.1139, "step": 32560 }, { "epoch": 39.28847314423657, "grad_norm": 8.366037368774414, "learning_rate": 1.9999216669809755e-05, "loss": 0.1123, "step": 32570 }, { "epoch": 39.30054315027157, "grad_norm": 7.710782051086426, "learning_rate": 1.999921642855496e-05, "loss": 0.1118, "step": 32580 }, { "epoch": 39.312613156306575, "grad_norm": 8.607976913452148, "learning_rate": 1.9999216187300168e-05, "loss": 0.1122, "step": 32590 }, { "epoch": 39.32468316234158, "grad_norm": 7.998414516448975, "learning_rate": 1.9999215946045374e-05, "loss": 0.117, "step": 32600 }, { "epoch": 39.33675316837658, "grad_norm": 8.455231666564941, "learning_rate": 1.999921570479058e-05, "loss": 0.1139, "step": 32610 }, { "epoch": 39.348823174411585, "grad_norm": 8.2460298538208, "learning_rate": 1.9999215463535786e-05, "loss": 0.115, "step": 32620 }, { "epoch": 39.36089318044659, "grad_norm": 9.19868278503418, "learning_rate": 1.9999215222280992e-05, "loss": 0.1157, "step": 32630 }, { "epoch": 39.37296318648159, "grad_norm": 8.68033504486084, "learning_rate": 1.99992149810262e-05, "loss": 0.1177, "step": 32640 }, { "epoch": 39.385033192516595, "grad_norm": 8.483563423156738, "learning_rate": 1.9999214739771405e-05, "loss": 0.1144, "step": 32650 }, { "epoch": 39.3971031985516, "grad_norm": 8.12133502960205, "learning_rate": 1.999921449851661e-05, "loss": 0.1179, "step": 32660 }, { "epoch": 39.4091732045866, "grad_norm": 8.872349739074707, "learning_rate": 1.9999214257261817e-05, "loss": 0.1186, "step": 32670 }, { "epoch": 39.421243210621604, "grad_norm": 8.624176979064941, "learning_rate": 1.9999214016007024e-05, "loss": 0.1217, "step": 32680 }, { "epoch": 39.43331321665661, "grad_norm": 9.06348991394043, "learning_rate": 1.999921377475223e-05, "loss": 0.1158, "step": 32690 }, { "epoch": 39.44538322269161, "grad_norm": 9.325542449951172, "learning_rate": 1.9999213533497436e-05, "loss": 0.1229, "step": 32700 }, { "epoch": 39.457453228726614, 
"grad_norm": 8.50928020477295, "learning_rate": 1.9999213292242642e-05, "loss": 0.1218, "step": 32710 }, { "epoch": 39.46952323476162, "grad_norm": 8.45387077331543, "learning_rate": 1.999921305098785e-05, "loss": 0.1208, "step": 32720 }, { "epoch": 39.48159324079662, "grad_norm": 8.87298583984375, "learning_rate": 1.9999212809733055e-05, "loss": 0.1169, "step": 32730 }, { "epoch": 39.49366324683162, "grad_norm": 8.95173454284668, "learning_rate": 1.9999212568478257e-05, "loss": 0.1179, "step": 32740 }, { "epoch": 39.50573325286663, "grad_norm": 8.401531219482422, "learning_rate": 1.9999212327223464e-05, "loss": 0.1185, "step": 32750 }, { "epoch": 39.51780325890163, "grad_norm": 8.840765953063965, "learning_rate": 1.999921208596867e-05, "loss": 0.119, "step": 32760 }, { "epoch": 39.52987326493663, "grad_norm": 8.790966987609863, "learning_rate": 1.9999211844713876e-05, "loss": 0.1205, "step": 32770 }, { "epoch": 39.541943270971636, "grad_norm": 8.642646789550781, "learning_rate": 1.9999211603459082e-05, "loss": 0.1242, "step": 32780 }, { "epoch": 39.55401327700664, "grad_norm": 9.015168190002441, "learning_rate": 1.9999211362204292e-05, "loss": 0.1228, "step": 32790 }, { "epoch": 39.56608328304164, "grad_norm": 9.12108325958252, "learning_rate": 1.9999211120949498e-05, "loss": 0.1189, "step": 32800 }, { "epoch": 39.578153289076646, "grad_norm": 8.527749061584473, "learning_rate": 1.9999210879694704e-05, "loss": 0.1211, "step": 32810 }, { "epoch": 39.59022329511165, "grad_norm": 8.344348907470703, "learning_rate": 1.999921063843991e-05, "loss": 0.1219, "step": 32820 }, { "epoch": 39.60229330114665, "grad_norm": 8.377803802490234, "learning_rate": 1.9999210397185117e-05, "loss": 0.1246, "step": 32830 }, { "epoch": 39.614363307181655, "grad_norm": 9.262369155883789, "learning_rate": 1.9999210155930323e-05, "loss": 0.1211, "step": 32840 }, { "epoch": 39.62643331321666, "grad_norm": 8.9697904586792, "learning_rate": 1.999920991467553e-05, "loss": 0.122, "step": 32850 }, 
{ "epoch": 39.63850331925166, "grad_norm": 8.886343955993652, "learning_rate": 1.9999209673420735e-05, "loss": 0.1227, "step": 32860 }, { "epoch": 39.650573325286665, "grad_norm": 9.274252891540527, "learning_rate": 1.999920943216594e-05, "loss": 0.1204, "step": 32870 }, { "epoch": 39.66264333132167, "grad_norm": 8.977320671081543, "learning_rate": 1.9999209190911148e-05, "loss": 0.1216, "step": 32880 }, { "epoch": 39.67471333735667, "grad_norm": 8.12016773223877, "learning_rate": 1.9999208949656354e-05, "loss": 0.1235, "step": 32890 }, { "epoch": 39.686783343391674, "grad_norm": 8.268648147583008, "learning_rate": 1.9999208708401557e-05, "loss": 0.1221, "step": 32900 }, { "epoch": 39.69885334942668, "grad_norm": 9.194595336914062, "learning_rate": 1.9999208467146763e-05, "loss": 0.123, "step": 32910 }, { "epoch": 39.71092335546168, "grad_norm": 9.259039878845215, "learning_rate": 1.999920822589197e-05, "loss": 0.1232, "step": 32920 }, { "epoch": 39.722993361496684, "grad_norm": 9.985997200012207, "learning_rate": 1.9999207984637175e-05, "loss": 0.1247, "step": 32930 }, { "epoch": 39.73506336753169, "grad_norm": 9.533256530761719, "learning_rate": 1.9999207743382382e-05, "loss": 0.1237, "step": 32940 }, { "epoch": 39.74713337356668, "grad_norm": 9.564168930053711, "learning_rate": 1.9999207502127588e-05, "loss": 0.1275, "step": 32950 }, { "epoch": 39.759203379601686, "grad_norm": 9.555896759033203, "learning_rate": 1.9999207260872794e-05, "loss": 0.1304, "step": 32960 }, { "epoch": 39.77127338563669, "grad_norm": 8.494302749633789, "learning_rate": 1.9999207019618e-05, "loss": 0.1279, "step": 32970 }, { "epoch": 39.78334339167169, "grad_norm": 9.01416301727295, "learning_rate": 1.9999206778363207e-05, "loss": 0.1253, "step": 32980 }, { "epoch": 39.795413397706696, "grad_norm": 9.440053939819336, "learning_rate": 1.9999206537108413e-05, "loss": 0.1217, "step": 32990 }, { "epoch": 39.8074834037417, "grad_norm": 9.067023277282715, "learning_rate": 
1.999920629585362e-05, "loss": 0.1269, "step": 33000 }, { "epoch": 39.8074834037417, "eval_loss": 11.550965309143066, "eval_runtime": 8.1279, "eval_samples_per_second": 85.754, "eval_steps_per_second": 10.827, "step": 33000 }, { "epoch": 39.8195534097767, "grad_norm": 9.32444953918457, "learning_rate": 1.9999206054598825e-05, "loss": 0.1283, "step": 33010 }, { "epoch": 39.831623415811706, "grad_norm": 9.049776077270508, "learning_rate": 1.999920581334403e-05, "loss": 0.1264, "step": 33020 }, { "epoch": 39.84369342184671, "grad_norm": 8.507801055908203, "learning_rate": 1.9999205572089238e-05, "loss": 0.1301, "step": 33030 }, { "epoch": 39.85576342788171, "grad_norm": 9.716781616210938, "learning_rate": 1.9999205330834444e-05, "loss": 0.1244, "step": 33040 }, { "epoch": 39.867833433916715, "grad_norm": 9.518760681152344, "learning_rate": 1.999920508957965e-05, "loss": 0.1293, "step": 33050 }, { "epoch": 39.87990343995172, "grad_norm": 8.689533233642578, "learning_rate": 1.9999204848324856e-05, "loss": 0.1276, "step": 33060 }, { "epoch": 39.89197344598672, "grad_norm": 9.061039924621582, "learning_rate": 1.9999204607070063e-05, "loss": 0.1276, "step": 33070 }, { "epoch": 39.904043452021725, "grad_norm": 10.077043533325195, "learning_rate": 1.999920436581527e-05, "loss": 0.1311, "step": 33080 }, { "epoch": 39.91611345805673, "grad_norm": 9.031171798706055, "learning_rate": 1.9999204124560475e-05, "loss": 0.1261, "step": 33090 }, { "epoch": 39.92818346409173, "grad_norm": 10.022102355957031, "learning_rate": 1.999920388330568e-05, "loss": 0.1329, "step": 33100 }, { "epoch": 39.940253470126734, "grad_norm": 9.200531005859375, "learning_rate": 1.9999203642050887e-05, "loss": 0.1291, "step": 33110 }, { "epoch": 39.95232347616174, "grad_norm": 8.708026885986328, "learning_rate": 1.9999203400796094e-05, "loss": 0.1317, "step": 33120 }, { "epoch": 39.96439348219674, "grad_norm": 8.396007537841797, "learning_rate": 1.99992031595413e-05, "loss": 0.1309, "step": 33130 }, { 
"epoch": 39.976463488231744, "grad_norm": 9.520848274230957, "learning_rate": 1.9999202918286506e-05, "loss": 0.1323, "step": 33140 }, { "epoch": 39.98853349426675, "grad_norm": 9.752159118652344, "learning_rate": 1.999920267703171e-05, "loss": 0.1353, "step": 33150 }, { "epoch": 40.0, "grad_norm": Infinity, "learning_rate": 1.9999202435776915e-05, "loss": 0.1338, "step": 33160 }, { "epoch": 40.012070006035, "grad_norm": 7.295010089874268, "learning_rate": 1.999920219452212e-05, "loss": 0.0893, "step": 33170 }, { "epoch": 40.024140012070006, "grad_norm": 6.916862487792969, "learning_rate": 1.9999201953267327e-05, "loss": 0.0902, "step": 33180 }, { "epoch": 40.03621001810501, "grad_norm": 8.357993125915527, "learning_rate": 1.9999201712012534e-05, "loss": 0.0936, "step": 33190 }, { "epoch": 40.04828002414001, "grad_norm": 7.521778583526611, "learning_rate": 1.999920147075774e-05, "loss": 0.0972, "step": 33200 }, { "epoch": 40.060350030175016, "grad_norm": 7.280166149139404, "learning_rate": 1.9999201229502946e-05, "loss": 0.0997, "step": 33210 }, { "epoch": 40.07242003621002, "grad_norm": 7.309049606323242, "learning_rate": 1.9999200988248152e-05, "loss": 0.101, "step": 33220 }, { "epoch": 40.08449004224502, "grad_norm": 7.3118157386779785, "learning_rate": 1.999920074699336e-05, "loss": 0.1017, "step": 33230 }, { "epoch": 40.096560048280026, "grad_norm": 7.400045871734619, "learning_rate": 1.9999200505738565e-05, "loss": 0.1002, "step": 33240 }, { "epoch": 40.10863005431503, "grad_norm": 7.934249401092529, "learning_rate": 1.999920026448377e-05, "loss": 0.0995, "step": 33250 }, { "epoch": 40.12070006035003, "grad_norm": 7.994880676269531, "learning_rate": 1.9999200023228977e-05, "loss": 0.1074, "step": 33260 }, { "epoch": 40.132770066385035, "grad_norm": 7.975707530975342, "learning_rate": 1.9999199781974183e-05, "loss": 0.1059, "step": 33270 }, { "epoch": 40.14484007242004, "grad_norm": 7.833276748657227, "learning_rate": 1.999919954071939e-05, "loss": 0.103, 
"step": 33280 }, { "epoch": 40.15691007845504, "grad_norm": 8.297359466552734, "learning_rate": 1.9999199299464596e-05, "loss": 0.1045, "step": 33290 }, { "epoch": 40.168980084490045, "grad_norm": 8.733682632446289, "learning_rate": 1.9999199058209802e-05, "loss": 0.1064, "step": 33300 }, { "epoch": 40.18105009052505, "grad_norm": 8.40625286102295, "learning_rate": 1.9999198816955008e-05, "loss": 0.1066, "step": 33310 }, { "epoch": 40.19312009656005, "grad_norm": 7.836758136749268, "learning_rate": 1.9999198575700215e-05, "loss": 0.109, "step": 33320 }, { "epoch": 40.205190102595054, "grad_norm": 8.915181159973145, "learning_rate": 1.999919833444542e-05, "loss": 0.1096, "step": 33330 }, { "epoch": 40.21726010863006, "grad_norm": 7.989918231964111, "learning_rate": 1.9999198093190627e-05, "loss": 0.1082, "step": 33340 }, { "epoch": 40.22933011466506, "grad_norm": 8.785135269165039, "learning_rate": 1.9999197851935833e-05, "loss": 0.1096, "step": 33350 }, { "epoch": 40.241400120700064, "grad_norm": 8.081915855407715, "learning_rate": 1.999919761068104e-05, "loss": 0.106, "step": 33360 }, { "epoch": 40.25347012673506, "grad_norm": 7.627058506011963, "learning_rate": 1.9999197369426246e-05, "loss": 0.109, "step": 33370 }, { "epoch": 40.26554013277006, "grad_norm": 8.223875045776367, "learning_rate": 1.9999197128171452e-05, "loss": 0.105, "step": 33380 }, { "epoch": 40.277610138805066, "grad_norm": 8.349647521972656, "learning_rate": 1.9999196886916658e-05, "loss": 0.1126, "step": 33390 }, { "epoch": 40.28968014484007, "grad_norm": 7.5857462882995605, "learning_rate": 1.9999196645661864e-05, "loss": 0.1059, "step": 33400 }, { "epoch": 40.30175015087507, "grad_norm": 8.758030891418457, "learning_rate": 1.999919640440707e-05, "loss": 0.1078, "step": 33410 }, { "epoch": 40.313820156910076, "grad_norm": 8.674176216125488, "learning_rate": 1.9999196163152277e-05, "loss": 0.1106, "step": 33420 }, { "epoch": 40.32589016294508, "grad_norm": 8.165825843811035, "learning_rate": 
1.9999195921897483e-05, "loss": 0.114, "step": 33430 }, { "epoch": 40.33796016898008, "grad_norm": 8.477066993713379, "learning_rate": 1.999919568064269e-05, "loss": 0.1129, "step": 33440 }, { "epoch": 40.350030175015085, "grad_norm": 8.481464385986328, "learning_rate": 1.9999195439387895e-05, "loss": 0.1089, "step": 33450 }, { "epoch": 40.36210018105009, "grad_norm": 8.033802032470703, "learning_rate": 1.99991951981331e-05, "loss": 0.113, "step": 33460 }, { "epoch": 40.37417018708509, "grad_norm": 8.2025146484375, "learning_rate": 1.9999194956878308e-05, "loss": 0.1115, "step": 33470 }, { "epoch": 40.386240193120095, "grad_norm": 8.553913116455078, "learning_rate": 1.9999194715623514e-05, "loss": 0.1112, "step": 33480 }, { "epoch": 40.3983101991551, "grad_norm": 8.0443696975708, "learning_rate": 1.999919447436872e-05, "loss": 0.1149, "step": 33490 }, { "epoch": 40.4103802051901, "grad_norm": 8.784496307373047, "learning_rate": 1.9999194233113926e-05, "loss": 0.1168, "step": 33500 }, { "epoch": 40.4103802051901, "eval_loss": 11.541379928588867, "eval_runtime": 8.1396, "eval_samples_per_second": 85.631, "eval_steps_per_second": 10.811, "step": 33500 }, { "epoch": 40.422450211225105, "grad_norm": 8.423803329467773, "learning_rate": 1.9999193991859133e-05, "loss": 0.1158, "step": 33510 }, { "epoch": 40.43452021726011, "grad_norm": 8.986648559570312, "learning_rate": 1.999919375060434e-05, "loss": 0.1155, "step": 33520 }, { "epoch": 40.44659022329511, "grad_norm": 8.166865348815918, "learning_rate": 1.9999193509349545e-05, "loss": 0.115, "step": 33530 }, { "epoch": 40.458660229330114, "grad_norm": 8.619139671325684, "learning_rate": 1.999919326809475e-05, "loss": 0.1151, "step": 33540 }, { "epoch": 40.47073023536512, "grad_norm": 8.444806098937988, "learning_rate": 1.9999193026839957e-05, "loss": 0.1124, "step": 33550 }, { "epoch": 40.48280024140012, "grad_norm": 8.557428359985352, "learning_rate": 1.999919278558516e-05, "loss": 0.1136, "step": 33560 }, { "epoch": 
40.494870247435124, "grad_norm": 8.363120079040527, "learning_rate": 1.9999192544330367e-05, "loss": 0.1129, "step": 33570 }, { "epoch": 40.50694025347013, "grad_norm": 8.927071571350098, "learning_rate": 1.9999192303075573e-05, "loss": 0.1197, "step": 33580 }, { "epoch": 40.51901025950513, "grad_norm": 8.820205688476562, "learning_rate": 1.999919206182078e-05, "loss": 0.1167, "step": 33590 }, { "epoch": 40.53108026554013, "grad_norm": 7.868465900421143, "learning_rate": 1.9999191820565985e-05, "loss": 0.1179, "step": 33600 }, { "epoch": 40.543150271575136, "grad_norm": 9.134904861450195, "learning_rate": 1.999919157931119e-05, "loss": 0.116, "step": 33610 }, { "epoch": 40.55522027761014, "grad_norm": 9.075288772583008, "learning_rate": 1.9999191338056398e-05, "loss": 0.114, "step": 33620 }, { "epoch": 40.56729028364514, "grad_norm": 8.349418640136719, "learning_rate": 1.9999191096801604e-05, "loss": 0.1166, "step": 33630 }, { "epoch": 40.579360289680146, "grad_norm": 8.09981918334961, "learning_rate": 1.999919085554681e-05, "loss": 0.116, "step": 33640 }, { "epoch": 40.59143029571515, "grad_norm": 8.370985984802246, "learning_rate": 1.9999190614292016e-05, "loss": 0.1195, "step": 33650 }, { "epoch": 40.60350030175015, "grad_norm": 7.989498138427734, "learning_rate": 1.9999190373037222e-05, "loss": 0.1154, "step": 33660 }, { "epoch": 40.615570307785156, "grad_norm": 9.373289108276367, "learning_rate": 1.999919013178243e-05, "loss": 0.1192, "step": 33670 }, { "epoch": 40.62764031382016, "grad_norm": 8.955394744873047, "learning_rate": 1.9999189890527635e-05, "loss": 0.1233, "step": 33680 }, { "epoch": 40.63971031985516, "grad_norm": 8.852289199829102, "learning_rate": 1.999918964927284e-05, "loss": 0.1186, "step": 33690 }, { "epoch": 40.651780325890165, "grad_norm": 8.407901763916016, "learning_rate": 1.9999189408018047e-05, "loss": 0.1234, "step": 33700 }, { "epoch": 40.66385033192517, "grad_norm": 9.853032112121582, "learning_rate": 1.9999189166763254e-05, "loss": 
0.1209, "step": 33710 }, { "epoch": 40.67592033796017, "grad_norm": 9.152206420898438, "learning_rate": 1.999918892550846e-05, "loss": 0.1223, "step": 33720 }, { "epoch": 40.687990343995175, "grad_norm": 8.977150917053223, "learning_rate": 1.9999188684253666e-05, "loss": 0.1226, "step": 33730 }, { "epoch": 40.70006035003018, "grad_norm": 8.706067085266113, "learning_rate": 1.9999188442998872e-05, "loss": 0.1236, "step": 33740 }, { "epoch": 40.71213035606518, "grad_norm": 8.968780517578125, "learning_rate": 1.999918820174408e-05, "loss": 0.1226, "step": 33750 }, { "epoch": 40.724200362100184, "grad_norm": 8.319668769836426, "learning_rate": 1.9999187960489285e-05, "loss": 0.119, "step": 33760 }, { "epoch": 40.73627036813519, "grad_norm": 9.021988868713379, "learning_rate": 1.999918771923449e-05, "loss": 0.1225, "step": 33770 }, { "epoch": 40.74834037417018, "grad_norm": 8.46576976776123, "learning_rate": 1.9999187477979697e-05, "loss": 0.1251, "step": 33780 }, { "epoch": 40.76041038020519, "grad_norm": 8.991959571838379, "learning_rate": 1.9999187236724903e-05, "loss": 0.1205, "step": 33790 }, { "epoch": 40.77248038624019, "grad_norm": 8.769937515258789, "learning_rate": 1.999918699547011e-05, "loss": 0.1279, "step": 33800 }, { "epoch": 40.78455039227519, "grad_norm": 9.664041519165039, "learning_rate": 1.9999186754215312e-05, "loss": 0.1235, "step": 33810 }, { "epoch": 40.796620398310196, "grad_norm": 8.949159622192383, "learning_rate": 1.999918651296052e-05, "loss": 0.1233, "step": 33820 }, { "epoch": 40.8086904043452, "grad_norm": 9.918442726135254, "learning_rate": 1.9999186271705725e-05, "loss": 0.1268, "step": 33830 }, { "epoch": 40.8207604103802, "grad_norm": 9.470316886901855, "learning_rate": 1.999918603045093e-05, "loss": 0.1291, "step": 33840 }, { "epoch": 40.832830416415206, "grad_norm": 8.838103294372559, "learning_rate": 1.9999185789196137e-05, "loss": 0.1208, "step": 33850 }, { "epoch": 40.84490042245021, "grad_norm": 9.075504302978516, 
"learning_rate": 1.9999185547941343e-05, "loss": 0.1252, "step": 33860 }, { "epoch": 40.85697042848521, "grad_norm": 8.58432388305664, "learning_rate": 1.9999185306686553e-05, "loss": 0.1243, "step": 33870 }, { "epoch": 40.869040434520215, "grad_norm": 8.772253036499023, "learning_rate": 1.999918506543176e-05, "loss": 0.1238, "step": 33880 }, { "epoch": 40.88111044055522, "grad_norm": 9.024093627929688, "learning_rate": 1.9999184824176965e-05, "loss": 0.127, "step": 33890 }, { "epoch": 40.89318044659022, "grad_norm": 9.275625228881836, "learning_rate": 1.999918458292217e-05, "loss": 0.1225, "step": 33900 }, { "epoch": 40.905250452625225, "grad_norm": 9.348467826843262, "learning_rate": 1.9999184341667378e-05, "loss": 0.1286, "step": 33910 }, { "epoch": 40.91732045866023, "grad_norm": 8.99459171295166, "learning_rate": 1.9999184100412584e-05, "loss": 0.124, "step": 33920 }, { "epoch": 40.92939046469523, "grad_norm": 9.79403018951416, "learning_rate": 1.999918385915779e-05, "loss": 0.1256, "step": 33930 }, { "epoch": 40.941460470730235, "grad_norm": 9.399724006652832, "learning_rate": 1.9999183617902996e-05, "loss": 0.1252, "step": 33940 }, { "epoch": 40.95353047676524, "grad_norm": 8.953380584716797, "learning_rate": 1.9999183376648203e-05, "loss": 0.1289, "step": 33950 }, { "epoch": 40.96560048280024, "grad_norm": 9.292410850524902, "learning_rate": 1.999918313539341e-05, "loss": 0.1287, "step": 33960 }, { "epoch": 40.977670488835244, "grad_norm": 9.286895751953125, "learning_rate": 1.9999182894138615e-05, "loss": 0.123, "step": 33970 }, { "epoch": 40.98974049487025, "grad_norm": 8.839954376220703, "learning_rate": 1.9999182652883818e-05, "loss": 0.1268, "step": 33980 }, { "epoch": 41.0012070006035, "grad_norm": 8.248231887817383, "learning_rate": 1.9999182411629024e-05, "loss": 0.1209, "step": 33990 }, { "epoch": 41.0132770066385, "grad_norm": 8.091815948486328, "learning_rate": 1.999918217037423e-05, "loss": 0.0917, "step": 34000 }, { "epoch": 41.0132770066385, 
"eval_loss": 11.56505298614502, "eval_runtime": 8.1349, "eval_samples_per_second": 85.68, "eval_steps_per_second": 10.818, "step": 34000 }, { "epoch": 41.02534701267351, "grad_norm": 6.932821273803711, "learning_rate": 1.9999181929119437e-05, "loss": 0.0911, "step": 34010 }, { "epoch": 41.03741701870851, "grad_norm": 7.704680442810059, "learning_rate": 1.9999181687864643e-05, "loss": 0.0937, "step": 34020 }, { "epoch": 41.04948702474351, "grad_norm": 7.562203884124756, "learning_rate": 1.999918144660985e-05, "loss": 0.096, "step": 34030 }, { "epoch": 41.061557030778516, "grad_norm": 7.8626532554626465, "learning_rate": 1.9999181205355055e-05, "loss": 0.0933, "step": 34040 }, { "epoch": 41.07362703681352, "grad_norm": 7.859243392944336, "learning_rate": 1.999918096410026e-05, "loss": 0.0963, "step": 34050 }, { "epoch": 41.08569704284852, "grad_norm": 7.7171783447265625, "learning_rate": 1.9999180722845468e-05, "loss": 0.0972, "step": 34060 }, { "epoch": 41.097767048883526, "grad_norm": 7.1501312255859375, "learning_rate": 1.9999180481590674e-05, "loss": 0.0982, "step": 34070 }, { "epoch": 41.10983705491853, "grad_norm": 7.344725131988525, "learning_rate": 1.999918024033588e-05, "loss": 0.0972, "step": 34080 }, { "epoch": 41.12190706095353, "grad_norm": 7.765169620513916, "learning_rate": 1.9999179999081086e-05, "loss": 0.097, "step": 34090 }, { "epoch": 41.133977066988535, "grad_norm": 8.186606407165527, "learning_rate": 1.9999179757826293e-05, "loss": 0.1037, "step": 34100 }, { "epoch": 41.14604707302354, "grad_norm": 7.888797760009766, "learning_rate": 1.99991795165715e-05, "loss": 0.0999, "step": 34110 }, { "epoch": 41.15811707905854, "grad_norm": 8.045211791992188, "learning_rate": 1.9999179275316705e-05, "loss": 0.1022, "step": 34120 }, { "epoch": 41.170187085093545, "grad_norm": 7.445483207702637, "learning_rate": 1.999917903406191e-05, "loss": 0.1018, "step": 34130 }, { "epoch": 41.18225709112855, "grad_norm": 8.601751327514648, "learning_rate": 
1.9999178792807117e-05, "loss": 0.1043, "step": 34140 }, { "epoch": 41.19432709716355, "grad_norm": 8.134662628173828, "learning_rate": 1.9999178551552324e-05, "loss": 0.1044, "step": 34150 }, { "epoch": 41.206397103198555, "grad_norm": 8.276429176330566, "learning_rate": 1.999917831029753e-05, "loss": 0.1051, "step": 34160 }, { "epoch": 41.21846710923356, "grad_norm": 8.179678916931152, "learning_rate": 1.9999178069042736e-05, "loss": 0.1066, "step": 34170 }, { "epoch": 41.23053711526856, "grad_norm": 8.398102760314941, "learning_rate": 1.9999177827787942e-05, "loss": 0.1042, "step": 34180 }, { "epoch": 41.242607121303564, "grad_norm": 7.791502952575684, "learning_rate": 1.999917758653315e-05, "loss": 0.1035, "step": 34190 }, { "epoch": 41.25467712733856, "grad_norm": 8.179070472717285, "learning_rate": 1.9999177345278355e-05, "loss": 0.108, "step": 34200 }, { "epoch": 41.26674713337356, "grad_norm": 8.51523208618164, "learning_rate": 1.999917710402356e-05, "loss": 0.1072, "step": 34210 }, { "epoch": 41.27881713940857, "grad_norm": 8.120832443237305, "learning_rate": 1.9999176862768767e-05, "loss": 0.1099, "step": 34220 }, { "epoch": 41.29088714544357, "grad_norm": 7.994271755218506, "learning_rate": 1.999917662151397e-05, "loss": 0.1112, "step": 34230 }, { "epoch": 41.30295715147857, "grad_norm": 8.420711517333984, "learning_rate": 1.9999176380259176e-05, "loss": 0.1114, "step": 34240 }, { "epoch": 41.315027157513576, "grad_norm": 7.774344444274902, "learning_rate": 1.9999176139004382e-05, "loss": 0.1076, "step": 34250 }, { "epoch": 41.32709716354858, "grad_norm": 7.955318927764893, "learning_rate": 1.999917589774959e-05, "loss": 0.1051, "step": 34260 }, { "epoch": 41.33916716958358, "grad_norm": 8.07659912109375, "learning_rate": 1.9999175656494795e-05, "loss": 0.1072, "step": 34270 }, { "epoch": 41.351237175618586, "grad_norm": 8.637524604797363, "learning_rate": 1.999917541524e-05, "loss": 0.109, "step": 34280 }, { "epoch": 41.36330718165359, "grad_norm": 
8.127988815307617, "learning_rate": 1.9999175173985207e-05, "loss": 0.1073, "step": 34290 }, { "epoch": 41.37537718768859, "grad_norm": 8.495828628540039, "learning_rate": 1.9999174932730413e-05, "loss": 0.1076, "step": 34300 }, { "epoch": 41.387447193723595, "grad_norm": 8.84493637084961, "learning_rate": 1.999917469147562e-05, "loss": 0.1088, "step": 34310 }, { "epoch": 41.3995171997586, "grad_norm": 8.080735206604004, "learning_rate": 1.9999174450220826e-05, "loss": 0.1135, "step": 34320 }, { "epoch": 41.4115872057936, "grad_norm": 8.683029174804688, "learning_rate": 1.9999174208966032e-05, "loss": 0.1135, "step": 34330 }, { "epoch": 41.423657211828605, "grad_norm": 8.53392219543457, "learning_rate": 1.999917396771124e-05, "loss": 0.1102, "step": 34340 }, { "epoch": 41.43572721786361, "grad_norm": 8.652215003967285, "learning_rate": 1.9999173726456445e-05, "loss": 0.1106, "step": 34350 }, { "epoch": 41.44779722389861, "grad_norm": 8.616124153137207, "learning_rate": 1.999917348520165e-05, "loss": 0.1137, "step": 34360 }, { "epoch": 41.459867229933614, "grad_norm": 7.534670352935791, "learning_rate": 1.9999173243946857e-05, "loss": 0.1143, "step": 34370 }, { "epoch": 41.47193723596862, "grad_norm": 8.708889961242676, "learning_rate": 1.9999173002692063e-05, "loss": 0.1151, "step": 34380 }, { "epoch": 41.48400724200362, "grad_norm": 7.662004470825195, "learning_rate": 1.999917276143727e-05, "loss": 0.1111, "step": 34390 }, { "epoch": 41.496077248038624, "grad_norm": 7.791184902191162, "learning_rate": 1.9999172520182476e-05, "loss": 0.1121, "step": 34400 }, { "epoch": 41.50814725407363, "grad_norm": 8.133023262023926, "learning_rate": 1.9999172278927682e-05, "loss": 0.113, "step": 34410 }, { "epoch": 41.52021726010863, "grad_norm": 7.990051746368408, "learning_rate": 1.9999172037672888e-05, "loss": 0.1128, "step": 34420 }, { "epoch": 41.53228726614363, "grad_norm": 8.618638038635254, "learning_rate": 1.9999171796418094e-05, "loss": 0.1144, "step": 34430 }, { 
"epoch": 41.54435727217864, "grad_norm": 8.589627265930176, "learning_rate": 1.99991715551633e-05, "loss": 0.1169, "step": 34440 }, { "epoch": 41.55642727821364, "grad_norm": 8.715862274169922, "learning_rate": 1.9999171313908507e-05, "loss": 0.1101, "step": 34450 }, { "epoch": 41.56849728424864, "grad_norm": 8.927557945251465, "learning_rate": 1.9999171072653713e-05, "loss": 0.1164, "step": 34460 }, { "epoch": 41.580567290283646, "grad_norm": 8.532756805419922, "learning_rate": 1.999917083139892e-05, "loss": 0.1142, "step": 34470 }, { "epoch": 41.59263729631865, "grad_norm": 8.770740509033203, "learning_rate": 1.9999170590144125e-05, "loss": 0.1164, "step": 34480 }, { "epoch": 41.60470730235365, "grad_norm": 9.023125648498535, "learning_rate": 1.999917034888933e-05, "loss": 0.1202, "step": 34490 }, { "epoch": 41.616777308388656, "grad_norm": 8.692851066589355, "learning_rate": 1.9999170107634538e-05, "loss": 0.1149, "step": 34500 }, { "epoch": 41.616777308388656, "eval_loss": 11.605066299438477, "eval_runtime": 8.1255, "eval_samples_per_second": 85.779, "eval_steps_per_second": 10.83, "step": 34500 }, { "epoch": 41.62884731442366, "grad_norm": 9.107776641845703, "learning_rate": 1.9999169866379744e-05, "loss": 0.1225, "step": 34510 }, { "epoch": 41.64091732045866, "grad_norm": 8.0089111328125, "learning_rate": 1.999916962512495e-05, "loss": 0.1154, "step": 34520 }, { "epoch": 41.652987326493665, "grad_norm": 9.043435096740723, "learning_rate": 1.9999169383870156e-05, "loss": 0.1186, "step": 34530 }, { "epoch": 41.66505733252867, "grad_norm": 9.086257934570312, "learning_rate": 1.9999169142615363e-05, "loss": 0.1145, "step": 34540 }, { "epoch": 41.67712733856367, "grad_norm": 9.040194511413574, "learning_rate": 1.999916890136057e-05, "loss": 0.1264, "step": 34550 }, { "epoch": 41.689197344598675, "grad_norm": 8.23604965209961, "learning_rate": 1.9999168660105775e-05, "loss": 0.1184, "step": 34560 }, { "epoch": 41.70126735063368, "grad_norm": 8.401419639587402, 
"learning_rate": 1.999916841885098e-05, "loss": 0.118, "step": 34570 }, { "epoch": 41.71333735666868, "grad_norm": 8.20773983001709, "learning_rate": 1.9999168177596187e-05, "loss": 0.1181, "step": 34580 }, { "epoch": 41.725407362703685, "grad_norm": 9.214927673339844, "learning_rate": 1.9999167936341394e-05, "loss": 0.1181, "step": 34590 }, { "epoch": 41.73747736873869, "grad_norm": 9.106075286865234, "learning_rate": 1.99991676950866e-05, "loss": 0.1228, "step": 34600 }, { "epoch": 41.749547374773684, "grad_norm": 9.064778327941895, "learning_rate": 1.9999167453831806e-05, "loss": 0.119, "step": 34610 }, { "epoch": 41.76161738080869, "grad_norm": 8.942567825317383, "learning_rate": 1.9999167212577012e-05, "loss": 0.1163, "step": 34620 }, { "epoch": 41.77368738684369, "grad_norm": 8.573975563049316, "learning_rate": 1.999916697132222e-05, "loss": 0.1212, "step": 34630 }, { "epoch": 41.78575739287869, "grad_norm": 8.434731483459473, "learning_rate": 1.999916673006742e-05, "loss": 0.1249, "step": 34640 }, { "epoch": 41.7978273989137, "grad_norm": 8.339980125427246, "learning_rate": 1.9999166488812628e-05, "loss": 0.1197, "step": 34650 }, { "epoch": 41.8098974049487, "grad_norm": 9.676884651184082, "learning_rate": 1.9999166247557834e-05, "loss": 0.1202, "step": 34660 }, { "epoch": 41.8219674109837, "grad_norm": 8.757452964782715, "learning_rate": 1.999916600630304e-05, "loss": 0.1214, "step": 34670 }, { "epoch": 41.834037417018706, "grad_norm": 9.015426635742188, "learning_rate": 1.9999165765048246e-05, "loss": 0.1208, "step": 34680 }, { "epoch": 41.84610742305371, "grad_norm": 8.362361907958984, "learning_rate": 1.9999165523793452e-05, "loss": 0.1197, "step": 34690 }, { "epoch": 41.85817742908871, "grad_norm": 8.464852333068848, "learning_rate": 1.999916528253866e-05, "loss": 0.119, "step": 34700 }, { "epoch": 41.870247435123716, "grad_norm": 8.423097610473633, "learning_rate": 1.9999165041283865e-05, "loss": 0.1222, "step": 34710 }, { "epoch": 41.88231744115872, 
"grad_norm": 9.009660720825195, "learning_rate": 1.999916480002907e-05, "loss": 0.123, "step": 34720 }, { "epoch": 41.89438744719372, "grad_norm": 8.242378234863281, "learning_rate": 1.9999164558774277e-05, "loss": 0.1218, "step": 34730 }, { "epoch": 41.906457453228725, "grad_norm": 8.321895599365234, "learning_rate": 1.9999164317519484e-05, "loss": 0.1247, "step": 34740 }, { "epoch": 41.91852745926373, "grad_norm": 8.807455062866211, "learning_rate": 1.999916407626469e-05, "loss": 0.121, "step": 34750 }, { "epoch": 41.93059746529873, "grad_norm": 8.858838081359863, "learning_rate": 1.9999163835009896e-05, "loss": 0.1254, "step": 34760 }, { "epoch": 41.942667471333735, "grad_norm": 9.350523948669434, "learning_rate": 1.9999163593755102e-05, "loss": 0.1252, "step": 34770 }, { "epoch": 41.95473747736874, "grad_norm": 8.894034385681152, "learning_rate": 1.999916335250031e-05, "loss": 0.1229, "step": 34780 }, { "epoch": 41.96680748340374, "grad_norm": 8.769867897033691, "learning_rate": 1.9999163111245515e-05, "loss": 0.1222, "step": 34790 }, { "epoch": 41.978877489438744, "grad_norm": 8.964207649230957, "learning_rate": 1.999916286999072e-05, "loss": 0.1204, "step": 34800 }, { "epoch": 41.99094749547375, "grad_norm": 7.424964904785156, "learning_rate": 1.9999162628735927e-05, "loss": 0.1231, "step": 34810 }, { "epoch": 42.002414001207, "grad_norm": 6.981504917144775, "learning_rate": 1.9999162387481133e-05, "loss": 0.1123, "step": 34820 }, { "epoch": 42.014484007242004, "grad_norm": 8.071561813354492, "learning_rate": 1.999916214622634e-05, "loss": 0.0846, "step": 34830 }, { "epoch": 42.02655401327701, "grad_norm": 7.596538543701172, "learning_rate": 1.9999161904971546e-05, "loss": 0.0885, "step": 34840 }, { "epoch": 42.03862401931201, "grad_norm": 8.083812713623047, "learning_rate": 1.9999161663716752e-05, "loss": 0.0893, "step": 34850 }, { "epoch": 42.05069402534701, "grad_norm": 7.847207069396973, "learning_rate": 1.9999161422461958e-05, "loss": 0.0964, "step": 
34860 }, { "epoch": 42.06276403138202, "grad_norm": 7.095480442047119, "learning_rate": 1.9999161181207164e-05, "loss": 0.0922, "step": 34870 }, { "epoch": 42.07483403741702, "grad_norm": 7.245641708374023, "learning_rate": 1.999916093995237e-05, "loss": 0.0949, "step": 34880 }, { "epoch": 42.08690404345202, "grad_norm": 8.184151649475098, "learning_rate": 1.9999160698697573e-05, "loss": 0.0972, "step": 34890 }, { "epoch": 42.098974049487026, "grad_norm": 8.408392906188965, "learning_rate": 1.999916045744278e-05, "loss": 0.0964, "step": 34900 }, { "epoch": 42.11104405552203, "grad_norm": 8.108321189880371, "learning_rate": 1.9999160216187986e-05, "loss": 0.0968, "step": 34910 }, { "epoch": 42.12311406155703, "grad_norm": 8.633877754211426, "learning_rate": 1.9999159974933192e-05, "loss": 0.1022, "step": 34920 }, { "epoch": 42.135184067592036, "grad_norm": 8.155157089233398, "learning_rate": 1.9999159733678398e-05, "loss": 0.1004, "step": 34930 }, { "epoch": 42.14725407362704, "grad_norm": 7.620727062225342, "learning_rate": 1.9999159492423604e-05, "loss": 0.0996, "step": 34940 }, { "epoch": 42.15932407966204, "grad_norm": 8.240485191345215, "learning_rate": 1.9999159251168814e-05, "loss": 0.0982, "step": 34950 }, { "epoch": 42.171394085697045, "grad_norm": 7.998702526092529, "learning_rate": 1.999915900991402e-05, "loss": 0.1012, "step": 34960 }, { "epoch": 42.18346409173205, "grad_norm": 7.827645778656006, "learning_rate": 1.9999158768659226e-05, "loss": 0.0997, "step": 34970 }, { "epoch": 42.19553409776705, "grad_norm": 7.898451805114746, "learning_rate": 1.9999158527404433e-05, "loss": 0.1031, "step": 34980 }, { "epoch": 42.207604103802055, "grad_norm": 8.077980041503906, "learning_rate": 1.999915828614964e-05, "loss": 0.104, "step": 34990 }, { "epoch": 42.21967410983706, "grad_norm": 7.463453769683838, "learning_rate": 1.9999158044894845e-05, "loss": 0.1004, "step": 35000 }, { "epoch": 42.21967410983706, "eval_loss": 11.598748207092285, "eval_runtime": 8.1311, 
"eval_samples_per_second": 85.72, "eval_steps_per_second": 10.823, "step": 35000 }, { "epoch": 42.23174411587206, "grad_norm": 7.92126989364624, "learning_rate": 1.999915780364005e-05, "loss": 0.1023, "step": 35010 }, { "epoch": 42.243814121907064, "grad_norm": 8.57087516784668, "learning_rate": 1.9999157562385258e-05, "loss": 0.1015, "step": 35020 }, { "epoch": 42.25588412794206, "grad_norm": 7.897972583770752, "learning_rate": 1.9999157321130464e-05, "loss": 0.1033, "step": 35030 }, { "epoch": 42.267954133977064, "grad_norm": 8.073040962219238, "learning_rate": 1.999915707987567e-05, "loss": 0.1036, "step": 35040 }, { "epoch": 42.28002414001207, "grad_norm": 8.2291898727417, "learning_rate": 1.9999156838620876e-05, "loss": 0.1038, "step": 35050 }, { "epoch": 42.29209414604707, "grad_norm": 8.031829833984375, "learning_rate": 1.999915659736608e-05, "loss": 0.1063, "step": 35060 }, { "epoch": 42.30416415208207, "grad_norm": 8.291048049926758, "learning_rate": 1.9999156356111285e-05, "loss": 0.1061, "step": 35070 }, { "epoch": 42.316234158117076, "grad_norm": 7.925404071807861, "learning_rate": 1.999915611485649e-05, "loss": 0.1111, "step": 35080 }, { "epoch": 42.32830416415208, "grad_norm": 8.415411949157715, "learning_rate": 1.9999155873601698e-05, "loss": 0.1059, "step": 35090 }, { "epoch": 42.34037417018708, "grad_norm": 7.706505298614502, "learning_rate": 1.9999155632346904e-05, "loss": 0.106, "step": 35100 }, { "epoch": 42.352444176222086, "grad_norm": 8.096033096313477, "learning_rate": 1.999915539109211e-05, "loss": 0.1065, "step": 35110 }, { "epoch": 42.36451418225709, "grad_norm": 8.868087768554688, "learning_rate": 1.9999155149837316e-05, "loss": 0.1113, "step": 35120 }, { "epoch": 42.37658418829209, "grad_norm": 8.253702163696289, "learning_rate": 1.9999154908582523e-05, "loss": 0.1065, "step": 35130 }, { "epoch": 42.388654194327096, "grad_norm": 8.733293533325195, "learning_rate": 1.999915466732773e-05, "loss": 0.1093, "step": 35140 }, { "epoch": 
42.4007242003621, "grad_norm": 8.41896915435791, "learning_rate": 1.9999154426072935e-05, "loss": 0.1102, "step": 35150 }, { "epoch": 42.4127942063971, "grad_norm": 8.561758995056152, "learning_rate": 1.999915418481814e-05, "loss": 0.1065, "step": 35160 }, { "epoch": 42.424864212432105, "grad_norm": 8.12000560760498, "learning_rate": 1.9999153943563347e-05, "loss": 0.1079, "step": 35170 }, { "epoch": 42.43693421846711, "grad_norm": 8.088066101074219, "learning_rate": 1.9999153702308554e-05, "loss": 0.1094, "step": 35180 }, { "epoch": 42.44900422450211, "grad_norm": 8.411897659301758, "learning_rate": 1.999915346105376e-05, "loss": 0.1092, "step": 35190 }, { "epoch": 42.461074230537115, "grad_norm": 7.548657417297363, "learning_rate": 1.9999153219798966e-05, "loss": 0.1069, "step": 35200 }, { "epoch": 42.47314423657212, "grad_norm": 8.594258308410645, "learning_rate": 1.9999152978544172e-05, "loss": 0.1107, "step": 35210 }, { "epoch": 42.48521424260712, "grad_norm": 8.529069900512695, "learning_rate": 1.999915273728938e-05, "loss": 0.1114, "step": 35220 }, { "epoch": 42.497284248642124, "grad_norm": 8.210339546203613, "learning_rate": 1.9999152496034585e-05, "loss": 0.1111, "step": 35230 }, { "epoch": 42.50935425467713, "grad_norm": 7.966717720031738, "learning_rate": 1.999915225477979e-05, "loss": 0.1095, "step": 35240 }, { "epoch": 42.52142426071213, "grad_norm": 8.348084449768066, "learning_rate": 1.9999152013524997e-05, "loss": 0.1122, "step": 35250 }, { "epoch": 42.533494266747134, "grad_norm": 7.315126895904541, "learning_rate": 1.9999151772270203e-05, "loss": 0.1148, "step": 35260 }, { "epoch": 42.54556427278214, "grad_norm": 8.881209373474121, "learning_rate": 1.999915153101541e-05, "loss": 0.1159, "step": 35270 }, { "epoch": 42.55763427881714, "grad_norm": 7.963779926300049, "learning_rate": 1.9999151289760616e-05, "loss": 0.1113, "step": 35280 }, { "epoch": 42.56970428485214, "grad_norm": 8.183577537536621, "learning_rate": 1.9999151048505822e-05, "loss": 
0.1124, "step": 35290 }, { "epoch": 42.58177429088715, "grad_norm": 8.967522621154785, "learning_rate": 1.9999150807251028e-05, "loss": 0.1134, "step": 35300 }, { "epoch": 42.59384429692215, "grad_norm": 8.484280586242676, "learning_rate": 1.999915056599623e-05, "loss": 0.1113, "step": 35310 }, { "epoch": 42.60591430295715, "grad_norm": 8.434849739074707, "learning_rate": 1.9999150324741437e-05, "loss": 0.1119, "step": 35320 }, { "epoch": 42.617984308992156, "grad_norm": 9.221152305603027, "learning_rate": 1.9999150083486643e-05, "loss": 0.1136, "step": 35330 }, { "epoch": 42.63005431502716, "grad_norm": 9.041547775268555, "learning_rate": 1.999914984223185e-05, "loss": 0.1138, "step": 35340 }, { "epoch": 42.64212432106216, "grad_norm": 8.882139205932617, "learning_rate": 1.9999149600977056e-05, "loss": 0.1108, "step": 35350 }, { "epoch": 42.654194327097166, "grad_norm": 9.068334579467773, "learning_rate": 1.9999149359722262e-05, "loss": 0.1141, "step": 35360 }, { "epoch": 42.66626433313217, "grad_norm": 8.98112678527832, "learning_rate": 1.999914911846747e-05, "loss": 0.1139, "step": 35370 }, { "epoch": 42.67833433916717, "grad_norm": 8.148063659667969, "learning_rate": 1.9999148877212675e-05, "loss": 0.1103, "step": 35380 }, { "epoch": 42.690404345202175, "grad_norm": 8.060462951660156, "learning_rate": 1.999914863595788e-05, "loss": 0.1143, "step": 35390 }, { "epoch": 42.70247435123718, "grad_norm": 8.5453519821167, "learning_rate": 1.9999148394703087e-05, "loss": 0.1144, "step": 35400 }, { "epoch": 42.71454435727218, "grad_norm": 8.058005332946777, "learning_rate": 1.9999148153448293e-05, "loss": 0.1168, "step": 35410 }, { "epoch": 42.726614363307185, "grad_norm": 8.813763618469238, "learning_rate": 1.99991479121935e-05, "loss": 0.113, "step": 35420 }, { "epoch": 42.73868436934219, "grad_norm": 8.305359840393066, "learning_rate": 1.9999147670938706e-05, "loss": 0.1157, "step": 35430 }, { "epoch": 42.750754375377184, "grad_norm": 8.693392753601074, 
"learning_rate": 1.9999147429683912e-05, "loss": 0.1134, "step": 35440 }, { "epoch": 42.76282438141219, "grad_norm": 8.13950252532959, "learning_rate": 1.9999147188429118e-05, "loss": 0.1139, "step": 35450 }, { "epoch": 42.77489438744719, "grad_norm": 8.816286087036133, "learning_rate": 1.9999146947174324e-05, "loss": 0.1189, "step": 35460 }, { "epoch": 42.786964393482194, "grad_norm": 9.01662540435791, "learning_rate": 1.999914670591953e-05, "loss": 0.1122, "step": 35470 }, { "epoch": 42.7990343995172, "grad_norm": 8.153149604797363, "learning_rate": 1.9999146464664737e-05, "loss": 0.1168, "step": 35480 }, { "epoch": 42.8111044055522, "grad_norm": 9.518385887145996, "learning_rate": 1.9999146223409943e-05, "loss": 0.116, "step": 35490 }, { "epoch": 42.8231744115872, "grad_norm": 8.093730926513672, "learning_rate": 1.999914598215515e-05, "loss": 0.1203, "step": 35500 }, { "epoch": 42.8231744115872, "eval_loss": 11.63310718536377, "eval_runtime": 8.1365, "eval_samples_per_second": 85.664, "eval_steps_per_second": 10.816, "step": 35500 }, { "epoch": 42.83524441762221, "grad_norm": 8.800875663757324, "learning_rate": 1.9999145740900355e-05, "loss": 0.1201, "step": 35510 }, { "epoch": 42.84731442365721, "grad_norm": 8.87839126586914, "learning_rate": 1.999914549964556e-05, "loss": 0.1188, "step": 35520 }, { "epoch": 42.85938442969221, "grad_norm": 8.115975379943848, "learning_rate": 1.9999145258390768e-05, "loss": 0.1182, "step": 35530 }, { "epoch": 42.871454435727216, "grad_norm": 8.681598663330078, "learning_rate": 1.9999145017135974e-05, "loss": 0.1165, "step": 35540 }, { "epoch": 42.88352444176222, "grad_norm": 7.831724643707275, "learning_rate": 1.999914477588118e-05, "loss": 0.1197, "step": 35550 }, { "epoch": 42.89559444779722, "grad_norm": 9.089218139648438, "learning_rate": 1.9999144534626386e-05, "loss": 0.1147, "step": 35560 }, { "epoch": 42.907664453832226, "grad_norm": 9.004312515258789, "learning_rate": 1.9999144293371593e-05, "loss": 0.1203, "step": 
35570 }, { "epoch": 42.91973445986723, "grad_norm": 9.110687255859375, "learning_rate": 1.99991440521168e-05, "loss": 0.1193, "step": 35580 }, { "epoch": 42.93180446590223, "grad_norm": 9.311285972595215, "learning_rate": 1.9999143810862005e-05, "loss": 0.1248, "step": 35590 }, { "epoch": 42.943874471937235, "grad_norm": 8.901398658752441, "learning_rate": 1.999914356960721e-05, "loss": 0.1191, "step": 35600 }, { "epoch": 42.95594447797224, "grad_norm": 8.633712768554688, "learning_rate": 1.9999143328352418e-05, "loss": 0.1194, "step": 35610 }, { "epoch": 42.96801448400724, "grad_norm": 8.60753059387207, "learning_rate": 1.9999143087097624e-05, "loss": 0.1215, "step": 35620 }, { "epoch": 42.980084490042245, "grad_norm": 8.584927558898926, "learning_rate": 1.999914284584283e-05, "loss": 0.1201, "step": 35630 }, { "epoch": 42.99215449607725, "grad_norm": 8.290674209594727, "learning_rate": 1.9999142604588036e-05, "loss": 0.1178, "step": 35640 }, { "epoch": 43.0036210018105, "grad_norm": 6.772762298583984, "learning_rate": 1.9999142363333242e-05, "loss": 0.1049, "step": 35650 }, { "epoch": 43.015691007845504, "grad_norm": 7.4671125411987305, "learning_rate": 1.999914212207845e-05, "loss": 0.0847, "step": 35660 }, { "epoch": 43.02776101388051, "grad_norm": 7.979122638702393, "learning_rate": 1.9999141880823655e-05, "loss": 0.0863, "step": 35670 }, { "epoch": 43.03983101991551, "grad_norm": 7.774200916290283, "learning_rate": 1.999914163956886e-05, "loss": 0.0886, "step": 35680 }, { "epoch": 43.051901025950514, "grad_norm": 7.361456871032715, "learning_rate": 1.9999141398314067e-05, "loss": 0.0913, "step": 35690 }, { "epoch": 43.06397103198552, "grad_norm": 7.413816452026367, "learning_rate": 1.9999141157059273e-05, "loss": 0.0895, "step": 35700 }, { "epoch": 43.07604103802052, "grad_norm": 7.276921272277832, "learning_rate": 1.999914091580448e-05, "loss": 0.0919, "step": 35710 }, { "epoch": 43.08811104405552, "grad_norm": 7.119116306304932, "learning_rate": 
1.9999140674549682e-05, "loss": 0.091, "step": 35720 }, { "epoch": 43.10018105009053, "grad_norm": 8.125150680541992, "learning_rate": 1.999914043329489e-05, "loss": 0.0909, "step": 35730 }, { "epoch": 43.11225105612553, "grad_norm": 6.422246932983398, "learning_rate": 1.9999140192040095e-05, "loss": 0.0922, "step": 35740 }, { "epoch": 43.12432106216053, "grad_norm": 7.666179656982422, "learning_rate": 1.99991399507853e-05, "loss": 0.0947, "step": 35750 }, { "epoch": 43.136391068195536, "grad_norm": 7.602639675140381, "learning_rate": 1.9999139709530507e-05, "loss": 0.0953, "step": 35760 }, { "epoch": 43.14846107423054, "grad_norm": 7.286924362182617, "learning_rate": 1.9999139468275714e-05, "loss": 0.0944, "step": 35770 }, { "epoch": 43.16053108026554, "grad_norm": 7.216557025909424, "learning_rate": 1.999913922702092e-05, "loss": 0.0938, "step": 35780 }, { "epoch": 43.172601086300546, "grad_norm": 8.167135238647461, "learning_rate": 1.9999138985766126e-05, "loss": 0.0995, "step": 35790 }, { "epoch": 43.18467109233555, "grad_norm": 8.40335750579834, "learning_rate": 1.9999138744511332e-05, "loss": 0.0997, "step": 35800 }, { "epoch": 43.19674109837055, "grad_norm": 7.590811729431152, "learning_rate": 1.999913850325654e-05, "loss": 0.099, "step": 35810 }, { "epoch": 43.208811104405555, "grad_norm": 7.333605766296387, "learning_rate": 1.9999138262001745e-05, "loss": 0.098, "step": 35820 }, { "epoch": 43.22088111044056, "grad_norm": 7.679373741149902, "learning_rate": 1.999913802074695e-05, "loss": 0.098, "step": 35830 }, { "epoch": 43.23295111647556, "grad_norm": 7.656306743621826, "learning_rate": 1.9999137779492157e-05, "loss": 0.0964, "step": 35840 }, { "epoch": 43.245021122510565, "grad_norm": 7.936821460723877, "learning_rate": 1.9999137538237363e-05, "loss": 0.1005, "step": 35850 }, { "epoch": 43.25709112854556, "grad_norm": 8.444130897521973, "learning_rate": 1.999913729698257e-05, "loss": 0.0993, "step": 35860 }, { "epoch": 43.269161134580564, "grad_norm": 
8.383756637573242, "learning_rate": 1.9999137055727776e-05, "loss": 0.1012, "step": 35870 }, { "epoch": 43.28123114061557, "grad_norm": 8.270936965942383, "learning_rate": 1.9999136814472982e-05, "loss": 0.1014, "step": 35880 }, { "epoch": 43.29330114665057, "grad_norm": 7.998587131500244, "learning_rate": 1.9999136573218188e-05, "loss": 0.1073, "step": 35890 }, { "epoch": 43.305371152685574, "grad_norm": 7.4375104904174805, "learning_rate": 1.9999136331963394e-05, "loss": 0.1014, "step": 35900 }, { "epoch": 43.31744115872058, "grad_norm": 7.6620354652404785, "learning_rate": 1.99991360907086e-05, "loss": 0.1017, "step": 35910 }, { "epoch": 43.32951116475558, "grad_norm": 7.910877227783203, "learning_rate": 1.9999135849453807e-05, "loss": 0.1014, "step": 35920 }, { "epoch": 43.34158117079058, "grad_norm": 7.519763469696045, "learning_rate": 1.9999135608199013e-05, "loss": 0.107, "step": 35930 }, { "epoch": 43.353651176825586, "grad_norm": 7.7889790534973145, "learning_rate": 1.999913536694422e-05, "loss": 0.1005, "step": 35940 }, { "epoch": 43.36572118286059, "grad_norm": 7.411113739013672, "learning_rate": 1.9999135125689425e-05, "loss": 0.103, "step": 35950 }, { "epoch": 43.37779118889559, "grad_norm": 8.443138122558594, "learning_rate": 1.999913488443463e-05, "loss": 0.1046, "step": 35960 }, { "epoch": 43.389861194930596, "grad_norm": 8.086044311523438, "learning_rate": 1.9999134643179834e-05, "loss": 0.1049, "step": 35970 }, { "epoch": 43.4019312009656, "grad_norm": 7.509288311004639, "learning_rate": 1.999913440192504e-05, "loss": 0.1054, "step": 35980 }, { "epoch": 43.4140012070006, "grad_norm": 7.909120082855225, "learning_rate": 1.9999134160670247e-05, "loss": 0.1051, "step": 35990 }, { "epoch": 43.426071213035605, "grad_norm": 7.779096603393555, "learning_rate": 1.9999133919415453e-05, "loss": 0.1033, "step": 36000 }, { "epoch": 43.426071213035605, "eval_loss": 11.662515640258789, "eval_runtime": 8.1222, "eval_samples_per_second": 85.814, 
"eval_steps_per_second": 10.835, "step": 36000 }, { "epoch": 43.43814121907061, "grad_norm": 7.537408351898193, "learning_rate": 1.999913367816066e-05, "loss": 0.1067, "step": 36010 }, { "epoch": 43.45021122510561, "grad_norm": 7.806889057159424, "learning_rate": 1.9999133436905866e-05, "loss": 0.1023, "step": 36020 }, { "epoch": 43.462281231140615, "grad_norm": 8.920476913452148, "learning_rate": 1.9999133195651075e-05, "loss": 0.1067, "step": 36030 }, { "epoch": 43.47435123717562, "grad_norm": 8.21190071105957, "learning_rate": 1.999913295439628e-05, "loss": 0.1087, "step": 36040 }, { "epoch": 43.48642124321062, "grad_norm": 7.829590320587158, "learning_rate": 1.9999132713141488e-05, "loss": 0.1075, "step": 36050 }, { "epoch": 43.498491249245625, "grad_norm": 8.165225982666016, "learning_rate": 1.9999132471886694e-05, "loss": 0.1078, "step": 36060 }, { "epoch": 43.51056125528063, "grad_norm": 8.167505264282227, "learning_rate": 1.99991322306319e-05, "loss": 0.1072, "step": 36070 }, { "epoch": 43.52263126131563, "grad_norm": 8.177483558654785, "learning_rate": 1.9999131989377106e-05, "loss": 0.1064, "step": 36080 }, { "epoch": 43.534701267350634, "grad_norm": 7.616897106170654, "learning_rate": 1.9999131748122312e-05, "loss": 0.1095, "step": 36090 }, { "epoch": 43.54677127338564, "grad_norm": 8.055828094482422, "learning_rate": 1.999913150686752e-05, "loss": 0.1108, "step": 36100 }, { "epoch": 43.55884127942064, "grad_norm": 8.6476411819458, "learning_rate": 1.9999131265612725e-05, "loss": 0.1094, "step": 36110 }, { "epoch": 43.570911285455644, "grad_norm": 8.650827407836914, "learning_rate": 1.999913102435793e-05, "loss": 0.113, "step": 36120 }, { "epoch": 43.58298129149065, "grad_norm": 8.082877159118652, "learning_rate": 1.9999130783103134e-05, "loss": 0.11, "step": 36130 }, { "epoch": 43.59505129752565, "grad_norm": 8.960680961608887, "learning_rate": 1.999913054184834e-05, "loss": 0.1084, "step": 36140 }, { "epoch": 43.60712130356065, "grad_norm": 
8.189901351928711, "learning_rate": 1.9999130300593546e-05, "loss": 0.106, "step": 36150 }, { "epoch": 43.61919130959566, "grad_norm": 7.716009616851807, "learning_rate": 1.9999130059338753e-05, "loss": 0.1094, "step": 36160 }, { "epoch": 43.63126131563066, "grad_norm": 8.420042991638184, "learning_rate": 1.999912981808396e-05, "loss": 0.1083, "step": 36170 }, { "epoch": 43.64333132166566, "grad_norm": 8.091172218322754, "learning_rate": 1.9999129576829165e-05, "loss": 0.1077, "step": 36180 }, { "epoch": 43.655401327700666, "grad_norm": 8.840784072875977, "learning_rate": 1.999912933557437e-05, "loss": 0.1109, "step": 36190 }, { "epoch": 43.66747133373567, "grad_norm": 8.571383476257324, "learning_rate": 1.9999129094319577e-05, "loss": 0.1093, "step": 36200 }, { "epoch": 43.67954133977067, "grad_norm": 8.480493545532227, "learning_rate": 1.9999128853064784e-05, "loss": 0.1093, "step": 36210 }, { "epoch": 43.691611345805676, "grad_norm": 8.142501831054688, "learning_rate": 1.999912861180999e-05, "loss": 0.1089, "step": 36220 }, { "epoch": 43.70368135184068, "grad_norm": 7.911145210266113, "learning_rate": 1.9999128370555196e-05, "loss": 0.1133, "step": 36230 }, { "epoch": 43.71575135787568, "grad_norm": 8.315420150756836, "learning_rate": 1.9999128129300402e-05, "loss": 0.1088, "step": 36240 }, { "epoch": 43.727821363910685, "grad_norm": 8.390474319458008, "learning_rate": 1.999912788804561e-05, "loss": 0.1133, "step": 36250 }, { "epoch": 43.73989136994569, "grad_norm": 8.73437213897705, "learning_rate": 1.9999127646790815e-05, "loss": 0.1111, "step": 36260 }, { "epoch": 43.751961375980684, "grad_norm": 8.384319305419922, "learning_rate": 1.999912740553602e-05, "loss": 0.1107, "step": 36270 }, { "epoch": 43.76403138201569, "grad_norm": 9.048318862915039, "learning_rate": 1.9999127164281227e-05, "loss": 0.117, "step": 36280 }, { "epoch": 43.77610138805069, "grad_norm": 8.515769004821777, "learning_rate": 1.9999126923026433e-05, "loss": 0.1163, "step": 36290 }, { 
"epoch": 43.788171394085694, "grad_norm": 8.566049575805664, "learning_rate": 1.999912668177164e-05, "loss": 0.1166, "step": 36300 }, { "epoch": 43.8002414001207, "grad_norm": 8.947233200073242, "learning_rate": 1.9999126440516846e-05, "loss": 0.1146, "step": 36310 }, { "epoch": 43.8123114061557, "grad_norm": 8.284265518188477, "learning_rate": 1.9999126199262052e-05, "loss": 0.1121, "step": 36320 }, { "epoch": 43.824381412190704, "grad_norm": 8.204765319824219, "learning_rate": 1.9999125958007258e-05, "loss": 0.1134, "step": 36330 }, { "epoch": 43.83645141822571, "grad_norm": 8.834737777709961, "learning_rate": 1.9999125716752464e-05, "loss": 0.1156, "step": 36340 }, { "epoch": 43.84852142426071, "grad_norm": 9.359312057495117, "learning_rate": 1.999912547549767e-05, "loss": 0.1132, "step": 36350 }, { "epoch": 43.86059143029571, "grad_norm": 8.247825622558594, "learning_rate": 1.9999125234242877e-05, "loss": 0.1155, "step": 36360 }, { "epoch": 43.872661436330716, "grad_norm": 8.050289154052734, "learning_rate": 1.9999124992988083e-05, "loss": 0.1152, "step": 36370 }, { "epoch": 43.88473144236572, "grad_norm": 8.794509887695312, "learning_rate": 1.9999124751733286e-05, "loss": 0.1122, "step": 36380 }, { "epoch": 43.89680144840072, "grad_norm": 8.22842025756836, "learning_rate": 1.9999124510478492e-05, "loss": 0.1127, "step": 36390 }, { "epoch": 43.908871454435726, "grad_norm": 8.112460136413574, "learning_rate": 1.99991242692237e-05, "loss": 0.1143, "step": 36400 }, { "epoch": 43.92094146047073, "grad_norm": 9.110854148864746, "learning_rate": 1.9999124027968905e-05, "loss": 0.1164, "step": 36410 }, { "epoch": 43.93301146650573, "grad_norm": 8.625025749206543, "learning_rate": 1.999912378671411e-05, "loss": 0.1169, "step": 36420 }, { "epoch": 43.945081472540735, "grad_norm": 8.468772888183594, "learning_rate": 1.9999123545459317e-05, "loss": 0.1173, "step": 36430 }, { "epoch": 43.95715147857574, "grad_norm": 8.83586597442627, "learning_rate": 
1.9999123304204523e-05, "loss": 0.1136, "step": 36440 }, { "epoch": 43.96922148461074, "grad_norm": 8.685946464538574, "learning_rate": 1.999912306294973e-05, "loss": 0.1153, "step": 36450 }, { "epoch": 43.981291490645745, "grad_norm": 9.321817398071289, "learning_rate": 1.9999122821694936e-05, "loss": 0.1182, "step": 36460 }, { "epoch": 43.99336149668075, "grad_norm": 9.001611709594727, "learning_rate": 1.9999122580440142e-05, "loss": 0.1157, "step": 36470 }, { "epoch": 44.004828002414, "grad_norm": 6.50749397277832, "learning_rate": 1.9999122339185348e-05, "loss": 0.1025, "step": 36480 }, { "epoch": 44.016898008449004, "grad_norm": 7.625373363494873, "learning_rate": 1.9999122097930554e-05, "loss": 0.0797, "step": 36490 }, { "epoch": 44.02896801448401, "grad_norm": 8.171028137207031, "learning_rate": 1.999912185667576e-05, "loss": 0.0827, "step": 36500 }, { "epoch": 44.02896801448401, "eval_loss": 11.65872573852539, "eval_runtime": 8.127, "eval_samples_per_second": 85.763, "eval_steps_per_second": 10.828, "step": 36500 }, { "epoch": 44.04103802051901, "grad_norm": 7.5197834968566895, "learning_rate": 1.9999121615420967e-05, "loss": 0.0869, "step": 36510 }, { "epoch": 44.053108026554014, "grad_norm": 7.247259616851807, "learning_rate": 1.9999121374166173e-05, "loss": 0.0872, "step": 36520 }, { "epoch": 44.06517803258902, "grad_norm": 6.620570659637451, "learning_rate": 1.999912113291138e-05, "loss": 0.0916, "step": 36530 }, { "epoch": 44.07724803862402, "grad_norm": 6.744095802307129, "learning_rate": 1.9999120891656585e-05, "loss": 0.0888, "step": 36540 }, { "epoch": 44.089318044659024, "grad_norm": 7.7801079750061035, "learning_rate": 1.999912065040179e-05, "loss": 0.0891, "step": 36550 }, { "epoch": 44.10138805069403, "grad_norm": 6.786291599273682, "learning_rate": 1.9999120409146998e-05, "loss": 0.0898, "step": 36560 }, { "epoch": 44.11345805672903, "grad_norm": 8.063508033752441, "learning_rate": 1.9999120167892204e-05, "loss": 0.0964, "step": 36570 }, { 
"epoch": 44.12552806276403, "grad_norm": 6.7097392082214355, "learning_rate": 1.999911992663741e-05, "loss": 0.0923, "step": 36580 }, { "epoch": 44.137598068799036, "grad_norm": 6.9217000007629395, "learning_rate": 1.9999119685382616e-05, "loss": 0.0954, "step": 36590 }, { "epoch": 44.14966807483404, "grad_norm": 6.918553352355957, "learning_rate": 1.9999119444127823e-05, "loss": 0.0922, "step": 36600 }, { "epoch": 44.16173808086904, "grad_norm": 7.013489723205566, "learning_rate": 1.999911920287303e-05, "loss": 0.0969, "step": 36610 }, { "epoch": 44.173808086904046, "grad_norm": 8.437304496765137, "learning_rate": 1.9999118961618235e-05, "loss": 0.0962, "step": 36620 }, { "epoch": 44.18587809293905, "grad_norm": 7.157087326049805, "learning_rate": 1.999911872036344e-05, "loss": 0.0974, "step": 36630 }, { "epoch": 44.19794809897405, "grad_norm": 7.7815704345703125, "learning_rate": 1.9999118479108648e-05, "loss": 0.0953, "step": 36640 }, { "epoch": 44.210018105009055, "grad_norm": 7.624409198760986, "learning_rate": 1.9999118237853854e-05, "loss": 0.0959, "step": 36650 }, { "epoch": 44.22208811104406, "grad_norm": 8.240124702453613, "learning_rate": 1.999911799659906e-05, "loss": 0.1002, "step": 36660 }, { "epoch": 44.23415811707906, "grad_norm": 7.274369716644287, "learning_rate": 1.9999117755344266e-05, "loss": 0.0996, "step": 36670 }, { "epoch": 44.246228123114065, "grad_norm": 7.916162490844727, "learning_rate": 1.9999117514089472e-05, "loss": 0.1, "step": 36680 }, { "epoch": 44.25829812914906, "grad_norm": 7.705639362335205, "learning_rate": 1.999911727283468e-05, "loss": 0.098, "step": 36690 }, { "epoch": 44.270368135184064, "grad_norm": 7.374164581298828, "learning_rate": 1.9999117031579885e-05, "loss": 0.0978, "step": 36700 }, { "epoch": 44.28243814121907, "grad_norm": 7.453375339508057, "learning_rate": 1.999911679032509e-05, "loss": 0.0972, "step": 36710 }, { "epoch": 44.29450814725407, "grad_norm": 7.100893020629883, "learning_rate": 
1.9999116549070297e-05, "loss": 0.0985, "step": 36720 }, { "epoch": 44.306578153289074, "grad_norm": 7.465360164642334, "learning_rate": 1.9999116307815503e-05, "loss": 0.0999, "step": 36730 }, { "epoch": 44.31864815932408, "grad_norm": 7.653491020202637, "learning_rate": 1.999911606656071e-05, "loss": 0.1004, "step": 36740 }, { "epoch": 44.33071816535908, "grad_norm": 8.312492370605469, "learning_rate": 1.9999115825305916e-05, "loss": 0.1022, "step": 36750 }, { "epoch": 44.34278817139408, "grad_norm": 7.824400424957275, "learning_rate": 1.9999115584051122e-05, "loss": 0.1032, "step": 36760 }, { "epoch": 44.35485817742909, "grad_norm": 7.23616361618042, "learning_rate": 1.999911534279633e-05, "loss": 0.1007, "step": 36770 }, { "epoch": 44.36692818346409, "grad_norm": 7.627196311950684, "learning_rate": 1.9999115101541535e-05, "loss": 0.0993, "step": 36780 }, { "epoch": 44.37899818949909, "grad_norm": 7.890810966491699, "learning_rate": 1.999911486028674e-05, "loss": 0.0996, "step": 36790 }, { "epoch": 44.391068195534096, "grad_norm": 7.933648109436035, "learning_rate": 1.9999114619031944e-05, "loss": 0.1016, "step": 36800 }, { "epoch": 44.4031382015691, "grad_norm": 7.912554740905762, "learning_rate": 1.999911437777715e-05, "loss": 0.1062, "step": 36810 }, { "epoch": 44.4152082076041, "grad_norm": 8.184324264526367, "learning_rate": 1.9999114136522356e-05, "loss": 0.103, "step": 36820 }, { "epoch": 44.427278213639106, "grad_norm": 8.604528427124023, "learning_rate": 1.9999113895267562e-05, "loss": 0.1071, "step": 36830 }, { "epoch": 44.43934821967411, "grad_norm": 8.342833518981934, "learning_rate": 1.999911365401277e-05, "loss": 0.1044, "step": 36840 }, { "epoch": 44.45141822570911, "grad_norm": 7.661088466644287, "learning_rate": 1.9999113412757975e-05, "loss": 0.1, "step": 36850 }, { "epoch": 44.463488231744115, "grad_norm": 7.988140106201172, "learning_rate": 1.999911317150318e-05, "loss": 0.1023, "step": 36860 }, { "epoch": 44.47555823777912, "grad_norm": 
8.397379875183105, "learning_rate": 1.9999112930248387e-05, "loss": 0.1058, "step": 36870 }, { "epoch": 44.48762824381412, "grad_norm": 8.534106254577637, "learning_rate": 1.9999112688993593e-05, "loss": 0.1014, "step": 36880 }, { "epoch": 44.499698249849125, "grad_norm": 8.430763244628906, "learning_rate": 1.99991124477388e-05, "loss": 0.1054, "step": 36890 }, { "epoch": 44.51176825588413, "grad_norm": 8.136177062988281, "learning_rate": 1.9999112206484006e-05, "loss": 0.105, "step": 36900 }, { "epoch": 44.52383826191913, "grad_norm": 8.116512298583984, "learning_rate": 1.9999111965229212e-05, "loss": 0.1081, "step": 36910 }, { "epoch": 44.535908267954134, "grad_norm": 7.767886161804199, "learning_rate": 1.9999111723974418e-05, "loss": 0.1022, "step": 36920 }, { "epoch": 44.54797827398914, "grad_norm": 8.140549659729004, "learning_rate": 1.9999111482719624e-05, "loss": 0.1048, "step": 36930 }, { "epoch": 44.56004828002414, "grad_norm": 8.052779197692871, "learning_rate": 1.999911124146483e-05, "loss": 0.1062, "step": 36940 }, { "epoch": 44.572118286059144, "grad_norm": 7.669057369232178, "learning_rate": 1.9999111000210037e-05, "loss": 0.1021, "step": 36950 }, { "epoch": 44.58418829209415, "grad_norm": 7.665679454803467, "learning_rate": 1.9999110758955243e-05, "loss": 0.1035, "step": 36960 }, { "epoch": 44.59625829812915, "grad_norm": 8.590936660766602, "learning_rate": 1.999911051770045e-05, "loss": 0.1069, "step": 36970 }, { "epoch": 44.608328304164154, "grad_norm": 8.795587539672852, "learning_rate": 1.9999110276445655e-05, "loss": 0.106, "step": 36980 }, { "epoch": 44.62039831019916, "grad_norm": 8.024588584899902, "learning_rate": 1.999911003519086e-05, "loss": 0.1018, "step": 36990 }, { "epoch": 44.63246831623416, "grad_norm": 7.558562278747559, "learning_rate": 1.9999109793936068e-05, "loss": 0.1049, "step": 37000 }, { "epoch": 44.63246831623416, "eval_loss": 11.691576957702637, "eval_runtime": 8.1342, "eval_samples_per_second": 85.687, 
"eval_steps_per_second": 10.818, "step": 37000 }, { "epoch": 44.64453832226916, "grad_norm": 7.925698757171631, "learning_rate": 1.9999109552681274e-05, "loss": 0.103, "step": 37010 }, { "epoch": 44.656608328304166, "grad_norm": 8.384378433227539, "learning_rate": 1.999910931142648e-05, "loss": 0.1028, "step": 37020 }, { "epoch": 44.66867833433917, "grad_norm": 8.352166175842285, "learning_rate": 1.9999109070171687e-05, "loss": 0.1076, "step": 37030 }, { "epoch": 44.68074834037417, "grad_norm": 6.889833450317383, "learning_rate": 1.9999108828916893e-05, "loss": 0.1037, "step": 37040 }, { "epoch": 44.692818346409176, "grad_norm": 7.73795223236084, "learning_rate": 1.9999108587662096e-05, "loss": 0.1088, "step": 37050 }, { "epoch": 44.70488835244418, "grad_norm": 8.03374195098877, "learning_rate": 1.9999108346407302e-05, "loss": 0.1098, "step": 37060 }, { "epoch": 44.71695835847918, "grad_norm": 7.349059104919434, "learning_rate": 1.9999108105152508e-05, "loss": 0.1081, "step": 37070 }, { "epoch": 44.729028364514186, "grad_norm": 7.4223527908325195, "learning_rate": 1.9999107863897714e-05, "loss": 0.1071, "step": 37080 }, { "epoch": 44.74109837054919, "grad_norm": 9.395209312438965, "learning_rate": 1.999910762264292e-05, "loss": 0.1113, "step": 37090 }, { "epoch": 44.753168376584185, "grad_norm": 8.872891426086426, "learning_rate": 1.9999107381388127e-05, "loss": 0.1101, "step": 37100 }, { "epoch": 44.76523838261919, "grad_norm": 8.817893028259277, "learning_rate": 1.9999107140133336e-05, "loss": 0.1141, "step": 37110 }, { "epoch": 44.77730838865419, "grad_norm": 7.9366044998168945, "learning_rate": 1.9999106898878542e-05, "loss": 0.1104, "step": 37120 }, { "epoch": 44.789378394689194, "grad_norm": 7.6938676834106445, "learning_rate": 1.999910665762375e-05, "loss": 0.111, "step": 37130 }, { "epoch": 44.8014484007242, "grad_norm": 8.244601249694824, "learning_rate": 1.9999106416368955e-05, "loss": 0.1072, "step": 37140 }, { "epoch": 44.8135184067592, "grad_norm": 
8.331605911254883, "learning_rate": 1.999910617511416e-05, "loss": 0.1136, "step": 37150 }, { "epoch": 44.825588412794204, "grad_norm": 7.44158411026001, "learning_rate": 1.9999105933859367e-05, "loss": 0.1091, "step": 37160 }, { "epoch": 44.83765841882921, "grad_norm": 8.383366584777832, "learning_rate": 1.9999105692604574e-05, "loss": 0.1131, "step": 37170 }, { "epoch": 44.84972842486421, "grad_norm": 8.29902172088623, "learning_rate": 1.999910545134978e-05, "loss": 0.1114, "step": 37180 }, { "epoch": 44.86179843089921, "grad_norm": 8.866331100463867, "learning_rate": 1.9999105210094986e-05, "loss": 0.1131, "step": 37190 }, { "epoch": 44.87386843693422, "grad_norm": 8.557445526123047, "learning_rate": 1.9999104968840192e-05, "loss": 0.1101, "step": 37200 }, { "epoch": 44.88593844296922, "grad_norm": 8.515091896057129, "learning_rate": 1.9999104727585395e-05, "loss": 0.1123, "step": 37210 }, { "epoch": 44.89800844900422, "grad_norm": 7.668406009674072, "learning_rate": 1.99991044863306e-05, "loss": 0.1132, "step": 37220 }, { "epoch": 44.910078455039226, "grad_norm": 8.31334114074707, "learning_rate": 1.9999104245075807e-05, "loss": 0.1138, "step": 37230 }, { "epoch": 44.92214846107423, "grad_norm": 8.23514175415039, "learning_rate": 1.9999104003821014e-05, "loss": 0.1124, "step": 37240 }, { "epoch": 44.93421846710923, "grad_norm": 8.883479118347168, "learning_rate": 1.999910376256622e-05, "loss": 0.1168, "step": 37250 }, { "epoch": 44.946288473144236, "grad_norm": 8.450743675231934, "learning_rate": 1.9999103521311426e-05, "loss": 0.1124, "step": 37260 }, { "epoch": 44.95835847917924, "grad_norm": 8.920717239379883, "learning_rate": 1.9999103280056632e-05, "loss": 0.1131, "step": 37270 }, { "epoch": 44.97042848521424, "grad_norm": 8.981025695800781, "learning_rate": 1.999910303880184e-05, "loss": 0.1138, "step": 37280 }, { "epoch": 44.982498491249245, "grad_norm": 7.889089584350586, "learning_rate": 1.9999102797547045e-05, "loss": 0.1118, "step": 37290 }, { 
"epoch": 44.99456849728425, "grad_norm": 8.375700950622559, "learning_rate": 1.999910255629225e-05, "loss": 0.115, "step": 37300 }, { "epoch": 45.0060350030175, "grad_norm": 6.451115131378174, "learning_rate": 1.9999102315037457e-05, "loss": 0.0961, "step": 37310 }, { "epoch": 45.018105009052505, "grad_norm": 6.936765193939209, "learning_rate": 1.9999102073782663e-05, "loss": 0.0781, "step": 37320 }, { "epoch": 45.03017501508751, "grad_norm": 7.223410606384277, "learning_rate": 1.999910183252787e-05, "loss": 0.0875, "step": 37330 }, { "epoch": 45.04224502112251, "grad_norm": 7.199265480041504, "learning_rate": 1.9999101591273076e-05, "loss": 0.0889, "step": 37340 }, { "epoch": 45.054315027157514, "grad_norm": 7.117958068847656, "learning_rate": 1.9999101350018282e-05, "loss": 0.0827, "step": 37350 }, { "epoch": 45.06638503319252, "grad_norm": 6.734995365142822, "learning_rate": 1.9999101108763488e-05, "loss": 0.0875, "step": 37360 }, { "epoch": 45.07845503922752, "grad_norm": 6.718678951263428, "learning_rate": 1.9999100867508694e-05, "loss": 0.0855, "step": 37370 }, { "epoch": 45.090525045262524, "grad_norm": 7.5206828117370605, "learning_rate": 1.99991006262539e-05, "loss": 0.0838, "step": 37380 }, { "epoch": 45.10259505129753, "grad_norm": 6.643110275268555, "learning_rate": 1.9999100384999107e-05, "loss": 0.088, "step": 37390 }, { "epoch": 45.11466505733253, "grad_norm": 7.348848819732666, "learning_rate": 1.9999100143744313e-05, "loss": 0.0909, "step": 37400 }, { "epoch": 45.12673506336753, "grad_norm": 7.254395484924316, "learning_rate": 1.999909990248952e-05, "loss": 0.0923, "step": 37410 }, { "epoch": 45.13880506940254, "grad_norm": 8.678648948669434, "learning_rate": 1.9999099661234726e-05, "loss": 0.0931, "step": 37420 }, { "epoch": 45.15087507543754, "grad_norm": 7.856507778167725, "learning_rate": 1.9999099419979932e-05, "loss": 0.0927, "step": 37430 }, { "epoch": 45.16294508147254, "grad_norm": 7.523709774017334, "learning_rate": 
1.9999099178725138e-05, "loss": 0.0921, "step": 37440 }, { "epoch": 45.175015087507546, "grad_norm": 7.545246601104736, "learning_rate": 1.9999098937470344e-05, "loss": 0.0922, "step": 37450 }, { "epoch": 45.18708509354255, "grad_norm": 7.440414905548096, "learning_rate": 1.9999098696215547e-05, "loss": 0.0917, "step": 37460 }, { "epoch": 45.19915509957755, "grad_norm": 7.0614776611328125, "learning_rate": 1.9999098454960753e-05, "loss": 0.0902, "step": 37470 }, { "epoch": 45.211225105612556, "grad_norm": 8.001973152160645, "learning_rate": 1.999909821370596e-05, "loss": 0.0942, "step": 37480 }, { "epoch": 45.22329511164756, "grad_norm": 7.855236530303955, "learning_rate": 1.9999097972451166e-05, "loss": 0.094, "step": 37490 }, { "epoch": 45.23536511768256, "grad_norm": 7.813599109649658, "learning_rate": 1.9999097731196372e-05, "loss": 0.0985, "step": 37500 }, { "epoch": 45.23536511768256, "eval_loss": 11.69037914276123, "eval_runtime": 8.1842, "eval_samples_per_second": 85.164, "eval_steps_per_second": 10.752, "step": 37500 }, { "epoch": 45.247435123717565, "grad_norm": 7.853457450866699, "learning_rate": 1.9999097489941578e-05, "loss": 0.0953, "step": 37510 }, { "epoch": 45.25950512975256, "grad_norm": 7.311386585235596, "learning_rate": 1.9999097248686784e-05, "loss": 0.097, "step": 37520 }, { "epoch": 45.271575135787565, "grad_norm": 7.437973499298096, "learning_rate": 1.999909700743199e-05, "loss": 0.095, "step": 37530 }, { "epoch": 45.28364514182257, "grad_norm": 7.856088161468506, "learning_rate": 1.9999096766177197e-05, "loss": 0.0946, "step": 37540 }, { "epoch": 45.29571514785757, "grad_norm": 7.901101112365723, "learning_rate": 1.9999096524922403e-05, "loss": 0.099, "step": 37550 }, { "epoch": 45.307785153892574, "grad_norm": 8.434165954589844, "learning_rate": 1.999909628366761e-05, "loss": 0.101, "step": 37560 }, { "epoch": 45.31985515992758, "grad_norm": 7.2202043533325195, "learning_rate": 1.9999096042412815e-05, "loss": 0.0954, "step": 37570 }, { 
"epoch": 45.33192516596258, "grad_norm": 8.214638710021973, "learning_rate": 1.999909580115802e-05, "loss": 0.0957, "step": 37580 }, { "epoch": 45.343995171997584, "grad_norm": 7.9421162605285645, "learning_rate": 1.9999095559903228e-05, "loss": 0.0988, "step": 37590 }, { "epoch": 45.35606517803259, "grad_norm": 6.977893352508545, "learning_rate": 1.9999095318648434e-05, "loss": 0.0986, "step": 37600 }, { "epoch": 45.36813518406759, "grad_norm": 7.957215785980225, "learning_rate": 1.999909507739364e-05, "loss": 0.0999, "step": 37610 }, { "epoch": 45.38020519010259, "grad_norm": 8.803808212280273, "learning_rate": 1.9999094836138846e-05, "loss": 0.1017, "step": 37620 }, { "epoch": 45.3922751961376, "grad_norm": 7.798969745635986, "learning_rate": 1.9999094594884053e-05, "loss": 0.1009, "step": 37630 }, { "epoch": 45.4043452021726, "grad_norm": 7.674078464508057, "learning_rate": 1.999909435362926e-05, "loss": 0.0984, "step": 37640 }, { "epoch": 45.4164152082076, "grad_norm": 7.534451961517334, "learning_rate": 1.9999094112374465e-05, "loss": 0.0968, "step": 37650 }, { "epoch": 45.428485214242606, "grad_norm": 8.185445785522461, "learning_rate": 1.999909387111967e-05, "loss": 0.0983, "step": 37660 }, { "epoch": 45.44055522027761, "grad_norm": 8.195934295654297, "learning_rate": 1.9999093629864878e-05, "loss": 0.0995, "step": 37670 }, { "epoch": 45.45262522631261, "grad_norm": 7.764647960662842, "learning_rate": 1.9999093388610084e-05, "loss": 0.1015, "step": 37680 }, { "epoch": 45.464695232347616, "grad_norm": 8.105177879333496, "learning_rate": 1.999909314735529e-05, "loss": 0.0991, "step": 37690 }, { "epoch": 45.47676523838262, "grad_norm": 8.27087688446045, "learning_rate": 1.9999092906100496e-05, "loss": 0.1009, "step": 37700 }, { "epoch": 45.48883524441762, "grad_norm": 7.764581203460693, "learning_rate": 1.9999092664845702e-05, "loss": 0.104, "step": 37710 }, { "epoch": 45.500905250452625, "grad_norm": 8.59019660949707, "learning_rate": 1.999909242359091e-05, 
"loss": 0.1044, "step": 37720 }, { "epoch": 45.51297525648763, "grad_norm": 8.735974311828613, "learning_rate": 1.9999092182336115e-05, "loss": 0.1036, "step": 37730 }, { "epoch": 45.52504526252263, "grad_norm": 8.149447441101074, "learning_rate": 1.999909194108132e-05, "loss": 0.104, "step": 37740 }, { "epoch": 45.537115268557635, "grad_norm": 7.665748119354248, "learning_rate": 1.9999091699826527e-05, "loss": 0.1047, "step": 37750 }, { "epoch": 45.54918527459264, "grad_norm": 8.142786979675293, "learning_rate": 1.9999091458571733e-05, "loss": 0.105, "step": 37760 }, { "epoch": 45.56125528062764, "grad_norm": 8.262974739074707, "learning_rate": 1.999909121731694e-05, "loss": 0.1049, "step": 37770 }, { "epoch": 45.573325286662644, "grad_norm": 7.335455417633057, "learning_rate": 1.9999090976062146e-05, "loss": 0.0999, "step": 37780 }, { "epoch": 45.58539529269765, "grad_norm": 8.585466384887695, "learning_rate": 1.9999090734807352e-05, "loss": 0.1048, "step": 37790 }, { "epoch": 45.59746529873265, "grad_norm": 8.354022026062012, "learning_rate": 1.999909049355256e-05, "loss": 0.1068, "step": 37800 }, { "epoch": 45.609535304767654, "grad_norm": 7.834239482879639, "learning_rate": 1.9999090252297765e-05, "loss": 0.1034, "step": 37810 }, { "epoch": 45.62160531080266, "grad_norm": 7.9529829025268555, "learning_rate": 1.999909001104297e-05, "loss": 0.1059, "step": 37820 }, { "epoch": 45.63367531683766, "grad_norm": 7.8213791847229, "learning_rate": 1.9999089769788177e-05, "loss": 0.1064, "step": 37830 }, { "epoch": 45.64574532287266, "grad_norm": 8.180171012878418, "learning_rate": 1.9999089528533383e-05, "loss": 0.1049, "step": 37840 }, { "epoch": 45.65781532890767, "grad_norm": 7.733997344970703, "learning_rate": 1.999908928727859e-05, "loss": 0.1028, "step": 37850 }, { "epoch": 45.66988533494267, "grad_norm": 8.095236778259277, "learning_rate": 1.9999089046023796e-05, "loss": 0.1053, "step": 37860 }, { "epoch": 45.68195534097767, "grad_norm": 8.042702674865723, 
"learning_rate": 1.9999088804769e-05, "loss": 0.1023, "step": 37870 }, { "epoch": 45.694025347012676, "grad_norm": 7.854078769683838, "learning_rate": 1.9999088563514205e-05, "loss": 0.1019, "step": 37880 }, { "epoch": 45.70609535304768, "grad_norm": 7.991125106811523, "learning_rate": 1.999908832225941e-05, "loss": 0.106, "step": 37890 }, { "epoch": 45.71816535908268, "grad_norm": 8.05309009552002, "learning_rate": 1.9999088081004617e-05, "loss": 0.1041, "step": 37900 }, { "epoch": 45.730235365117686, "grad_norm": 7.974242210388184, "learning_rate": 1.9999087839749823e-05, "loss": 0.109, "step": 37910 }, { "epoch": 45.74230537115269, "grad_norm": 7.722018718719482, "learning_rate": 1.999908759849503e-05, "loss": 0.1051, "step": 37920 }, { "epoch": 45.754375377187685, "grad_norm": 7.981302261352539, "learning_rate": 1.9999087357240236e-05, "loss": 0.1072, "step": 37930 }, { "epoch": 45.76644538322269, "grad_norm": 8.635458946228027, "learning_rate": 1.9999087115985442e-05, "loss": 0.1061, "step": 37940 }, { "epoch": 45.77851538925769, "grad_norm": 7.8827996253967285, "learning_rate": 1.9999086874730648e-05, "loss": 0.1071, "step": 37950 }, { "epoch": 45.790585395292695, "grad_norm": 8.14739990234375, "learning_rate": 1.9999086633475854e-05, "loss": 0.106, "step": 37960 }, { "epoch": 45.8026554013277, "grad_norm": 8.133596420288086, "learning_rate": 1.999908639222106e-05, "loss": 0.1129, "step": 37970 }, { "epoch": 45.8147254073627, "grad_norm": 8.422465324401855, "learning_rate": 1.9999086150966267e-05, "loss": 0.1117, "step": 37980 }, { "epoch": 45.826795413397704, "grad_norm": 8.73164176940918, "learning_rate": 1.9999085909711473e-05, "loss": 0.1126, "step": 37990 }, { "epoch": 45.83886541943271, "grad_norm": 8.58842658996582, "learning_rate": 1.999908566845668e-05, "loss": 0.106, "step": 38000 }, { "epoch": 45.83886541943271, "eval_loss": 11.744543075561523, "eval_runtime": 8.1578, "eval_samples_per_second": 85.439, "eval_steps_per_second": 10.787, "step": 38000 
}, { "epoch": 45.85093542546771, "grad_norm": 7.60548734664917, "learning_rate": 1.9999085427201885e-05, "loss": 0.1076, "step": 38010 }, { "epoch": 45.863005431502714, "grad_norm": 8.285138130187988, "learning_rate": 1.9999085185947092e-05, "loss": 0.1094, "step": 38020 }, { "epoch": 45.87507543753772, "grad_norm": 8.336402893066406, "learning_rate": 1.9999084944692298e-05, "loss": 0.1063, "step": 38030 }, { "epoch": 45.88714544357272, "grad_norm": 8.591403007507324, "learning_rate": 1.9999084703437504e-05, "loss": 0.1091, "step": 38040 }, { "epoch": 45.89921544960772, "grad_norm": 8.650285720825195, "learning_rate": 1.999908446218271e-05, "loss": 0.1079, "step": 38050 }, { "epoch": 45.91128545564273, "grad_norm": 8.412415504455566, "learning_rate": 1.9999084220927917e-05, "loss": 0.1157, "step": 38060 }, { "epoch": 45.92335546167773, "grad_norm": 8.009284973144531, "learning_rate": 1.9999083979673123e-05, "loss": 0.109, "step": 38070 }, { "epoch": 45.93542546771273, "grad_norm": 8.50391674041748, "learning_rate": 1.999908373841833e-05, "loss": 0.1074, "step": 38080 }, { "epoch": 45.947495473747736, "grad_norm": 8.48154067993164, "learning_rate": 1.9999083497163535e-05, "loss": 0.1141, "step": 38090 }, { "epoch": 45.95956547978274, "grad_norm": 8.28128719329834, "learning_rate": 1.999908325590874e-05, "loss": 0.1134, "step": 38100 }, { "epoch": 45.97163548581774, "grad_norm": 7.88657808303833, "learning_rate": 1.9999083014653948e-05, "loss": 0.111, "step": 38110 }, { "epoch": 45.983705491852746, "grad_norm": 8.176637649536133, "learning_rate": 1.999908277339915e-05, "loss": 0.1052, "step": 38120 }, { "epoch": 45.99577549788775, "grad_norm": 8.740778923034668, "learning_rate": 1.9999082532144357e-05, "loss": 0.1118, "step": 38130 }, { "epoch": 46.007242003621, "grad_norm": 6.5246901512146, "learning_rate": 1.9999082290889563e-05, "loss": 0.0886, "step": 38140 }, { "epoch": 46.019312009656005, "grad_norm": 7.226640224456787, "learning_rate": 1.999908204963477e-05, 
"loss": 0.0779, "step": 38150 }, { "epoch": 46.03138201569101, "grad_norm": 6.661814212799072, "learning_rate": 1.9999081808379975e-05, "loss": 0.0811, "step": 38160 }, { "epoch": 46.04345202172601, "grad_norm": 7.774762153625488, "learning_rate": 1.999908156712518e-05, "loss": 0.0851, "step": 38170 }, { "epoch": 46.055522027761015, "grad_norm": 6.8034844398498535, "learning_rate": 1.9999081325870388e-05, "loss": 0.0835, "step": 38180 }, { "epoch": 46.06759203379602, "grad_norm": 7.450770854949951, "learning_rate": 1.9999081084615597e-05, "loss": 0.0869, "step": 38190 }, { "epoch": 46.07966203983102, "grad_norm": 7.507908344268799, "learning_rate": 1.9999080843360804e-05, "loss": 0.0869, "step": 38200 }, { "epoch": 46.091732045866024, "grad_norm": 7.487066268920898, "learning_rate": 1.999908060210601e-05, "loss": 0.0871, "step": 38210 }, { "epoch": 46.10380205190103, "grad_norm": 6.384939670562744, "learning_rate": 1.9999080360851216e-05, "loss": 0.0853, "step": 38220 }, { "epoch": 46.11587205793603, "grad_norm": 6.445372581481934, "learning_rate": 1.9999080119596422e-05, "loss": 0.0875, "step": 38230 }, { "epoch": 46.127942063971034, "grad_norm": 7.490885257720947, "learning_rate": 1.999907987834163e-05, "loss": 0.0886, "step": 38240 }, { "epoch": 46.14001207000604, "grad_norm": 7.657497882843018, "learning_rate": 1.9999079637086835e-05, "loss": 0.0867, "step": 38250 }, { "epoch": 46.15208207604104, "grad_norm": 7.291082859039307, "learning_rate": 1.999907939583204e-05, "loss": 0.0883, "step": 38260 }, { "epoch": 46.16415208207604, "grad_norm": 7.435154438018799, "learning_rate": 1.9999079154577247e-05, "loss": 0.0933, "step": 38270 }, { "epoch": 46.17622208811105, "grad_norm": 6.973855495452881, "learning_rate": 1.9999078913322453e-05, "loss": 0.0894, "step": 38280 }, { "epoch": 46.18829209414605, "grad_norm": 6.936178684234619, "learning_rate": 1.9999078672067656e-05, "loss": 0.0931, "step": 38290 }, { "epoch": 46.20036210018105, "grad_norm": 7.709728240966797, 
"learning_rate": 1.9999078430812862e-05, "loss": 0.092, "step": 38300 }, { "epoch": 46.212432106216056, "grad_norm": 7.587975025177002, "learning_rate": 1.999907818955807e-05, "loss": 0.096, "step": 38310 }, { "epoch": 46.22450211225106, "grad_norm": 7.771561622619629, "learning_rate": 1.9999077948303275e-05, "loss": 0.0927, "step": 38320 }, { "epoch": 46.23657211828606, "grad_norm": 7.372857093811035, "learning_rate": 1.999907770704848e-05, "loss": 0.0933, "step": 38330 }, { "epoch": 46.248642124321066, "grad_norm": 7.055661678314209, "learning_rate": 1.9999077465793687e-05, "loss": 0.0923, "step": 38340 }, { "epoch": 46.26071213035606, "grad_norm": 7.309264183044434, "learning_rate": 1.9999077224538893e-05, "loss": 0.0944, "step": 38350 }, { "epoch": 46.272782136391065, "grad_norm": 9.077201843261719, "learning_rate": 1.99990769832841e-05, "loss": 0.0938, "step": 38360 }, { "epoch": 46.28485214242607, "grad_norm": 7.425437927246094, "learning_rate": 1.9999076742029306e-05, "loss": 0.0945, "step": 38370 }, { "epoch": 46.29692214846107, "grad_norm": 7.554510116577148, "learning_rate": 1.9999076500774512e-05, "loss": 0.0946, "step": 38380 }, { "epoch": 46.308992154496075, "grad_norm": 7.5878005027771, "learning_rate": 1.9999076259519718e-05, "loss": 0.0985, "step": 38390 }, { "epoch": 46.32106216053108, "grad_norm": 8.168110847473145, "learning_rate": 1.9999076018264925e-05, "loss": 0.0989, "step": 38400 }, { "epoch": 46.33313216656608, "grad_norm": 7.530923366546631, "learning_rate": 1.999907577701013e-05, "loss": 0.0987, "step": 38410 }, { "epoch": 46.345202172601084, "grad_norm": 7.5215163230896, "learning_rate": 1.9999075535755337e-05, "loss": 0.0951, "step": 38420 }, { "epoch": 46.35727217863609, "grad_norm": 7.156764030456543, "learning_rate": 1.9999075294500543e-05, "loss": 0.0955, "step": 38430 }, { "epoch": 46.36934218467109, "grad_norm": 8.304118156433105, "learning_rate": 1.999907505324575e-05, "loss": 0.0967, "step": 38440 }, { "epoch": 
46.381412190706094, "grad_norm": 8.270943641662598, "learning_rate": 1.9999074811990956e-05, "loss": 0.0987, "step": 38450 }, { "epoch": 46.3934821967411, "grad_norm": 9.060016632080078, "learning_rate": 1.9999074570736162e-05, "loss": 0.0982, "step": 38460 }, { "epoch": 46.4055522027761, "grad_norm": 7.610828399658203, "learning_rate": 1.9999074329481368e-05, "loss": 0.0976, "step": 38470 }, { "epoch": 46.4176222088111, "grad_norm": 7.186120510101318, "learning_rate": 1.9999074088226574e-05, "loss": 0.1004, "step": 38480 }, { "epoch": 46.429692214846106, "grad_norm": 8.821853637695312, "learning_rate": 1.999907384697178e-05, "loss": 0.0981, "step": 38490 }, { "epoch": 46.44176222088111, "grad_norm": 7.316274642944336, "learning_rate": 1.9999073605716987e-05, "loss": 0.0961, "step": 38500 }, { "epoch": 46.44176222088111, "eval_loss": 11.743006706237793, "eval_runtime": 8.1343, "eval_samples_per_second": 85.686, "eval_steps_per_second": 10.818, "step": 38500 }, { "epoch": 46.45383222691611, "grad_norm": 7.115945339202881, "learning_rate": 1.9999073364462193e-05, "loss": 0.0985, "step": 38510 }, { "epoch": 46.465902232951116, "grad_norm": 7.985426902770996, "learning_rate": 1.99990731232074e-05, "loss": 0.0999, "step": 38520 }, { "epoch": 46.47797223898612, "grad_norm": 7.541236877441406, "learning_rate": 1.9999072881952605e-05, "loss": 0.1009, "step": 38530 }, { "epoch": 46.49004224502112, "grad_norm": 7.300419807434082, "learning_rate": 1.9999072640697808e-05, "loss": 0.0958, "step": 38540 }, { "epoch": 46.502112251056126, "grad_norm": 7.435680389404297, "learning_rate": 1.9999072399443014e-05, "loss": 0.098, "step": 38550 }, { "epoch": 46.51418225709113, "grad_norm": 8.25025749206543, "learning_rate": 1.999907215818822e-05, "loss": 0.0998, "step": 38560 }, { "epoch": 46.52625226312613, "grad_norm": 8.239767074584961, "learning_rate": 1.9999071916933427e-05, "loss": 0.1008, "step": 38570 }, { "epoch": 46.538322269161135, "grad_norm": 7.448886871337891, 
"learning_rate": 1.9999071675678633e-05, "loss": 0.102, "step": 38580 }, { "epoch": 46.55039227519614, "grad_norm": 7.723857402801514, "learning_rate": 1.999907143442384e-05, "loss": 0.1014, "step": 38590 }, { "epoch": 46.56246228123114, "grad_norm": 7.74446964263916, "learning_rate": 1.9999071193169045e-05, "loss": 0.1006, "step": 38600 }, { "epoch": 46.574532287266145, "grad_norm": 7.517327308654785, "learning_rate": 1.999907095191425e-05, "loss": 0.1011, "step": 38610 }, { "epoch": 46.58660229330115, "grad_norm": 7.969264507293701, "learning_rate": 1.9999070710659458e-05, "loss": 0.0997, "step": 38620 }, { "epoch": 46.59867229933615, "grad_norm": 7.451340675354004, "learning_rate": 1.9999070469404664e-05, "loss": 0.1026, "step": 38630 }, { "epoch": 46.610742305371154, "grad_norm": 7.9716386795043945, "learning_rate": 1.999907022814987e-05, "loss": 0.1031, "step": 38640 }, { "epoch": 46.62281231140616, "grad_norm": 8.106727600097656, "learning_rate": 1.9999069986895076e-05, "loss": 0.1047, "step": 38650 }, { "epoch": 46.63488231744116, "grad_norm": 7.704501152038574, "learning_rate": 1.9999069745640283e-05, "loss": 0.102, "step": 38660 }, { "epoch": 46.646952323476164, "grad_norm": 7.953787326812744, "learning_rate": 1.999906950438549e-05, "loss": 0.1009, "step": 38670 }, { "epoch": 46.65902232951117, "grad_norm": 8.196562767028809, "learning_rate": 1.9999069263130695e-05, "loss": 0.1039, "step": 38680 }, { "epoch": 46.67109233554617, "grad_norm": 7.586479187011719, "learning_rate": 1.99990690218759e-05, "loss": 0.1039, "step": 38690 }, { "epoch": 46.68316234158117, "grad_norm": 7.370808124542236, "learning_rate": 1.9999068780621108e-05, "loss": 0.1037, "step": 38700 }, { "epoch": 46.69523234761618, "grad_norm": 9.036026000976562, "learning_rate": 1.9999068539366314e-05, "loss": 0.0985, "step": 38710 }, { "epoch": 46.70730235365118, "grad_norm": 7.8997483253479, "learning_rate": 1.999906829811152e-05, "loss": 0.1051, "step": 38720 }, { "epoch": 46.71937235968618, 
"grad_norm": 7.7045488357543945, "learning_rate": 1.9999068056856726e-05, "loss": 0.1008, "step": 38730 }, { "epoch": 46.731442365721186, "grad_norm": 7.948310375213623, "learning_rate": 1.9999067815601932e-05, "loss": 0.1047, "step": 38740 }, { "epoch": 46.74351237175619, "grad_norm": 7.445521354675293, "learning_rate": 1.999906757434714e-05, "loss": 0.1008, "step": 38750 }, { "epoch": 46.755582377791185, "grad_norm": 7.602056980133057, "learning_rate": 1.9999067333092345e-05, "loss": 0.1064, "step": 38760 }, { "epoch": 46.76765238382619, "grad_norm": 8.508523941040039, "learning_rate": 1.999906709183755e-05, "loss": 0.1061, "step": 38770 }, { "epoch": 46.77972238986119, "grad_norm": 8.526290893554688, "learning_rate": 1.9999066850582757e-05, "loss": 0.1059, "step": 38780 }, { "epoch": 46.791792395896195, "grad_norm": 8.185855865478516, "learning_rate": 1.9999066609327964e-05, "loss": 0.106, "step": 38790 }, { "epoch": 46.8038624019312, "grad_norm": 7.827500343322754, "learning_rate": 1.999906636807317e-05, "loss": 0.106, "step": 38800 }, { "epoch": 46.8159324079662, "grad_norm": 7.982572078704834, "learning_rate": 1.9999066126818376e-05, "loss": 0.1072, "step": 38810 }, { "epoch": 46.828002414001205, "grad_norm": 7.627448081970215, "learning_rate": 1.9999065885563582e-05, "loss": 0.1068, "step": 38820 }, { "epoch": 46.84007242003621, "grad_norm": 7.86598014831543, "learning_rate": 1.999906564430879e-05, "loss": 0.1089, "step": 38830 }, { "epoch": 46.85214242607121, "grad_norm": 8.582625389099121, "learning_rate": 1.9999065403053995e-05, "loss": 0.1068, "step": 38840 }, { "epoch": 46.864212432106214, "grad_norm": 8.215176582336426, "learning_rate": 1.99990651617992e-05, "loss": 0.1078, "step": 38850 }, { "epoch": 46.87628243814122, "grad_norm": 8.194639205932617, "learning_rate": 1.9999064920544407e-05, "loss": 0.107, "step": 38860 }, { "epoch": 46.88835244417622, "grad_norm": 7.982693195343018, "learning_rate": 1.9999064679289613e-05, "loss": 0.1076, "step": 
38870 }, { "epoch": 46.900422450211224, "grad_norm": 8.771345138549805, "learning_rate": 1.999906443803482e-05, "loss": 0.107, "step": 38880 }, { "epoch": 46.91249245624623, "grad_norm": 8.083992958068848, "learning_rate": 1.9999064196780026e-05, "loss": 0.1072, "step": 38890 }, { "epoch": 46.92456246228123, "grad_norm": 8.10530948638916, "learning_rate": 1.9999063955525232e-05, "loss": 0.108, "step": 38900 }, { "epoch": 46.93663246831623, "grad_norm": 8.824614524841309, "learning_rate": 1.9999063714270438e-05, "loss": 0.1074, "step": 38910 }, { "epoch": 46.948702474351236, "grad_norm": 7.843191146850586, "learning_rate": 1.9999063473015644e-05, "loss": 0.1072, "step": 38920 }, { "epoch": 46.96077248038624, "grad_norm": 7.849874496459961, "learning_rate": 1.999906323176085e-05, "loss": 0.1101, "step": 38930 }, { "epoch": 46.97284248642124, "grad_norm": 7.859445571899414, "learning_rate": 1.9999062990506057e-05, "loss": 0.1061, "step": 38940 }, { "epoch": 46.984912492456246, "grad_norm": 8.938703536987305, "learning_rate": 1.999906274925126e-05, "loss": 0.1101, "step": 38950 }, { "epoch": 46.99698249849125, "grad_norm": 8.319433212280273, "learning_rate": 1.9999062507996466e-05, "loss": 0.1074, "step": 38960 }, { "epoch": 47.0084490042245, "grad_norm": 6.6417236328125, "learning_rate": 1.9999062266741672e-05, "loss": 0.0805, "step": 38970 }, { "epoch": 47.020519010259505, "grad_norm": 7.1344170570373535, "learning_rate": 1.9999062025486878e-05, "loss": 0.0743, "step": 38980 }, { "epoch": 47.03258901629451, "grad_norm": 7.072699069976807, "learning_rate": 1.9999061784232084e-05, "loss": 0.0762, "step": 38990 }, { "epoch": 47.04465902232951, "grad_norm": 7.696378707885742, "learning_rate": 1.999906154297729e-05, "loss": 0.0827, "step": 39000 }, { "epoch": 47.04465902232951, "eval_loss": 11.751547813415527, "eval_runtime": 8.1284, "eval_samples_per_second": 85.749, "eval_steps_per_second": 10.826, "step": 39000 }, { "epoch": 47.056729028364515, "grad_norm": 
6.811994552612305, "learning_rate": 1.9999061301722497e-05, "loss": 0.0857, "step": 39010 }, { "epoch": 47.06879903439952, "grad_norm": 7.179398536682129, "learning_rate": 1.9999061060467703e-05, "loss": 0.0854, "step": 39020 }, { "epoch": 47.08086904043452, "grad_norm": 7.2742156982421875, "learning_rate": 1.999906081921291e-05, "loss": 0.0845, "step": 39030 }, { "epoch": 47.092939046469525, "grad_norm": 7.459653854370117, "learning_rate": 1.9999060577958116e-05, "loss": 0.0871, "step": 39040 }, { "epoch": 47.10500905250453, "grad_norm": 7.417701721191406, "learning_rate": 1.9999060336703322e-05, "loss": 0.0855, "step": 39050 }, { "epoch": 47.11707905853953, "grad_norm": 7.456145763397217, "learning_rate": 1.9999060095448528e-05, "loss": 0.0881, "step": 39060 }, { "epoch": 47.129149064574534, "grad_norm": 7.639993190765381, "learning_rate": 1.9999059854193734e-05, "loss": 0.087, "step": 39070 }, { "epoch": 47.14121907060954, "grad_norm": 7.6580119132995605, "learning_rate": 1.999905961293894e-05, "loss": 0.0865, "step": 39080 }, { "epoch": 47.15328907664454, "grad_norm": 7.3614349365234375, "learning_rate": 1.9999059371684147e-05, "loss": 0.0917, "step": 39090 }, { "epoch": 47.165359082679544, "grad_norm": 7.49045991897583, "learning_rate": 1.9999059130429353e-05, "loss": 0.0879, "step": 39100 }, { "epoch": 47.17742908871455, "grad_norm": 7.335483074188232, "learning_rate": 1.999905888917456e-05, "loss": 0.0905, "step": 39110 }, { "epoch": 47.18949909474955, "grad_norm": 6.772665977478027, "learning_rate": 1.9999058647919765e-05, "loss": 0.0923, "step": 39120 }, { "epoch": 47.20156910078455, "grad_norm": 7.697539329528809, "learning_rate": 1.999905840666497e-05, "loss": 0.0921, "step": 39130 }, { "epoch": 47.213639106819556, "grad_norm": 6.761911392211914, "learning_rate": 1.9999058165410178e-05, "loss": 0.0889, "step": 39140 }, { "epoch": 47.22570911285456, "grad_norm": 7.497836112976074, "learning_rate": 1.9999057924155384e-05, "loss": 0.0908, "step": 39150 }, { 
"epoch": 47.23777911888956, "grad_norm": 7.37370491027832, "learning_rate": 1.999905768290059e-05, "loss": 0.089, "step": 39160 }, { "epoch": 47.249849124924566, "grad_norm": 7.055605411529541, "learning_rate": 1.9999057441645796e-05, "loss": 0.0913, "step": 39170 }, { "epoch": 47.26191913095956, "grad_norm": 6.894532203674316, "learning_rate": 1.9999057200391003e-05, "loss": 0.0951, "step": 39180 }, { "epoch": 47.273989136994565, "grad_norm": 7.590365409851074, "learning_rate": 1.999905695913621e-05, "loss": 0.0938, "step": 39190 }, { "epoch": 47.28605914302957, "grad_norm": 7.200679302215576, "learning_rate": 1.999905671788141e-05, "loss": 0.0904, "step": 39200 }, { "epoch": 47.29812914906457, "grad_norm": 7.500698089599609, "learning_rate": 1.9999056476626618e-05, "loss": 0.0878, "step": 39210 }, { "epoch": 47.310199155099575, "grad_norm": 7.83240270614624, "learning_rate": 1.9999056235371824e-05, "loss": 0.0916, "step": 39220 }, { "epoch": 47.32226916113458, "grad_norm": 7.210937023162842, "learning_rate": 1.999905599411703e-05, "loss": 0.0932, "step": 39230 }, { "epoch": 47.33433916716958, "grad_norm": 7.618188381195068, "learning_rate": 1.9999055752862236e-05, "loss": 0.0938, "step": 39240 }, { "epoch": 47.346409173204584, "grad_norm": 7.430671691894531, "learning_rate": 1.9999055511607443e-05, "loss": 0.0954, "step": 39250 }, { "epoch": 47.35847917923959, "grad_norm": 8.500646591186523, "learning_rate": 1.999905527035265e-05, "loss": 0.0941, "step": 39260 }, { "epoch": 47.37054918527459, "grad_norm": 7.87017822265625, "learning_rate": 1.999905502909786e-05, "loss": 0.0948, "step": 39270 }, { "epoch": 47.382619191309594, "grad_norm": 8.211577415466309, "learning_rate": 1.9999054787843065e-05, "loss": 0.0966, "step": 39280 }, { "epoch": 47.3946891973446, "grad_norm": 7.387478828430176, "learning_rate": 1.999905454658827e-05, "loss": 0.0963, "step": 39290 }, { "epoch": 47.4067592033796, "grad_norm": 7.694531440734863, "learning_rate": 1.9999054305333477e-05, 
"loss": 0.0963, "step": 39300 }, { "epoch": 47.418829209414604, "grad_norm": 7.225515842437744, "learning_rate": 1.9999054064078683e-05, "loss": 0.0915, "step": 39310 }, { "epoch": 47.43089921544961, "grad_norm": 7.088873386383057, "learning_rate": 1.999905382282389e-05, "loss": 0.0969, "step": 39320 }, { "epoch": 47.44296922148461, "grad_norm": 7.617424011230469, "learning_rate": 1.9999053581569096e-05, "loss": 0.0949, "step": 39330 }, { "epoch": 47.45503922751961, "grad_norm": 7.811732769012451, "learning_rate": 1.9999053340314302e-05, "loss": 0.095, "step": 39340 }, { "epoch": 47.467109233554616, "grad_norm": 8.163633346557617, "learning_rate": 1.9999053099059508e-05, "loss": 0.0969, "step": 39350 }, { "epoch": 47.47917923958962, "grad_norm": 8.397712707519531, "learning_rate": 1.9999052857804714e-05, "loss": 0.0978, "step": 39360 }, { "epoch": 47.49124924562462, "grad_norm": 7.510199546813965, "learning_rate": 1.9999052616549917e-05, "loss": 0.0975, "step": 39370 }, { "epoch": 47.503319251659626, "grad_norm": 7.630013942718506, "learning_rate": 1.9999052375295123e-05, "loss": 0.0952, "step": 39380 }, { "epoch": 47.51538925769463, "grad_norm": 7.479676246643066, "learning_rate": 1.999905213404033e-05, "loss": 0.0964, "step": 39390 }, { "epoch": 47.52745926372963, "grad_norm": 7.4170050621032715, "learning_rate": 1.9999051892785536e-05, "loss": 0.0966, "step": 39400 }, { "epoch": 47.539529269764635, "grad_norm": 7.070496082305908, "learning_rate": 1.9999051651530742e-05, "loss": 0.0985, "step": 39410 }, { "epoch": 47.55159927579964, "grad_norm": 7.723612308502197, "learning_rate": 1.999905141027595e-05, "loss": 0.0976, "step": 39420 }, { "epoch": 47.56366928183464, "grad_norm": 7.236082553863525, "learning_rate": 1.9999051169021155e-05, "loss": 0.0964, "step": 39430 }, { "epoch": 47.575739287869645, "grad_norm": 7.489105224609375, "learning_rate": 1.999905092776636e-05, "loss": 0.0978, "step": 39440 }, { "epoch": 47.58780929390465, "grad_norm": 8.577093124389648, 
"learning_rate": 1.9999050686511567e-05, "loss": 0.0959, "step": 39450 }, { "epoch": 47.59987929993965, "grad_norm": 7.601607799530029, "learning_rate": 1.9999050445256773e-05, "loss": 0.098, "step": 39460 }, { "epoch": 47.611949305974655, "grad_norm": 7.5110764503479, "learning_rate": 1.999905020400198e-05, "loss": 0.0982, "step": 39470 }, { "epoch": 47.62401931200966, "grad_norm": 7.164269924163818, "learning_rate": 1.9999049962747186e-05, "loss": 0.098, "step": 39480 }, { "epoch": 47.63608931804466, "grad_norm": 8.614191055297852, "learning_rate": 1.9999049721492392e-05, "loss": 0.1006, "step": 39490 }, { "epoch": 47.648159324079664, "grad_norm": 8.245658874511719, "learning_rate": 1.9999049480237598e-05, "loss": 0.1001, "step": 39500 }, { "epoch": 47.648159324079664, "eval_loss": 11.785606384277344, "eval_runtime": 8.1257, "eval_samples_per_second": 85.777, "eval_steps_per_second": 10.83, "step": 39500 }, { "epoch": 47.66022933011467, "grad_norm": 7.918700695037842, "learning_rate": 1.9999049238982804e-05, "loss": 0.0979, "step": 39510 }, { "epoch": 47.67229933614967, "grad_norm": 7.932295322418213, "learning_rate": 1.999904899772801e-05, "loss": 0.1032, "step": 39520 }, { "epoch": 47.684369342184674, "grad_norm": 7.814630508422852, "learning_rate": 1.9999048756473217e-05, "loss": 0.1021, "step": 39530 }, { "epoch": 47.69643934821968, "grad_norm": 8.160330772399902, "learning_rate": 1.9999048515218423e-05, "loss": 0.1001, "step": 39540 }, { "epoch": 47.70850935425468, "grad_norm": 7.888943672180176, "learning_rate": 1.999904827396363e-05, "loss": 0.0997, "step": 39550 }, { "epoch": 47.72057936028968, "grad_norm": 7.901956558227539, "learning_rate": 1.9999048032708835e-05, "loss": 0.1034, "step": 39560 }, { "epoch": 47.73264936632469, "grad_norm": 7.581954479217529, "learning_rate": 1.999904779145404e-05, "loss": 0.1042, "step": 39570 }, { "epoch": 47.74471937235969, "grad_norm": 8.128555297851562, "learning_rate": 1.9999047550199248e-05, "loss": 0.103, "step": 
39580 }, { "epoch": 47.756789378394686, "grad_norm": 7.831653118133545, "learning_rate": 1.9999047308944454e-05, "loss": 0.1041, "step": 39590 }, { "epoch": 47.76885938442969, "grad_norm": 8.32129955291748, "learning_rate": 1.999904706768966e-05, "loss": 0.104, "step": 39600 }, { "epoch": 47.78092939046469, "grad_norm": 7.49923849105835, "learning_rate": 1.9999046826434866e-05, "loss": 0.101, "step": 39610 }, { "epoch": 47.792999396499695, "grad_norm": 7.573966979980469, "learning_rate": 1.999904658518007e-05, "loss": 0.1048, "step": 39620 }, { "epoch": 47.8050694025347, "grad_norm": 7.666535377502441, "learning_rate": 1.9999046343925275e-05, "loss": 0.1067, "step": 39630 }, { "epoch": 47.8171394085697, "grad_norm": 7.7968244552612305, "learning_rate": 1.999904610267048e-05, "loss": 0.1034, "step": 39640 }, { "epoch": 47.829209414604705, "grad_norm": 7.437354564666748, "learning_rate": 1.9999045861415688e-05, "loss": 0.1035, "step": 39650 }, { "epoch": 47.84127942063971, "grad_norm": 8.155876159667969, "learning_rate": 1.9999045620160894e-05, "loss": 0.1046, "step": 39660 }, { "epoch": 47.85334942667471, "grad_norm": 8.160113334655762, "learning_rate": 1.99990453789061e-05, "loss": 0.1061, "step": 39670 }, { "epoch": 47.865419432709714, "grad_norm": 7.70035982131958, "learning_rate": 1.9999045137651307e-05, "loss": 0.104, "step": 39680 }, { "epoch": 47.87748943874472, "grad_norm": 7.8333868980407715, "learning_rate": 1.9999044896396513e-05, "loss": 0.1052, "step": 39690 }, { "epoch": 47.88955944477972, "grad_norm": 7.184895038604736, "learning_rate": 1.999904465514172e-05, "loss": 0.1028, "step": 39700 }, { "epoch": 47.901629450814724, "grad_norm": 7.618626594543457, "learning_rate": 1.9999044413886925e-05, "loss": 0.1072, "step": 39710 }, { "epoch": 47.91369945684973, "grad_norm": 8.072918891906738, "learning_rate": 1.999904417263213e-05, "loss": 0.1023, "step": 39720 }, { "epoch": 47.92576946288473, "grad_norm": 7.926357269287109, "learning_rate": 
1.9999043931377338e-05, "loss": 0.1046, "step": 39730 }, { "epoch": 47.937839468919734, "grad_norm": 7.6120924949646, "learning_rate": 1.9999043690122544e-05, "loss": 0.1047, "step": 39740 }, { "epoch": 47.94990947495474, "grad_norm": 7.827085971832275, "learning_rate": 1.999904344886775e-05, "loss": 0.1044, "step": 39750 }, { "epoch": 47.96197948098974, "grad_norm": 7.868283748626709, "learning_rate": 1.9999043207612956e-05, "loss": 0.1039, "step": 39760 }, { "epoch": 47.97404948702474, "grad_norm": 7.345223903656006, "learning_rate": 1.9999042966358162e-05, "loss": 0.1037, "step": 39770 }, { "epoch": 47.986119493059746, "grad_norm": 7.700930118560791, "learning_rate": 1.999904272510337e-05, "loss": 0.1058, "step": 39780 }, { "epoch": 47.99818949909475, "grad_norm": 7.932934284210205, "learning_rate": 1.9999042483848575e-05, "loss": 0.1025, "step": 39790 }, { "epoch": 48.009656004828, "grad_norm": 7.738202095031738, "learning_rate": 1.999904224259378e-05, "loss": 0.0753, "step": 39800 }, { "epoch": 48.021726010863006, "grad_norm": 6.8059234619140625, "learning_rate": 1.9999042001338987e-05, "loss": 0.0754, "step": 39810 }, { "epoch": 48.03379601689801, "grad_norm": 6.8200364112854, "learning_rate": 1.9999041760084194e-05, "loss": 0.0764, "step": 39820 }, { "epoch": 48.04586602293301, "grad_norm": 6.796507358551025, "learning_rate": 1.99990415188294e-05, "loss": 0.0758, "step": 39830 }, { "epoch": 48.057936028968015, "grad_norm": 6.682655334472656, "learning_rate": 1.9999041277574606e-05, "loss": 0.08, "step": 39840 }, { "epoch": 48.07000603500302, "grad_norm": 6.60886812210083, "learning_rate": 1.9999041036319812e-05, "loss": 0.0751, "step": 39850 }, { "epoch": 48.08207604103802, "grad_norm": 6.843984603881836, "learning_rate": 1.999904079506502e-05, "loss": 0.0834, "step": 39860 }, { "epoch": 48.094146047073025, "grad_norm": 7.087650299072266, "learning_rate": 1.9999040553810225e-05, "loss": 0.0816, "step": 39870 }, { "epoch": 48.10621605310803, "grad_norm": 
7.197298049926758, "learning_rate": 1.999904031255543e-05, "loss": 0.0826, "step": 39880 }, { "epoch": 48.11828605914303, "grad_norm": 7.232353687286377, "learning_rate": 1.9999040071300637e-05, "loss": 0.0881, "step": 39890 }, { "epoch": 48.130356065178034, "grad_norm": 7.245333671569824, "learning_rate": 1.9999039830045843e-05, "loss": 0.0841, "step": 39900 }, { "epoch": 48.14242607121304, "grad_norm": 6.656517505645752, "learning_rate": 1.999903958879105e-05, "loss": 0.0861, "step": 39910 }, { "epoch": 48.15449607724804, "grad_norm": 7.351563930511475, "learning_rate": 1.9999039347536256e-05, "loss": 0.0848, "step": 39920 }, { "epoch": 48.166566083283044, "grad_norm": 7.301729679107666, "learning_rate": 1.9999039106281462e-05, "loss": 0.0856, "step": 39930 }, { "epoch": 48.17863608931805, "grad_norm": 6.989707946777344, "learning_rate": 1.9999038865026668e-05, "loss": 0.0885, "step": 39940 }, { "epoch": 48.19070609535305, "grad_norm": 6.5776448249816895, "learning_rate": 1.9999038623771874e-05, "loss": 0.0856, "step": 39950 }, { "epoch": 48.202776101388054, "grad_norm": 6.922290802001953, "learning_rate": 1.999903838251708e-05, "loss": 0.0881, "step": 39960 }, { "epoch": 48.21484610742306, "grad_norm": 7.733505725860596, "learning_rate": 1.9999038141262287e-05, "loss": 0.0881, "step": 39970 }, { "epoch": 48.22691611345806, "grad_norm": 7.340127468109131, "learning_rate": 1.9999037900007493e-05, "loss": 0.0883, "step": 39980 }, { "epoch": 48.23898611949306, "grad_norm": 7.175323486328125, "learning_rate": 1.99990376587527e-05, "loss": 0.0848, "step": 39990 }, { "epoch": 48.251056125528066, "grad_norm": 7.680735111236572, "learning_rate": 1.9999037417497905e-05, "loss": 0.0874, "step": 40000 }, { "epoch": 48.251056125528066, "eval_loss": 11.797197341918945, "eval_runtime": 8.1316, "eval_samples_per_second": 85.715, "eval_steps_per_second": 10.822, "step": 40000 }, { "epoch": 48.26312613156306, "grad_norm": 7.933058261871338, "learning_rate": 1.999903717624311e-05, 
"loss": 0.0887, "step": 40010 }, { "epoch": 48.275196137598066, "grad_norm": 7.125622272491455, "learning_rate": 1.9999036934988318e-05, "loss": 0.0875, "step": 40020 }, { "epoch": 48.28726614363307, "grad_norm": 8.040776252746582, "learning_rate": 1.999903669373352e-05, "loss": 0.0906, "step": 40030 }, { "epoch": 48.29933614966807, "grad_norm": 6.781001567840576, "learning_rate": 1.9999036452478727e-05, "loss": 0.0918, "step": 40040 }, { "epoch": 48.311406155703075, "grad_norm": 7.306427478790283, "learning_rate": 1.9999036211223933e-05, "loss": 0.0925, "step": 40050 }, { "epoch": 48.32347616173808, "grad_norm": 6.8687520027160645, "learning_rate": 1.999903596996914e-05, "loss": 0.088, "step": 40060 }, { "epoch": 48.33554616777308, "grad_norm": 6.785318374633789, "learning_rate": 1.9999035728714346e-05, "loss": 0.0909, "step": 40070 }, { "epoch": 48.347616173808085, "grad_norm": 8.387290000915527, "learning_rate": 1.9999035487459552e-05, "loss": 0.0949, "step": 40080 }, { "epoch": 48.35968617984309, "grad_norm": 7.053259372711182, "learning_rate": 1.9999035246204758e-05, "loss": 0.0914, "step": 40090 }, { "epoch": 48.37175618587809, "grad_norm": 7.742259502410889, "learning_rate": 1.9999035004949964e-05, "loss": 0.0916, "step": 40100 }, { "epoch": 48.383826191913094, "grad_norm": 7.5678839683532715, "learning_rate": 1.999903476369517e-05, "loss": 0.0899, "step": 40110 }, { "epoch": 48.3958961979481, "grad_norm": 7.129518508911133, "learning_rate": 1.9999034522440377e-05, "loss": 0.0937, "step": 40120 }, { "epoch": 48.4079662039831, "grad_norm": 8.209439277648926, "learning_rate": 1.9999034281185583e-05, "loss": 0.0924, "step": 40130 }, { "epoch": 48.420036210018104, "grad_norm": 7.225387096405029, "learning_rate": 1.999903403993079e-05, "loss": 0.0961, "step": 40140 }, { "epoch": 48.43210621605311, "grad_norm": 7.797441005706787, "learning_rate": 1.9999033798675995e-05, "loss": 0.0949, "step": 40150 }, { "epoch": 48.44417622208811, "grad_norm": 7.028262615203857, 
"learning_rate": 1.99990335574212e-05, "loss": 0.0978, "step": 40160 }, { "epoch": 48.45624622812311, "grad_norm": 7.925105094909668, "learning_rate": 1.9999033316166408e-05, "loss": 0.0919, "step": 40170 }, { "epoch": 48.46831623415812, "grad_norm": 7.9377217292785645, "learning_rate": 1.9999033074911614e-05, "loss": 0.0966, "step": 40180 }, { "epoch": 48.48038624019312, "grad_norm": 7.848637104034424, "learning_rate": 1.999903283365682e-05, "loss": 0.0971, "step": 40190 }, { "epoch": 48.49245624622812, "grad_norm": 7.5229363441467285, "learning_rate": 1.9999032592402026e-05, "loss": 0.0964, "step": 40200 }, { "epoch": 48.504526252263126, "grad_norm": 7.247541904449463, "learning_rate": 1.9999032351147233e-05, "loss": 0.0957, "step": 40210 }, { "epoch": 48.51659625829813, "grad_norm": 6.982822418212891, "learning_rate": 1.999903210989244e-05, "loss": 0.0953, "step": 40220 }, { "epoch": 48.52866626433313, "grad_norm": 7.9095611572265625, "learning_rate": 1.9999031868637645e-05, "loss": 0.0952, "step": 40230 }, { "epoch": 48.540736270368136, "grad_norm": 8.175434112548828, "learning_rate": 1.999903162738285e-05, "loss": 0.0949, "step": 40240 }, { "epoch": 48.55280627640314, "grad_norm": 7.930110454559326, "learning_rate": 1.9999031386128057e-05, "loss": 0.0976, "step": 40250 }, { "epoch": 48.56487628243814, "grad_norm": 7.43308162689209, "learning_rate": 1.9999031144873264e-05, "loss": 0.0986, "step": 40260 }, { "epoch": 48.576946288473145, "grad_norm": 8.414320945739746, "learning_rate": 1.999903090361847e-05, "loss": 0.0953, "step": 40270 }, { "epoch": 48.58901629450815, "grad_norm": 7.888746738433838, "learning_rate": 1.9999030662363673e-05, "loss": 0.1003, "step": 40280 }, { "epoch": 48.60108630054315, "grad_norm": 7.6700005531311035, "learning_rate": 1.999903042110888e-05, "loss": 0.0965, "step": 40290 }, { "epoch": 48.613156306578155, "grad_norm": 7.131819725036621, "learning_rate": 1.9999030179854085e-05, "loss": 0.0939, "step": 40300 }, { "epoch": 
48.62522631261316, "grad_norm": 7.897426128387451, "learning_rate": 1.999902993859929e-05, "loss": 0.0989, "step": 40310 }, { "epoch": 48.63729631864816, "grad_norm": 7.837087154388428, "learning_rate": 1.9999029697344498e-05, "loss": 0.0983, "step": 40320 }, { "epoch": 48.649366324683164, "grad_norm": 7.999877452850342, "learning_rate": 1.9999029456089704e-05, "loss": 0.1001, "step": 40330 }, { "epoch": 48.66143633071817, "grad_norm": 7.18195915222168, "learning_rate": 1.999902921483491e-05, "loss": 0.0974, "step": 40340 }, { "epoch": 48.67350633675317, "grad_norm": 7.298947334289551, "learning_rate": 1.999902897358012e-05, "loss": 0.0971, "step": 40350 }, { "epoch": 48.685576342788174, "grad_norm": 7.569996356964111, "learning_rate": 1.9999028732325326e-05, "loss": 0.0993, "step": 40360 }, { "epoch": 48.69764634882318, "grad_norm": 7.5630364418029785, "learning_rate": 1.9999028491070532e-05, "loss": 0.0999, "step": 40370 }, { "epoch": 48.70971635485818, "grad_norm": 7.246062278747559, "learning_rate": 1.9999028249815738e-05, "loss": 0.0982, "step": 40380 }, { "epoch": 48.721786360893184, "grad_norm": 8.006025314331055, "learning_rate": 1.9999028008560944e-05, "loss": 0.102, "step": 40390 }, { "epoch": 48.73385636692819, "grad_norm": 7.306941509246826, "learning_rate": 1.999902776730615e-05, "loss": 0.0963, "step": 40400 }, { "epoch": 48.74592637296319, "grad_norm": 7.722687244415283, "learning_rate": 1.9999027526051357e-05, "loss": 0.0951, "step": 40410 }, { "epoch": 48.757996378998186, "grad_norm": 7.37482213973999, "learning_rate": 1.9999027284796563e-05, "loss": 0.0992, "step": 40420 }, { "epoch": 48.77006638503319, "grad_norm": 8.359506607055664, "learning_rate": 1.999902704354177e-05, "loss": 0.0989, "step": 40430 }, { "epoch": 48.78213639106819, "grad_norm": 7.881369113922119, "learning_rate": 1.9999026802286972e-05, "loss": 0.1026, "step": 40440 }, { "epoch": 48.794206397103196, "grad_norm": 7.06877326965332, "learning_rate": 1.999902656103218e-05, "loss": 
0.0996, "step": 40450 }, { "epoch": 48.8062764031382, "grad_norm": 8.580363273620605, "learning_rate": 1.9999026319777385e-05, "loss": 0.0981, "step": 40460 }, { "epoch": 48.8183464091732, "grad_norm": 8.4296293258667, "learning_rate": 1.999902607852259e-05, "loss": 0.1003, "step": 40470 }, { "epoch": 48.830416415208205, "grad_norm": 8.150676727294922, "learning_rate": 1.9999025837267797e-05, "loss": 0.0991, "step": 40480 }, { "epoch": 48.84248642124321, "grad_norm": 7.6096625328063965, "learning_rate": 1.9999025596013003e-05, "loss": 0.0992, "step": 40490 }, { "epoch": 48.85455642727821, "grad_norm": 7.707493305206299, "learning_rate": 1.999902535475821e-05, "loss": 0.1006, "step": 40500 }, { "epoch": 48.85455642727821, "eval_loss": 11.828109741210938, "eval_runtime": 8.1243, "eval_samples_per_second": 85.792, "eval_steps_per_second": 10.832, "step": 40500 }, { "epoch": 48.866626433313215, "grad_norm": 7.765162944793701, "learning_rate": 1.9999025113503416e-05, "loss": 0.1003, "step": 40510 }, { "epoch": 48.87869643934822, "grad_norm": 7.713868141174316, "learning_rate": 1.9999024872248622e-05, "loss": 0.0998, "step": 40520 }, { "epoch": 48.89076644538322, "grad_norm": 8.060853004455566, "learning_rate": 1.9999024630993828e-05, "loss": 0.1028, "step": 40530 }, { "epoch": 48.902836451418224, "grad_norm": 7.270421028137207, "learning_rate": 1.9999024389739034e-05, "loss": 0.1025, "step": 40540 }, { "epoch": 48.91490645745323, "grad_norm": 7.987793445587158, "learning_rate": 1.999902414848424e-05, "loss": 0.1042, "step": 40550 }, { "epoch": 48.92697646348823, "grad_norm": 8.187826156616211, "learning_rate": 1.9999023907229447e-05, "loss": 0.1021, "step": 40560 }, { "epoch": 48.939046469523234, "grad_norm": 7.590134143829346, "learning_rate": 1.9999023665974653e-05, "loss": 0.1009, "step": 40570 }, { "epoch": 48.95111647555824, "grad_norm": 8.046571731567383, "learning_rate": 1.999902342471986e-05, "loss": 0.1013, "step": 40580 }, { "epoch": 48.96318648159324, 
"grad_norm": 7.021590709686279, "learning_rate": 1.9999023183465065e-05, "loss": 0.1042, "step": 40590 }, { "epoch": 48.97525648762824, "grad_norm": 7.644413948059082, "learning_rate": 1.999902294221027e-05, "loss": 0.1057, "step": 40600 }, { "epoch": 48.98732649366325, "grad_norm": 7.7888994216918945, "learning_rate": 1.9999022700955478e-05, "loss": 0.1042, "step": 40610 }, { "epoch": 48.99939649969825, "grad_norm": 8.050567626953125, "learning_rate": 1.9999022459700684e-05, "loss": 0.1031, "step": 40620 }, { "epoch": 49.0108630054315, "grad_norm": 6.302297592163086, "learning_rate": 1.999902221844589e-05, "loss": 0.0717, "step": 40630 }, { "epoch": 49.022933011466506, "grad_norm": 6.233858585357666, "learning_rate": 1.9999021977191096e-05, "loss": 0.0737, "step": 40640 }, { "epoch": 49.03500301750151, "grad_norm": 6.344173431396484, "learning_rate": 1.9999021735936303e-05, "loss": 0.0746, "step": 40650 }, { "epoch": 49.04707302353651, "grad_norm": 7.0706048011779785, "learning_rate": 1.999902149468151e-05, "loss": 0.076, "step": 40660 }, { "epoch": 49.059143029571516, "grad_norm": 6.993434429168701, "learning_rate": 1.9999021253426715e-05, "loss": 0.0787, "step": 40670 }, { "epoch": 49.07121303560652, "grad_norm": 7.3582353591918945, "learning_rate": 1.999902101217192e-05, "loss": 0.0796, "step": 40680 }, { "epoch": 49.08328304164152, "grad_norm": 7.049600124359131, "learning_rate": 1.9999020770917124e-05, "loss": 0.0818, "step": 40690 }, { "epoch": 49.095353047676525, "grad_norm": 6.698635578155518, "learning_rate": 1.999902052966233e-05, "loss": 0.0794, "step": 40700 }, { "epoch": 49.10742305371153, "grad_norm": 6.116359710693359, "learning_rate": 1.9999020288407537e-05, "loss": 0.0824, "step": 40710 }, { "epoch": 49.11949305974653, "grad_norm": 6.9518890380859375, "learning_rate": 1.9999020047152743e-05, "loss": 0.0879, "step": 40720 }, { "epoch": 49.131563065781535, "grad_norm": 7.24831485748291, "learning_rate": 1.999901980589795e-05, "loss": 0.0795, "step": 
40730 }, { "epoch": 49.14363307181654, "grad_norm": 6.788149833679199, "learning_rate": 1.9999019564643155e-05, "loss": 0.0808, "step": 40740 }, { "epoch": 49.15570307785154, "grad_norm": 6.962085723876953, "learning_rate": 1.999901932338836e-05, "loss": 0.083, "step": 40750 }, { "epoch": 49.167773083886544, "grad_norm": 7.301023483276367, "learning_rate": 1.9999019082133568e-05, "loss": 0.0816, "step": 40760 }, { "epoch": 49.17984308992155, "grad_norm": 7.577390193939209, "learning_rate": 1.9999018840878774e-05, "loss": 0.0876, "step": 40770 }, { "epoch": 49.19191309595655, "grad_norm": 7.300945281982422, "learning_rate": 1.999901859962398e-05, "loss": 0.085, "step": 40780 }, { "epoch": 49.203983101991554, "grad_norm": 7.300275802612305, "learning_rate": 1.9999018358369186e-05, "loss": 0.0888, "step": 40790 }, { "epoch": 49.21605310802656, "grad_norm": 7.751604080200195, "learning_rate": 1.9999018117114392e-05, "loss": 0.0904, "step": 40800 }, { "epoch": 49.22812311406156, "grad_norm": 7.878165245056152, "learning_rate": 1.99990178758596e-05, "loss": 0.0845, "step": 40810 }, { "epoch": 49.24019312009656, "grad_norm": 7.549356937408447, "learning_rate": 1.9999017634604805e-05, "loss": 0.0862, "step": 40820 }, { "epoch": 49.25226312613156, "grad_norm": 7.47953987121582, "learning_rate": 1.999901739335001e-05, "loss": 0.0869, "step": 40830 }, { "epoch": 49.26433313216656, "grad_norm": 7.507737636566162, "learning_rate": 1.9999017152095217e-05, "loss": 0.088, "step": 40840 }, { "epoch": 49.276403138201566, "grad_norm": 7.5186076164245605, "learning_rate": 1.9999016910840424e-05, "loss": 0.0882, "step": 40850 }, { "epoch": 49.28847314423657, "grad_norm": 7.891443252563477, "learning_rate": 1.999901666958563e-05, "loss": 0.0883, "step": 40860 }, { "epoch": 49.30054315027157, "grad_norm": 7.985027313232422, "learning_rate": 1.9999016428330836e-05, "loss": 0.0902, "step": 40870 }, { "epoch": 49.312613156306575, "grad_norm": 7.538743495941162, "learning_rate": 
1.9999016187076042e-05, "loss": 0.0897, "step": 40880 }, { "epoch": 49.32468316234158, "grad_norm": 7.45040225982666, "learning_rate": 1.999901594582125e-05, "loss": 0.0922, "step": 40890 }, { "epoch": 49.33675316837658, "grad_norm": 6.993686199188232, "learning_rate": 1.9999015704566455e-05, "loss": 0.0921, "step": 40900 }, { "epoch": 49.348823174411585, "grad_norm": 7.736401557922363, "learning_rate": 1.999901546331166e-05, "loss": 0.0895, "step": 40910 }, { "epoch": 49.36089318044659, "grad_norm": 6.990508556365967, "learning_rate": 1.9999015222056867e-05, "loss": 0.0912, "step": 40920 }, { "epoch": 49.37296318648159, "grad_norm": 7.685385227203369, "learning_rate": 1.9999014980802073e-05, "loss": 0.0915, "step": 40930 }, { "epoch": 49.385033192516595, "grad_norm": 7.269428253173828, "learning_rate": 1.999901473954728e-05, "loss": 0.0882, "step": 40940 }, { "epoch": 49.3971031985516, "grad_norm": 7.542537689208984, "learning_rate": 1.9999014498292486e-05, "loss": 0.09, "step": 40950 }, { "epoch": 49.4091732045866, "grad_norm": 6.707280158996582, "learning_rate": 1.9999014257037692e-05, "loss": 0.0893, "step": 40960 }, { "epoch": 49.421243210621604, "grad_norm": 8.230257034301758, "learning_rate": 1.9999014015782898e-05, "loss": 0.0885, "step": 40970 }, { "epoch": 49.43331321665661, "grad_norm": 6.946557521820068, "learning_rate": 1.9999013774528104e-05, "loss": 0.0916, "step": 40980 }, { "epoch": 49.44538322269161, "grad_norm": 7.145899772644043, "learning_rate": 1.999901353327331e-05, "loss": 0.0933, "step": 40990 }, { "epoch": 49.457453228726614, "grad_norm": 7.594334125518799, "learning_rate": 1.9999013292018517e-05, "loss": 0.0895, "step": 41000 }, { "epoch": 49.457453228726614, "eval_loss": 11.82146167755127, "eval_runtime": 8.1309, "eval_samples_per_second": 85.722, "eval_steps_per_second": 10.823, "step": 41000 }, { "epoch": 49.46952323476162, "grad_norm": 7.5774335861206055, "learning_rate": 1.9999013050763723e-05, "loss": 0.0918, "step": 41010 }, { 
"epoch": 49.48159324079662, "grad_norm": 8.207365036010742, "learning_rate": 1.999901280950893e-05, "loss": 0.0936, "step": 41020 }, { "epoch": 49.49366324683162, "grad_norm": 7.343356609344482, "learning_rate": 1.9999012568254135e-05, "loss": 0.0947, "step": 41030 }, { "epoch": 49.50573325286663, "grad_norm": 7.512302398681641, "learning_rate": 1.999901232699934e-05, "loss": 0.0958, "step": 41040 }, { "epoch": 49.51780325890163, "grad_norm": 7.518713474273682, "learning_rate": 1.9999012085744548e-05, "loss": 0.0958, "step": 41050 }, { "epoch": 49.52987326493663, "grad_norm": 6.992223739624023, "learning_rate": 1.9999011844489754e-05, "loss": 0.0929, "step": 41060 }, { "epoch": 49.541943270971636, "grad_norm": 8.24531078338623, "learning_rate": 1.999901160323496e-05, "loss": 0.0903, "step": 41070 }, { "epoch": 49.55401327700664, "grad_norm": 7.245900630950928, "learning_rate": 1.9999011361980167e-05, "loss": 0.0938, "step": 41080 }, { "epoch": 49.56608328304164, "grad_norm": 7.352336406707764, "learning_rate": 1.9999011120725373e-05, "loss": 0.0941, "step": 41090 }, { "epoch": 49.578153289076646, "grad_norm": 7.1685872077941895, "learning_rate": 1.999901087947058e-05, "loss": 0.0941, "step": 41100 }, { "epoch": 49.59022329511165, "grad_norm": 7.561819553375244, "learning_rate": 1.9999010638215782e-05, "loss": 0.0956, "step": 41110 }, { "epoch": 49.60229330114665, "grad_norm": 7.396695613861084, "learning_rate": 1.9999010396960988e-05, "loss": 0.0975, "step": 41120 }, { "epoch": 49.614363307181655, "grad_norm": 7.895127773284912, "learning_rate": 1.9999010155706194e-05, "loss": 0.0981, "step": 41130 }, { "epoch": 49.62643331321666, "grad_norm": 7.562243461608887, "learning_rate": 1.99990099144514e-05, "loss": 0.0996, "step": 41140 }, { "epoch": 49.63850331925166, "grad_norm": 7.458862781524658, "learning_rate": 1.9999009673196607e-05, "loss": 0.0973, "step": 41150 }, { "epoch": 49.650573325286665, "grad_norm": 7.899185657501221, "learning_rate": 
1.9999009431941813e-05, "loss": 0.0959, "step": 41160 }, { "epoch": 49.66264333132167, "grad_norm": 8.947956085205078, "learning_rate": 1.999900919068702e-05, "loss": 0.098, "step": 41170 }, { "epoch": 49.67471333735667, "grad_norm": 7.4821038246154785, "learning_rate": 1.9999008949432225e-05, "loss": 0.0989, "step": 41180 }, { "epoch": 49.686783343391674, "grad_norm": 7.757851600646973, "learning_rate": 1.999900870817743e-05, "loss": 0.0975, "step": 41190 }, { "epoch": 49.69885334942668, "grad_norm": 7.443010330200195, "learning_rate": 1.9999008466922638e-05, "loss": 0.0953, "step": 41200 }, { "epoch": 49.71092335546168, "grad_norm": 7.199835777282715, "learning_rate": 1.9999008225667844e-05, "loss": 0.0962, "step": 41210 }, { "epoch": 49.722993361496684, "grad_norm": 7.883698463439941, "learning_rate": 1.999900798441305e-05, "loss": 0.1001, "step": 41220 }, { "epoch": 49.73506336753169, "grad_norm": 7.289809703826904, "learning_rate": 1.9999007743158256e-05, "loss": 0.0994, "step": 41230 }, { "epoch": 49.74713337356668, "grad_norm": 7.798990249633789, "learning_rate": 1.9999007501903463e-05, "loss": 0.0958, "step": 41240 }, { "epoch": 49.759203379601686, "grad_norm": 7.510777473449707, "learning_rate": 1.999900726064867e-05, "loss": 0.0974, "step": 41250 }, { "epoch": 49.77127338563669, "grad_norm": 8.66215705871582, "learning_rate": 1.9999007019393875e-05, "loss": 0.0998, "step": 41260 }, { "epoch": 49.78334339167169, "grad_norm": 8.959178924560547, "learning_rate": 1.999900677813908e-05, "loss": 0.0987, "step": 41270 }, { "epoch": 49.795413397706696, "grad_norm": 7.981107234954834, "learning_rate": 1.9999006536884287e-05, "loss": 0.0981, "step": 41280 }, { "epoch": 49.8074834037417, "grad_norm": 7.780637264251709, "learning_rate": 1.9999006295629494e-05, "loss": 0.103, "step": 41290 }, { "epoch": 49.8195534097767, "grad_norm": 7.37398624420166, "learning_rate": 1.99990060543747e-05, "loss": 0.102, "step": 41300 }, { "epoch": 49.831623415811706, "grad_norm": 
6.928779125213623, "learning_rate": 1.9999005813119906e-05, "loss": 0.0958, "step": 41310 }, { "epoch": 49.84369342184671, "grad_norm": 7.928514003753662, "learning_rate": 1.9999005571865112e-05, "loss": 0.102, "step": 41320 }, { "epoch": 49.85576342788171, "grad_norm": 7.169568061828613, "learning_rate": 1.999900533061032e-05, "loss": 0.1009, "step": 41330 }, { "epoch": 49.867833433916715, "grad_norm": 7.676843643188477, "learning_rate": 1.9999005089355525e-05, "loss": 0.0954, "step": 41340 }, { "epoch": 49.87990343995172, "grad_norm": 8.562459945678711, "learning_rate": 1.999900484810073e-05, "loss": 0.0969, "step": 41350 }, { "epoch": 49.89197344598672, "grad_norm": 7.4752326011657715, "learning_rate": 1.9999004606845934e-05, "loss": 0.0973, "step": 41360 }, { "epoch": 49.904043452021725, "grad_norm": 7.315005302429199, "learning_rate": 1.999900436559114e-05, "loss": 0.0999, "step": 41370 }, { "epoch": 49.91611345805673, "grad_norm": 7.940873146057129, "learning_rate": 1.9999004124336346e-05, "loss": 0.0992, "step": 41380 }, { "epoch": 49.92818346409173, "grad_norm": 7.425535678863525, "learning_rate": 1.9999003883081552e-05, "loss": 0.0997, "step": 41390 }, { "epoch": 49.940253470126734, "grad_norm": 7.609869480133057, "learning_rate": 1.999900364182676e-05, "loss": 0.1003, "step": 41400 }, { "epoch": 49.95232347616174, "grad_norm": 7.664175510406494, "learning_rate": 1.9999003400571965e-05, "loss": 0.1022, "step": 41410 }, { "epoch": 49.96439348219674, "grad_norm": 8.104321479797363, "learning_rate": 1.999900315931717e-05, "loss": 0.1005, "step": 41420 }, { "epoch": 49.976463488231744, "grad_norm": 8.076833724975586, "learning_rate": 1.999900291806238e-05, "loss": 0.1018, "step": 41430 }, { "epoch": 49.98853349426675, "grad_norm": 8.044425964355469, "learning_rate": 1.9999002676807587e-05, "loss": 0.1029, "step": 41440 }, { "epoch": 50.0, "grad_norm": 12.945068359375, "learning_rate": 1.9999002435552793e-05, "loss": 0.1036, "step": 41450 }, { "epoch": 
50.012070006035, "grad_norm": 6.317172527313232, "learning_rate": 1.9999002194298e-05, "loss": 0.0695, "step": 41460 }, { "epoch": 50.024140012070006, "grad_norm": 6.775774002075195, "learning_rate": 1.9999001953043206e-05, "loss": 0.0729, "step": 41470 }, { "epoch": 50.03621001810501, "grad_norm": 6.579026222229004, "learning_rate": 1.9999001711788412e-05, "loss": 0.0769, "step": 41480 }, { "epoch": 50.04828002414001, "grad_norm": 7.096086502075195, "learning_rate": 1.9999001470533618e-05, "loss": 0.0749, "step": 41490 }, { "epoch": 50.060350030175016, "grad_norm": 6.369539737701416, "learning_rate": 1.9999001229278824e-05, "loss": 0.074, "step": 41500 }, { "epoch": 50.060350030175016, "eval_loss": 11.850563049316406, "eval_runtime": 8.1271, "eval_samples_per_second": 85.762, "eval_steps_per_second": 10.828, "step": 41500 }, { "epoch": 50.07242003621002, "grad_norm": 6.551559925079346, "learning_rate": 1.999900098802403e-05, "loss": 0.0774, "step": 41510 }, { "epoch": 50.08449004224502, "grad_norm": 6.984485149383545, "learning_rate": 1.9999000746769233e-05, "loss": 0.0758, "step": 41520 }, { "epoch": 50.096560048280026, "grad_norm": 6.764638423919678, "learning_rate": 1.999900050551444e-05, "loss": 0.0794, "step": 41530 }, { "epoch": 50.10863005431503, "grad_norm": 6.708288192749023, "learning_rate": 1.9999000264259646e-05, "loss": 0.0834, "step": 41540 }, { "epoch": 50.12070006035003, "grad_norm": 6.975231647491455, "learning_rate": 1.9999000023004852e-05, "loss": 0.0819, "step": 41550 }, { "epoch": 50.132770066385035, "grad_norm": 7.128840923309326, "learning_rate": 1.9998999781750058e-05, "loss": 0.0801, "step": 41560 }, { "epoch": 50.14484007242004, "grad_norm": 6.8460612297058105, "learning_rate": 1.9998999540495264e-05, "loss": 0.0844, "step": 41570 }, { "epoch": 50.15691007845504, "grad_norm": 6.806500434875488, "learning_rate": 1.999899929924047e-05, "loss": 0.0809, "step": 41580 }, { "epoch": 50.168980084490045, "grad_norm": 6.840192794799805, 
"learning_rate": 1.9998999057985677e-05, "loss": 0.0812, "step": 41590 }, { "epoch": 50.18105009052505, "grad_norm": 6.755086421966553, "learning_rate": 1.9998998816730883e-05, "loss": 0.0848, "step": 41600 }, { "epoch": 50.19312009656005, "grad_norm": 7.035580158233643, "learning_rate": 1.999899857547609e-05, "loss": 0.0857, "step": 41610 }, { "epoch": 50.205190102595054, "grad_norm": 7.5524773597717285, "learning_rate": 1.9998998334221295e-05, "loss": 0.0845, "step": 41620 }, { "epoch": 50.21726010863006, "grad_norm": 7.534791469573975, "learning_rate": 1.99989980929665e-05, "loss": 0.0849, "step": 41630 }, { "epoch": 50.22933011466506, "grad_norm": 6.735876083374023, "learning_rate": 1.9998997851711708e-05, "loss": 0.0855, "step": 41640 }, { "epoch": 50.241400120700064, "grad_norm": 7.6889753341674805, "learning_rate": 1.9998997610456914e-05, "loss": 0.0876, "step": 41650 }, { "epoch": 50.25347012673506, "grad_norm": 6.810657024383545, "learning_rate": 1.999899736920212e-05, "loss": 0.0864, "step": 41660 }, { "epoch": 50.26554013277006, "grad_norm": 7.756162166595459, "learning_rate": 1.9998997127947326e-05, "loss": 0.086, "step": 41670 }, { "epoch": 50.277610138805066, "grad_norm": 6.841807842254639, "learning_rate": 1.9998996886692533e-05, "loss": 0.0888, "step": 41680 }, { "epoch": 50.28968014484007, "grad_norm": 6.971548557281494, "learning_rate": 1.999899664543774e-05, "loss": 0.0874, "step": 41690 }, { "epoch": 50.30175015087507, "grad_norm": 7.46088981628418, "learning_rate": 1.9998996404182945e-05, "loss": 0.0886, "step": 41700 }, { "epoch": 50.313820156910076, "grad_norm": 7.828610420227051, "learning_rate": 1.999899616292815e-05, "loss": 0.0873, "step": 41710 }, { "epoch": 50.32589016294508, "grad_norm": 7.65648078918457, "learning_rate": 1.9998995921673358e-05, "loss": 0.0909, "step": 41720 }, { "epoch": 50.33796016898008, "grad_norm": 7.093748569488525, "learning_rate": 1.9998995680418564e-05, "loss": 0.0881, "step": 41730 }, { "epoch": 
50.350030175015085, "grad_norm": 7.230619430541992, "learning_rate": 1.999899543916377e-05, "loss": 0.0904, "step": 41740 }, { "epoch": 50.36210018105009, "grad_norm": 7.452841281890869, "learning_rate": 1.9998995197908976e-05, "loss": 0.088, "step": 41750 }, { "epoch": 50.37417018708509, "grad_norm": 6.912395477294922, "learning_rate": 1.9998994956654182e-05, "loss": 0.0885, "step": 41760 }, { "epoch": 50.386240193120095, "grad_norm": 8.679211616516113, "learning_rate": 1.9998994715399385e-05, "loss": 0.0897, "step": 41770 }, { "epoch": 50.3983101991551, "grad_norm": 7.263469696044922, "learning_rate": 1.999899447414459e-05, "loss": 0.0907, "step": 41780 }, { "epoch": 50.4103802051901, "grad_norm": 8.111063957214355, "learning_rate": 1.9998994232889798e-05, "loss": 0.0897, "step": 41790 }, { "epoch": 50.422450211225105, "grad_norm": 7.671141147613525, "learning_rate": 1.9998993991635004e-05, "loss": 0.0883, "step": 41800 }, { "epoch": 50.43452021726011, "grad_norm": 7.561425685882568, "learning_rate": 1.999899375038021e-05, "loss": 0.0903, "step": 41810 }, { "epoch": 50.44659022329511, "grad_norm": 7.290135383605957, "learning_rate": 1.9998993509125416e-05, "loss": 0.0895, "step": 41820 }, { "epoch": 50.458660229330114, "grad_norm": 7.544571399688721, "learning_rate": 1.9998993267870623e-05, "loss": 0.0895, "step": 41830 }, { "epoch": 50.47073023536512, "grad_norm": 7.311888217926025, "learning_rate": 1.999899302661583e-05, "loss": 0.0916, "step": 41840 }, { "epoch": 50.48280024140012, "grad_norm": 7.157449722290039, "learning_rate": 1.9998992785361035e-05, "loss": 0.09, "step": 41850 }, { "epoch": 50.494870247435124, "grad_norm": 6.926903247833252, "learning_rate": 1.999899254410624e-05, "loss": 0.0924, "step": 41860 }, { "epoch": 50.50694025347013, "grad_norm": 6.998215675354004, "learning_rate": 1.9998992302851447e-05, "loss": 0.0934, "step": 41870 }, { "epoch": 50.51901025950513, "grad_norm": 7.420600414276123, "learning_rate": 1.9998992061596654e-05, "loss": 
0.0928, "step": 41880 }, { "epoch": 50.53108026554013, "grad_norm": 7.173928260803223, "learning_rate": 1.999899182034186e-05, "loss": 0.0917, "step": 41890 }, { "epoch": 50.543150271575136, "grad_norm": 7.2388386726379395, "learning_rate": 1.9998991579087066e-05, "loss": 0.0918, "step": 41900 }, { "epoch": 50.55522027761014, "grad_norm": 7.20490837097168, "learning_rate": 1.9998991337832272e-05, "loss": 0.0935, "step": 41910 }, { "epoch": 50.56729028364514, "grad_norm": 7.6196818351745605, "learning_rate": 1.999899109657748e-05, "loss": 0.0917, "step": 41920 }, { "epoch": 50.579360289680146, "grad_norm": 7.437852382659912, "learning_rate": 1.9998990855322685e-05, "loss": 0.0942, "step": 41930 }, { "epoch": 50.59143029571515, "grad_norm": 7.516562461853027, "learning_rate": 1.999899061406789e-05, "loss": 0.0934, "step": 41940 }, { "epoch": 50.60350030175015, "grad_norm": 7.634312629699707, "learning_rate": 1.9998990372813097e-05, "loss": 0.0912, "step": 41950 }, { "epoch": 50.615570307785156, "grad_norm": 7.382517337799072, "learning_rate": 1.9998990131558303e-05, "loss": 0.0928, "step": 41960 }, { "epoch": 50.62764031382016, "grad_norm": 7.696516513824463, "learning_rate": 1.999898989030351e-05, "loss": 0.094, "step": 41970 }, { "epoch": 50.63971031985516, "grad_norm": 7.254597187042236, "learning_rate": 1.9998989649048716e-05, "loss": 0.0936, "step": 41980 }, { "epoch": 50.651780325890165, "grad_norm": 6.623546600341797, "learning_rate": 1.9998989407793922e-05, "loss": 0.0932, "step": 41990 }, { "epoch": 50.66385033192517, "grad_norm": 7.215982913970947, "learning_rate": 1.9998989166539128e-05, "loss": 0.0928, "step": 42000 }, { "epoch": 50.66385033192517, "eval_loss": 11.867677688598633, "eval_runtime": 8.1218, "eval_samples_per_second": 85.819, "eval_steps_per_second": 10.835, "step": 42000 }, { "epoch": 50.67592033796017, "grad_norm": 7.6255035400390625, "learning_rate": 1.9998988925284334e-05, "loss": 0.095, "step": 42010 }, { "epoch": 50.687990343995175, 
"grad_norm": 6.900207042694092, "learning_rate": 1.999898868402954e-05, "loss": 0.0943, "step": 42020 }, { "epoch": 50.70006035003018, "grad_norm": 6.902481555938721, "learning_rate": 1.9998988442774747e-05, "loss": 0.0941, "step": 42030 }, { "epoch": 50.71213035606518, "grad_norm": 7.560008525848389, "learning_rate": 1.9998988201519953e-05, "loss": 0.0931, "step": 42040 }, { "epoch": 50.724200362100184, "grad_norm": 7.488542079925537, "learning_rate": 1.999898796026516e-05, "loss": 0.097, "step": 42050 }, { "epoch": 50.73627036813519, "grad_norm": 7.3805999755859375, "learning_rate": 1.9998987719010365e-05, "loss": 0.0964, "step": 42060 }, { "epoch": 50.74834037417018, "grad_norm": 7.635969638824463, "learning_rate": 1.999898747775557e-05, "loss": 0.0931, "step": 42070 }, { "epoch": 50.76041038020519, "grad_norm": 7.718633651733398, "learning_rate": 1.9998987236500778e-05, "loss": 0.0948, "step": 42080 }, { "epoch": 50.77248038624019, "grad_norm": 7.813375473022461, "learning_rate": 1.9998986995245984e-05, "loss": 0.0921, "step": 42090 }, { "epoch": 50.78455039227519, "grad_norm": 7.77449893951416, "learning_rate": 1.999898675399119e-05, "loss": 0.0959, "step": 42100 }, { "epoch": 50.796620398310196, "grad_norm": 7.652085304260254, "learning_rate": 1.9998986512736397e-05, "loss": 0.0963, "step": 42110 }, { "epoch": 50.8086904043452, "grad_norm": 7.818132400512695, "learning_rate": 1.9998986271481603e-05, "loss": 0.0983, "step": 42120 }, { "epoch": 50.8207604103802, "grad_norm": 8.085147857666016, "learning_rate": 1.999898603022681e-05, "loss": 0.0924, "step": 42130 }, { "epoch": 50.832830416415206, "grad_norm": 7.229625701904297, "learning_rate": 1.9998985788972015e-05, "loss": 0.0983, "step": 42140 }, { "epoch": 50.84490042245021, "grad_norm": 7.925867080688477, "learning_rate": 1.999898554771722e-05, "loss": 0.096, "step": 42150 }, { "epoch": 50.85697042848521, "grad_norm": 8.019272804260254, "learning_rate": 1.9998985306462428e-05, "loss": 0.0975, "step": 42160 
}, { "epoch": 50.869040434520215, "grad_norm": 8.248308181762695, "learning_rate": 1.9998985065207634e-05, "loss": 0.1003, "step": 42170 }, { "epoch": 50.88111044055522, "grad_norm": 8.201766014099121, "learning_rate": 1.999898482395284e-05, "loss": 0.1013, "step": 42180 }, { "epoch": 50.89318044659022, "grad_norm": 7.58929967880249, "learning_rate": 1.9998984582698043e-05, "loss": 0.0972, "step": 42190 }, { "epoch": 50.905250452625225, "grad_norm": 7.5391082763671875, "learning_rate": 1.999898434144325e-05, "loss": 0.0968, "step": 42200 }, { "epoch": 50.91732045866023, "grad_norm": 7.501089096069336, "learning_rate": 1.9998984100188455e-05, "loss": 0.0959, "step": 42210 }, { "epoch": 50.92939046469523, "grad_norm": 8.310490608215332, "learning_rate": 1.999898385893366e-05, "loss": 0.097, "step": 42220 }, { "epoch": 50.941460470730235, "grad_norm": 7.550339698791504, "learning_rate": 1.9998983617678868e-05, "loss": 0.1015, "step": 42230 }, { "epoch": 50.95353047676524, "grad_norm": 7.178874969482422, "learning_rate": 1.9998983376424074e-05, "loss": 0.0949, "step": 42240 }, { "epoch": 50.96560048280024, "grad_norm": 7.8004279136657715, "learning_rate": 1.999898313516928e-05, "loss": 0.1005, "step": 42250 }, { "epoch": 50.977670488835244, "grad_norm": 7.403468608856201, "learning_rate": 1.9998982893914486e-05, "loss": 0.0962, "step": 42260 }, { "epoch": 50.98974049487025, "grad_norm": 7.488941669464111, "learning_rate": 1.9998982652659693e-05, "loss": 0.0988, "step": 42270 }, { "epoch": 51.0012070006035, "grad_norm": 6.650633335113525, "learning_rate": 1.99989824114049e-05, "loss": 0.095, "step": 42280 }, { "epoch": 51.0132770066385, "grad_norm": 6.781513690948486, "learning_rate": 1.9998982170150105e-05, "loss": 0.067, "step": 42290 }, { "epoch": 51.02534701267351, "grad_norm": 6.693936347961426, "learning_rate": 1.999898192889531e-05, "loss": 0.0714, "step": 42300 }, { "epoch": 51.03741701870851, "grad_norm": 5.840860843658447, "learning_rate": 
1.9998981687640517e-05, "loss": 0.0701, "step": 42310 }, { "epoch": 51.04948702474351, "grad_norm": 7.046181678771973, "learning_rate": 1.9998981446385724e-05, "loss": 0.0739, "step": 42320 }, { "epoch": 51.061557030778516, "grad_norm": 6.7377471923828125, "learning_rate": 1.999898120513093e-05, "loss": 0.0766, "step": 42330 }, { "epoch": 51.07362703681352, "grad_norm": 6.901611328125, "learning_rate": 1.9998980963876136e-05, "loss": 0.0797, "step": 42340 }, { "epoch": 51.08569704284852, "grad_norm": 6.684874057769775, "learning_rate": 1.9998980722621342e-05, "loss": 0.0788, "step": 42350 }, { "epoch": 51.097767048883526, "grad_norm": 6.3973565101623535, "learning_rate": 1.999898048136655e-05, "loss": 0.0759, "step": 42360 }, { "epoch": 51.10983705491853, "grad_norm": 6.6150078773498535, "learning_rate": 1.9998980240111755e-05, "loss": 0.0768, "step": 42370 }, { "epoch": 51.12190706095353, "grad_norm": 6.847794055938721, "learning_rate": 1.999897999885696e-05, "loss": 0.0765, "step": 42380 }, { "epoch": 51.133977066988535, "grad_norm": 6.867654800415039, "learning_rate": 1.9998979757602167e-05, "loss": 0.0814, "step": 42390 }, { "epoch": 51.14604707302354, "grad_norm": 7.304542064666748, "learning_rate": 1.9998979516347373e-05, "loss": 0.0798, "step": 42400 }, { "epoch": 51.15811707905854, "grad_norm": 7.251595497131348, "learning_rate": 1.999897927509258e-05, "loss": 0.0799, "step": 42410 }, { "epoch": 51.170187085093545, "grad_norm": 6.215214729309082, "learning_rate": 1.9998979033837786e-05, "loss": 0.0798, "step": 42420 }, { "epoch": 51.18225709112855, "grad_norm": 7.087857723236084, "learning_rate": 1.9998978792582992e-05, "loss": 0.0788, "step": 42430 }, { "epoch": 51.19432709716355, "grad_norm": 6.838620185852051, "learning_rate": 1.9998978551328195e-05, "loss": 0.0796, "step": 42440 }, { "epoch": 51.206397103198555, "grad_norm": 7.363870143890381, "learning_rate": 1.99989783100734e-05, "loss": 0.0838, "step": 42450 }, { "epoch": 51.21846710923356, 
"grad_norm": 6.919394016265869, "learning_rate": 1.9998978068818607e-05, "loss": 0.0812, "step": 42460 }, { "epoch": 51.23053711526856, "grad_norm": 6.285120010375977, "learning_rate": 1.9998977827563814e-05, "loss": 0.0807, "step": 42470 }, { "epoch": 51.242607121303564, "grad_norm": 6.799203872680664, "learning_rate": 1.999897758630902e-05, "loss": 0.0796, "step": 42480 }, { "epoch": 51.25467712733856, "grad_norm": 6.683582305908203, "learning_rate": 1.9998977345054226e-05, "loss": 0.0821, "step": 42490 }, { "epoch": 51.26674713337356, "grad_norm": 5.873566150665283, "learning_rate": 1.9998977103799432e-05, "loss": 0.0804, "step": 42500 }, { "epoch": 51.26674713337356, "eval_loss": 11.88502311706543, "eval_runtime": 8.1339, "eval_samples_per_second": 85.69, "eval_steps_per_second": 10.819, "step": 42500 }, { "epoch": 51.27881713940857, "grad_norm": 7.020887851715088, "learning_rate": 1.9998976862544642e-05, "loss": 0.0812, "step": 42510 }, { "epoch": 51.29088714544357, "grad_norm": 7.55595064163208, "learning_rate": 1.9998976621289848e-05, "loss": 0.0841, "step": 42520 }, { "epoch": 51.30295715147857, "grad_norm": 7.245715141296387, "learning_rate": 1.9998976380035054e-05, "loss": 0.0857, "step": 42530 }, { "epoch": 51.315027157513576, "grad_norm": 7.003671646118164, "learning_rate": 1.999897613878026e-05, "loss": 0.0877, "step": 42540 }, { "epoch": 51.32709716354858, "grad_norm": 7.113409519195557, "learning_rate": 1.9998975897525467e-05, "loss": 0.086, "step": 42550 }, { "epoch": 51.33916716958358, "grad_norm": 6.948751449584961, "learning_rate": 1.9998975656270673e-05, "loss": 0.0861, "step": 42560 }, { "epoch": 51.351237175618586, "grad_norm": 7.359129428863525, "learning_rate": 1.999897541501588e-05, "loss": 0.0855, "step": 42570 }, { "epoch": 51.36330718165359, "grad_norm": 7.370497226715088, "learning_rate": 1.9998975173761085e-05, "loss": 0.084, "step": 42580 }, { "epoch": 51.37537718768859, "grad_norm": 6.756912708282471, "learning_rate": 
1.999897493250629e-05, "loss": 0.0858, "step": 42590 }, { "epoch": 51.387447193723595, "grad_norm": 7.362685680389404, "learning_rate": 1.9998974691251494e-05, "loss": 0.0875, "step": 42600 }, { "epoch": 51.3995171997586, "grad_norm": 6.915769577026367, "learning_rate": 1.99989744499967e-05, "loss": 0.0895, "step": 42610 }, { "epoch": 51.4115872057936, "grad_norm": 7.797753810882568, "learning_rate": 1.9998974208741907e-05, "loss": 0.0867, "step": 42620 }, { "epoch": 51.423657211828605, "grad_norm": 7.21412992477417, "learning_rate": 1.9998973967487113e-05, "loss": 0.0862, "step": 42630 }, { "epoch": 51.43572721786361, "grad_norm": 7.601367473602295, "learning_rate": 1.999897372623232e-05, "loss": 0.0882, "step": 42640 }, { "epoch": 51.44779722389861, "grad_norm": 6.841019153594971, "learning_rate": 1.9998973484977525e-05, "loss": 0.0864, "step": 42650 }, { "epoch": 51.459867229933614, "grad_norm": 7.325252056121826, "learning_rate": 1.999897324372273e-05, "loss": 0.0872, "step": 42660 }, { "epoch": 51.47193723596862, "grad_norm": 7.794105052947998, "learning_rate": 1.9998973002467938e-05, "loss": 0.0859, "step": 42670 }, { "epoch": 51.48400724200362, "grad_norm": 7.695290565490723, "learning_rate": 1.9998972761213144e-05, "loss": 0.0878, "step": 42680 }, { "epoch": 51.496077248038624, "grad_norm": 6.834549427032471, "learning_rate": 1.999897251995835e-05, "loss": 0.0897, "step": 42690 }, { "epoch": 51.50814725407363, "grad_norm": 7.625329494476318, "learning_rate": 1.9998972278703556e-05, "loss": 0.0874, "step": 42700 }, { "epoch": 51.52021726010863, "grad_norm": 6.9875102043151855, "learning_rate": 1.9998972037448763e-05, "loss": 0.0935, "step": 42710 }, { "epoch": 51.53228726614363, "grad_norm": 7.020789623260498, "learning_rate": 1.999897179619397e-05, "loss": 0.0926, "step": 42720 }, { "epoch": 51.54435727217864, "grad_norm": 7.939413070678711, "learning_rate": 1.9998971554939175e-05, "loss": 0.0917, "step": 42730 }, { "epoch": 51.55642727821364, "grad_norm": 
6.5830278396606445, "learning_rate": 1.999897131368438e-05, "loss": 0.0872, "step": 42740 }, { "epoch": 51.56849728424864, "grad_norm": 6.87249755859375, "learning_rate": 1.9998971072429588e-05, "loss": 0.0905, "step": 42750 }, { "epoch": 51.580567290283646, "grad_norm": 8.125486373901367, "learning_rate": 1.9998970831174794e-05, "loss": 0.0947, "step": 42760 }, { "epoch": 51.59263729631865, "grad_norm": 7.628318786621094, "learning_rate": 1.999897058992e-05, "loss": 0.0933, "step": 42770 }, { "epoch": 51.60470730235365, "grad_norm": 7.584105014801025, "learning_rate": 1.9998970348665206e-05, "loss": 0.0898, "step": 42780 }, { "epoch": 51.616777308388656, "grad_norm": 7.404370307922363, "learning_rate": 1.9998970107410412e-05, "loss": 0.0937, "step": 42790 }, { "epoch": 51.62884731442366, "grad_norm": 8.160100936889648, "learning_rate": 1.999896986615562e-05, "loss": 0.0928, "step": 42800 }, { "epoch": 51.64091732045866, "grad_norm": 7.72684907913208, "learning_rate": 1.9998969624900825e-05, "loss": 0.0899, "step": 42810 }, { "epoch": 51.652987326493665, "grad_norm": 6.870789527893066, "learning_rate": 1.999896938364603e-05, "loss": 0.0921, "step": 42820 }, { "epoch": 51.66505733252867, "grad_norm": 7.269778251647949, "learning_rate": 1.9998969142391237e-05, "loss": 0.089, "step": 42830 }, { "epoch": 51.67712733856367, "grad_norm": 7.254339694976807, "learning_rate": 1.9998968901136443e-05, "loss": 0.0924, "step": 42840 }, { "epoch": 51.689197344598675, "grad_norm": 6.974227428436279, "learning_rate": 1.9998968659881646e-05, "loss": 0.0881, "step": 42850 }, { "epoch": 51.70126735063368, "grad_norm": 6.806041240692139, "learning_rate": 1.9998968418626853e-05, "loss": 0.0941, "step": 42860 }, { "epoch": 51.71333735666868, "grad_norm": 7.572941303253174, "learning_rate": 1.999896817737206e-05, "loss": 0.0917, "step": 42870 }, { "epoch": 51.725407362703685, "grad_norm": 7.393711090087891, "learning_rate": 1.9998967936117265e-05, "loss": 0.0935, "step": 42880 }, { 
"epoch": 51.73747736873869, "grad_norm": 7.1062469482421875, "learning_rate": 1.999896769486247e-05, "loss": 0.0966, "step": 42890 }, { "epoch": 51.749547374773684, "grad_norm": 7.845593452453613, "learning_rate": 1.9998967453607677e-05, "loss": 0.0964, "step": 42900 }, { "epoch": 51.76161738080869, "grad_norm": 8.012523651123047, "learning_rate": 1.9998967212352884e-05, "loss": 0.0938, "step": 42910 }, { "epoch": 51.77368738684369, "grad_norm": 7.5524749755859375, "learning_rate": 1.999896697109809e-05, "loss": 0.0947, "step": 42920 }, { "epoch": 51.78575739287869, "grad_norm": 7.0904765129089355, "learning_rate": 1.9998966729843296e-05, "loss": 0.091, "step": 42930 }, { "epoch": 51.7978273989137, "grad_norm": 7.445565223693848, "learning_rate": 1.9998966488588502e-05, "loss": 0.093, "step": 42940 }, { "epoch": 51.8098974049487, "grad_norm": 7.469458103179932, "learning_rate": 1.999896624733371e-05, "loss": 0.0959, "step": 42950 }, { "epoch": 51.8219674109837, "grad_norm": 7.1390180587768555, "learning_rate": 1.9998966006078915e-05, "loss": 0.0961, "step": 42960 }, { "epoch": 51.834037417018706, "grad_norm": 7.013580322265625, "learning_rate": 1.999896576482412e-05, "loss": 0.0923, "step": 42970 }, { "epoch": 51.84610742305371, "grad_norm": 7.707505702972412, "learning_rate": 1.9998965523569327e-05, "loss": 0.0916, "step": 42980 }, { "epoch": 51.85817742908871, "grad_norm": 7.6918721199035645, "learning_rate": 1.9998965282314533e-05, "loss": 0.096, "step": 42990 }, { "epoch": 51.870247435123716, "grad_norm": 7.387284755706787, "learning_rate": 1.999896504105974e-05, "loss": 0.0929, "step": 43000 }, { "epoch": 51.870247435123716, "eval_loss": 11.909481048583984, "eval_runtime": 8.1326, "eval_samples_per_second": 85.704, "eval_steps_per_second": 10.821, "step": 43000 }, { "epoch": 51.88231744115872, "grad_norm": 7.718961238861084, "learning_rate": 1.9998964799804946e-05, "loss": 0.0972, "step": 43010 }, { "epoch": 51.89438744719372, "grad_norm": 7.535588264465332, 
"learning_rate": 1.9998964558550152e-05, "loss": 0.0971, "step": 43020 }, { "epoch": 51.906457453228725, "grad_norm": 7.978545188903809, "learning_rate": 1.9998964317295358e-05, "loss": 0.1004, "step": 43030 }, { "epoch": 51.91852745926373, "grad_norm": 7.425403118133545, "learning_rate": 1.9998964076040564e-05, "loss": 0.0957, "step": 43040 }, { "epoch": 51.93059746529873, "grad_norm": 7.404405117034912, "learning_rate": 1.999896383478577e-05, "loss": 0.0986, "step": 43050 }, { "epoch": 51.942667471333735, "grad_norm": 7.511752605438232, "learning_rate": 1.9998963593530977e-05, "loss": 0.0936, "step": 43060 }, { "epoch": 51.95473747736874, "grad_norm": 7.794232368469238, "learning_rate": 1.9998963352276183e-05, "loss": 0.0979, "step": 43070 }, { "epoch": 51.96680748340374, "grad_norm": 7.78120756149292, "learning_rate": 1.999896311102139e-05, "loss": 0.094, "step": 43080 }, { "epoch": 51.978877489438744, "grad_norm": 8.202223777770996, "learning_rate": 1.9998962869766595e-05, "loss": 0.0965, "step": 43090 }, { "epoch": 51.99094749547375, "grad_norm": 7.976480484008789, "learning_rate": 1.9998962628511802e-05, "loss": 0.0959, "step": 43100 }, { "epoch": 52.002414001207, "grad_norm": 6.012881278991699, "learning_rate": 1.9998962387257008e-05, "loss": 0.0869, "step": 43110 }, { "epoch": 52.014484007242004, "grad_norm": 6.301485538482666, "learning_rate": 1.9998962146002214e-05, "loss": 0.0689, "step": 43120 }, { "epoch": 52.02655401327701, "grad_norm": 5.735093116760254, "learning_rate": 1.999896190474742e-05, "loss": 0.069, "step": 43130 }, { "epoch": 52.03862401931201, "grad_norm": 5.989753246307373, "learning_rate": 1.9998961663492627e-05, "loss": 0.071, "step": 43140 }, { "epoch": 52.05069402534701, "grad_norm": 6.781700134277344, "learning_rate": 1.9998961422237833e-05, "loss": 0.0726, "step": 43150 }, { "epoch": 52.06276403138202, "grad_norm": 6.569777965545654, "learning_rate": 1.999896118098304e-05, "loss": 0.0756, "step": 43160 }, { "epoch": 
52.07483403741702, "grad_norm": 6.510046005249023, "learning_rate": 1.9998960939728245e-05, "loss": 0.0762, "step": 43170 }, { "epoch": 52.08690404345202, "grad_norm": 6.0067572593688965, "learning_rate": 1.999896069847345e-05, "loss": 0.0752, "step": 43180 }, { "epoch": 52.098974049487026, "grad_norm": 6.526090621948242, "learning_rate": 1.9998960457218658e-05, "loss": 0.0754, "step": 43190 }, { "epoch": 52.11104405552203, "grad_norm": 6.691077709197998, "learning_rate": 1.9998960215963864e-05, "loss": 0.0767, "step": 43200 }, { "epoch": 52.12311406155703, "grad_norm": 6.682351112365723, "learning_rate": 1.999895997470907e-05, "loss": 0.0782, "step": 43210 }, { "epoch": 52.135184067592036, "grad_norm": 6.8772664070129395, "learning_rate": 1.9998959733454276e-05, "loss": 0.0817, "step": 43220 }, { "epoch": 52.14725407362704, "grad_norm": 6.790073871612549, "learning_rate": 1.9998959492199482e-05, "loss": 0.0763, "step": 43230 }, { "epoch": 52.15932407966204, "grad_norm": 6.61805534362793, "learning_rate": 1.999895925094469e-05, "loss": 0.0781, "step": 43240 }, { "epoch": 52.171394085697045, "grad_norm": 7.175724983215332, "learning_rate": 1.9998959009689895e-05, "loss": 0.0799, "step": 43250 }, { "epoch": 52.18346409173205, "grad_norm": 6.671657562255859, "learning_rate": 1.9998958768435098e-05, "loss": 0.079, "step": 43260 }, { "epoch": 52.19553409776705, "grad_norm": 6.7628703117370605, "learning_rate": 1.9998958527180304e-05, "loss": 0.0819, "step": 43270 }, { "epoch": 52.207604103802055, "grad_norm": 6.382618427276611, "learning_rate": 1.999895828592551e-05, "loss": 0.0818, "step": 43280 }, { "epoch": 52.21967410983706, "grad_norm": 7.170132160186768, "learning_rate": 1.9998958044670716e-05, "loss": 0.0775, "step": 43290 }, { "epoch": 52.23174411587206, "grad_norm": 7.016255855560303, "learning_rate": 1.9998957803415923e-05, "loss": 0.0803, "step": 43300 }, { "epoch": 52.243814121907064, "grad_norm": 6.747597694396973, "learning_rate": 1.999895756216113e-05, 
"loss": 0.0829, "step": 43310 }, { "epoch": 52.25588412794206, "grad_norm": 7.204065322875977, "learning_rate": 1.9998957320906335e-05, "loss": 0.0828, "step": 43320 }, { "epoch": 52.267954133977064, "grad_norm": 7.226229190826416, "learning_rate": 1.999895707965154e-05, "loss": 0.0849, "step": 43330 }, { "epoch": 52.28002414001207, "grad_norm": 6.577888011932373, "learning_rate": 1.9998956838396747e-05, "loss": 0.0806, "step": 43340 }, { "epoch": 52.29209414604707, "grad_norm": 6.945920467376709, "learning_rate": 1.9998956597141954e-05, "loss": 0.0813, "step": 43350 }, { "epoch": 52.30416415208207, "grad_norm": 6.958191394805908, "learning_rate": 1.999895635588716e-05, "loss": 0.0811, "step": 43360 }, { "epoch": 52.316234158117076, "grad_norm": 6.541409969329834, "learning_rate": 1.9998956114632366e-05, "loss": 0.0798, "step": 43370 }, { "epoch": 52.32830416415208, "grad_norm": 7.3670172691345215, "learning_rate": 1.9998955873377572e-05, "loss": 0.0846, "step": 43380 }, { "epoch": 52.34037417018708, "grad_norm": 7.350748538970947, "learning_rate": 1.999895563212278e-05, "loss": 0.0854, "step": 43390 }, { "epoch": 52.352444176222086, "grad_norm": 7.282080173492432, "learning_rate": 1.9998955390867985e-05, "loss": 0.0845, "step": 43400 }, { "epoch": 52.36451418225709, "grad_norm": 7.10967493057251, "learning_rate": 1.999895514961319e-05, "loss": 0.0855, "step": 43410 }, { "epoch": 52.37658418829209, "grad_norm": 7.3864898681640625, "learning_rate": 1.9998954908358397e-05, "loss": 0.0839, "step": 43420 }, { "epoch": 52.388654194327096, "grad_norm": 7.699806213378906, "learning_rate": 1.9998954667103603e-05, "loss": 0.0868, "step": 43430 }, { "epoch": 52.4007242003621, "grad_norm": 6.952051162719727, "learning_rate": 1.999895442584881e-05, "loss": 0.0847, "step": 43440 }, { "epoch": 52.4127942063971, "grad_norm": 7.122276306152344, "learning_rate": 1.9998954184594016e-05, "loss": 0.0874, "step": 43450 }, { "epoch": 52.424864212432105, "grad_norm": 7.095997333526611, 
"learning_rate": 1.9998953943339222e-05, "loss": 0.0855, "step": 43460 }, { "epoch": 52.43693421846711, "grad_norm": 6.670654296875, "learning_rate": 1.9998953702084428e-05, "loss": 0.0857, "step": 43470 }, { "epoch": 52.44900422450211, "grad_norm": 6.80975341796875, "learning_rate": 1.9998953460829634e-05, "loss": 0.0836, "step": 43480 }, { "epoch": 52.461074230537115, "grad_norm": 7.208990097045898, "learning_rate": 1.999895321957484e-05, "loss": 0.0873, "step": 43490 }, { "epoch": 52.47314423657212, "grad_norm": 7.50467586517334, "learning_rate": 1.9998952978320047e-05, "loss": 0.0873, "step": 43500 }, { "epoch": 52.47314423657212, "eval_loss": 11.920095443725586, "eval_runtime": 8.132, "eval_samples_per_second": 85.711, "eval_steps_per_second": 10.821, "step": 43500 }, { "epoch": 52.48521424260712, "grad_norm": 6.790525436401367, "learning_rate": 1.999895273706525e-05, "loss": 0.0876, "step": 43510 }, { "epoch": 52.497284248642124, "grad_norm": 7.238017559051514, "learning_rate": 1.9998952495810456e-05, "loss": 0.0868, "step": 43520 }, { "epoch": 52.50935425467713, "grad_norm": 7.459434509277344, "learning_rate": 1.9998952254555662e-05, "loss": 0.0871, "step": 43530 }, { "epoch": 52.52142426071213, "grad_norm": 7.5746660232543945, "learning_rate": 1.999895201330087e-05, "loss": 0.0846, "step": 43540 }, { "epoch": 52.533494266747134, "grad_norm": 6.787229537963867, "learning_rate": 1.9998951772046075e-05, "loss": 0.0883, "step": 43550 }, { "epoch": 52.54556427278214, "grad_norm": 6.773422718048096, "learning_rate": 1.999895153079128e-05, "loss": 0.0876, "step": 43560 }, { "epoch": 52.55763427881714, "grad_norm": 7.553560256958008, "learning_rate": 1.9998951289536487e-05, "loss": 0.0863, "step": 43570 }, { "epoch": 52.56970428485214, "grad_norm": 7.276688098907471, "learning_rate": 1.9998951048281693e-05, "loss": 0.089, "step": 43580 }, { "epoch": 52.58177429088715, "grad_norm": 7.843964099884033, "learning_rate": 1.9998950807026903e-05, "loss": 0.0934, "step": 
43590 }, { "epoch": 52.59384429692215, "grad_norm": 7.578711032867432, "learning_rate": 1.999895056577211e-05, "loss": 0.0942, "step": 43600 }, { "epoch": 52.60591430295715, "grad_norm": 6.995774745941162, "learning_rate": 1.9998950324517315e-05, "loss": 0.0912, "step": 43610 }, { "epoch": 52.617984308992156, "grad_norm": 7.592054843902588, "learning_rate": 1.999895008326252e-05, "loss": 0.0867, "step": 43620 }, { "epoch": 52.63005431502716, "grad_norm": 7.131338596343994, "learning_rate": 1.9998949842007728e-05, "loss": 0.0863, "step": 43630 }, { "epoch": 52.64212432106216, "grad_norm": 6.8159050941467285, "learning_rate": 1.9998949600752934e-05, "loss": 0.0879, "step": 43640 }, { "epoch": 52.654194327097166, "grad_norm": 8.062071800231934, "learning_rate": 1.999894935949814e-05, "loss": 0.0911, "step": 43650 }, { "epoch": 52.66626433313217, "grad_norm": 7.604842185974121, "learning_rate": 1.9998949118243346e-05, "loss": 0.0939, "step": 43660 }, { "epoch": 52.67833433916717, "grad_norm": 7.472267150878906, "learning_rate": 1.9998948876988553e-05, "loss": 0.0921, "step": 43670 }, { "epoch": 52.690404345202175, "grad_norm": 7.718752861022949, "learning_rate": 1.9998948635733755e-05, "loss": 0.0903, "step": 43680 }, { "epoch": 52.70247435123718, "grad_norm": 7.430314064025879, "learning_rate": 1.999894839447896e-05, "loss": 0.0917, "step": 43690 }, { "epoch": 52.71454435727218, "grad_norm": 7.161830902099609, "learning_rate": 1.9998948153224168e-05, "loss": 0.0916, "step": 43700 }, { "epoch": 52.726614363307185, "grad_norm": 8.26166820526123, "learning_rate": 1.9998947911969374e-05, "loss": 0.0933, "step": 43710 }, { "epoch": 52.73868436934219, "grad_norm": 7.477504253387451, "learning_rate": 1.999894767071458e-05, "loss": 0.094, "step": 43720 }, { "epoch": 52.750754375377184, "grad_norm": 7.871866703033447, "learning_rate": 1.9998947429459786e-05, "loss": 0.0924, "step": 43730 }, { "epoch": 52.76282438141219, "grad_norm": 7.108973979949951, "learning_rate": 
1.9998947188204993e-05, "loss": 0.0924, "step": 43740 }, { "epoch": 52.77489438744719, "grad_norm": 6.738912105560303, "learning_rate": 1.99989469469502e-05, "loss": 0.0917, "step": 43750 }, { "epoch": 52.786964393482194, "grad_norm": 7.478668689727783, "learning_rate": 1.9998946705695405e-05, "loss": 0.0914, "step": 43760 }, { "epoch": 52.7990343995172, "grad_norm": 7.768315315246582, "learning_rate": 1.999894646444061e-05, "loss": 0.0934, "step": 43770 }, { "epoch": 52.8111044055522, "grad_norm": 7.454723358154297, "learning_rate": 1.9998946223185818e-05, "loss": 0.0932, "step": 43780 }, { "epoch": 52.8231744115872, "grad_norm": 7.094920635223389, "learning_rate": 1.9998945981931024e-05, "loss": 0.093, "step": 43790 }, { "epoch": 52.83524441762221, "grad_norm": 6.8752875328063965, "learning_rate": 1.999894574067623e-05, "loss": 0.0943, "step": 43800 }, { "epoch": 52.84731442365721, "grad_norm": 7.271029472351074, "learning_rate": 1.9998945499421436e-05, "loss": 0.0913, "step": 43810 }, { "epoch": 52.85938442969221, "grad_norm": 7.698000907897949, "learning_rate": 1.9998945258166642e-05, "loss": 0.0936, "step": 43820 }, { "epoch": 52.871454435727216, "grad_norm": 7.736641883850098, "learning_rate": 1.999894501691185e-05, "loss": 0.0939, "step": 43830 }, { "epoch": 52.88352444176222, "grad_norm": 7.411783695220947, "learning_rate": 1.9998944775657055e-05, "loss": 0.0938, "step": 43840 }, { "epoch": 52.89559444779722, "grad_norm": 6.810129165649414, "learning_rate": 1.999894453440226e-05, "loss": 0.0946, "step": 43850 }, { "epoch": 52.907664453832226, "grad_norm": 7.232804298400879, "learning_rate": 1.9998944293147467e-05, "loss": 0.0943, "step": 43860 }, { "epoch": 52.91973445986723, "grad_norm": 7.380960941314697, "learning_rate": 1.9998944051892674e-05, "loss": 0.0968, "step": 43870 }, { "epoch": 52.93180446590223, "grad_norm": 7.591912746429443, "learning_rate": 1.999894381063788e-05, "loss": 0.0925, "step": 43880 }, { "epoch": 52.943874471937235, "grad_norm": 
7.585309028625488, "learning_rate": 1.9998943569383086e-05, "loss": 0.0966, "step": 43890 }, { "epoch": 52.95594447797224, "grad_norm": 8.09441089630127, "learning_rate": 1.9998943328128292e-05, "loss": 0.095, "step": 43900 }, { "epoch": 52.96801448400724, "grad_norm": 7.366138935089111, "learning_rate": 1.99989430868735e-05, "loss": 0.0952, "step": 43910 }, { "epoch": 52.980084490042245, "grad_norm": 7.862508773803711, "learning_rate": 1.9998942845618705e-05, "loss": 0.0934, "step": 43920 }, { "epoch": 52.99215449607725, "grad_norm": 7.415218353271484, "learning_rate": 1.9998942604363907e-05, "loss": 0.0937, "step": 43930 }, { "epoch": 53.0036210018105, "grad_norm": 5.987264633178711, "learning_rate": 1.9998942363109114e-05, "loss": 0.084, "step": 43940 }, { "epoch": 53.015691007845504, "grad_norm": 6.033254146575928, "learning_rate": 1.999894212185432e-05, "loss": 0.065, "step": 43950 }, { "epoch": 53.02776101388051, "grad_norm": 6.69768762588501, "learning_rate": 1.9998941880599526e-05, "loss": 0.0708, "step": 43960 }, { "epoch": 53.03983101991551, "grad_norm": 6.354555606842041, "learning_rate": 1.9998941639344732e-05, "loss": 0.0708, "step": 43970 }, { "epoch": 53.051901025950514, "grad_norm": 6.4700727462768555, "learning_rate": 1.999894139808994e-05, "loss": 0.073, "step": 43980 }, { "epoch": 53.06397103198552, "grad_norm": 6.757737636566162, "learning_rate": 1.9998941156835145e-05, "loss": 0.0708, "step": 43990 }, { "epoch": 53.07604103802052, "grad_norm": 5.93432092666626, "learning_rate": 1.999894091558035e-05, "loss": 0.0713, "step": 44000 }, { "epoch": 53.07604103802052, "eval_loss": 11.941197395324707, "eval_runtime": 8.1369, "eval_samples_per_second": 85.659, "eval_steps_per_second": 10.815, "step": 44000 }, { "epoch": 53.08811104405552, "grad_norm": 6.432583332061768, "learning_rate": 1.9998940674325557e-05, "loss": 0.0775, "step": 44010 }, { "epoch": 53.10018105009053, "grad_norm": 6.249329090118408, "learning_rate": 1.9998940433070763e-05, "loss": 
0.0725, "step": 44020 }, { "epoch": 53.11225105612553, "grad_norm": 6.906961441040039, "learning_rate": 1.999894019181597e-05, "loss": 0.0772, "step": 44030 }, { "epoch": 53.12432106216053, "grad_norm": 6.928810119628906, "learning_rate": 1.9998939950561176e-05, "loss": 0.0764, "step": 44040 }, { "epoch": 53.136391068195536, "grad_norm": 7.055656909942627, "learning_rate": 1.9998939709306382e-05, "loss": 0.0762, "step": 44050 }, { "epoch": 53.14846107423054, "grad_norm": 6.774951934814453, "learning_rate": 1.9998939468051588e-05, "loss": 0.0776, "step": 44060 }, { "epoch": 53.16053108026554, "grad_norm": 7.005372047424316, "learning_rate": 1.9998939226796794e-05, "loss": 0.0754, "step": 44070 }, { "epoch": 53.172601086300546, "grad_norm": 6.597530841827393, "learning_rate": 1.9998938985542e-05, "loss": 0.0744, "step": 44080 }, { "epoch": 53.18467109233555, "grad_norm": 7.047434329986572, "learning_rate": 1.9998938744287207e-05, "loss": 0.0784, "step": 44090 }, { "epoch": 53.19674109837055, "grad_norm": 7.269530773162842, "learning_rate": 1.9998938503032413e-05, "loss": 0.0785, "step": 44100 }, { "epoch": 53.208811104405555, "grad_norm": 7.235238552093506, "learning_rate": 1.999893826177762e-05, "loss": 0.0803, "step": 44110 }, { "epoch": 53.22088111044056, "grad_norm": 6.891387939453125, "learning_rate": 1.9998938020522826e-05, "loss": 0.0813, "step": 44120 }, { "epoch": 53.23295111647556, "grad_norm": 7.609297752380371, "learning_rate": 1.9998937779268032e-05, "loss": 0.0825, "step": 44130 }, { "epoch": 53.245021122510565, "grad_norm": 7.095157623291016, "learning_rate": 1.9998937538013238e-05, "loss": 0.0836, "step": 44140 }, { "epoch": 53.25709112854556, "grad_norm": 6.51215124130249, "learning_rate": 1.9998937296758444e-05, "loss": 0.0802, "step": 44150 }, { "epoch": 53.269161134580564, "grad_norm": 7.1923394203186035, "learning_rate": 1.999893705550365e-05, "loss": 0.0813, "step": 44160 }, { "epoch": 53.28123114061557, "grad_norm": 6.681440353393555, 
"learning_rate": 1.9998936814248857e-05, "loss": 0.0808, "step": 44170 }, { "epoch": 53.29330114665057, "grad_norm": 6.785576820373535, "learning_rate": 1.9998936572994063e-05, "loss": 0.0854, "step": 44180 }, { "epoch": 53.305371152685574, "grad_norm": 7.315168857574463, "learning_rate": 1.999893633173927e-05, "loss": 0.0844, "step": 44190 }, { "epoch": 53.31744115872058, "grad_norm": 6.538464069366455, "learning_rate": 1.9998936090484475e-05, "loss": 0.0841, "step": 44200 }, { "epoch": 53.32951116475558, "grad_norm": 6.6492486000061035, "learning_rate": 1.999893584922968e-05, "loss": 0.0802, "step": 44210 }, { "epoch": 53.34158117079058, "grad_norm": 6.845884323120117, "learning_rate": 1.9998935607974888e-05, "loss": 0.0797, "step": 44220 }, { "epoch": 53.353651176825586, "grad_norm": 6.8548760414123535, "learning_rate": 1.9998935366720094e-05, "loss": 0.0824, "step": 44230 }, { "epoch": 53.36572118286059, "grad_norm": 7.303312301635742, "learning_rate": 1.99989351254653e-05, "loss": 0.0812, "step": 44240 }, { "epoch": 53.37779118889559, "grad_norm": 7.5092453956604, "learning_rate": 1.9998934884210506e-05, "loss": 0.0846, "step": 44250 }, { "epoch": 53.389861194930596, "grad_norm": 6.404805660247803, "learning_rate": 1.9998934642955713e-05, "loss": 0.0863, "step": 44260 }, { "epoch": 53.4019312009656, "grad_norm": 7.592354774475098, "learning_rate": 1.999893440170092e-05, "loss": 0.0855, "step": 44270 }, { "epoch": 53.4140012070006, "grad_norm": 6.435563564300537, "learning_rate": 1.9998934160446125e-05, "loss": 0.0851, "step": 44280 }, { "epoch": 53.426071213035605, "grad_norm": 6.343258857727051, "learning_rate": 1.999893391919133e-05, "loss": 0.0801, "step": 44290 }, { "epoch": 53.43814121907061, "grad_norm": 7.412819862365723, "learning_rate": 1.9998933677936537e-05, "loss": 0.0857, "step": 44300 }, { "epoch": 53.45021122510561, "grad_norm": 6.397753715515137, "learning_rate": 1.9998933436681744e-05, "loss": 0.0812, "step": 44310 }, { "epoch": 
53.462281231140615, "grad_norm": 7.528954982757568, "learning_rate": 1.999893319542695e-05, "loss": 0.0886, "step": 44320 }, { "epoch": 53.47435123717562, "grad_norm": 7.17124605178833, "learning_rate": 1.9998932954172156e-05, "loss": 0.0826, "step": 44330 }, { "epoch": 53.48642124321062, "grad_norm": 6.743661880493164, "learning_rate": 1.999893271291736e-05, "loss": 0.0827, "step": 44340 }, { "epoch": 53.498491249245625, "grad_norm": 6.915576934814453, "learning_rate": 1.9998932471662565e-05, "loss": 0.0838, "step": 44350 }, { "epoch": 53.51056125528063, "grad_norm": 6.751610279083252, "learning_rate": 1.999893223040777e-05, "loss": 0.0857, "step": 44360 }, { "epoch": 53.52263126131563, "grad_norm": 7.467238426208496, "learning_rate": 1.9998931989152978e-05, "loss": 0.0872, "step": 44370 }, { "epoch": 53.534701267350634, "grad_norm": 6.661594390869141, "learning_rate": 1.9998931747898184e-05, "loss": 0.0875, "step": 44380 }, { "epoch": 53.54677127338564, "grad_norm": 7.575488567352295, "learning_rate": 1.999893150664339e-05, "loss": 0.084, "step": 44390 }, { "epoch": 53.55884127942064, "grad_norm": 7.225195407867432, "learning_rate": 1.9998931265388596e-05, "loss": 0.0857, "step": 44400 }, { "epoch": 53.570911285455644, "grad_norm": 6.996842861175537, "learning_rate": 1.9998931024133802e-05, "loss": 0.0876, "step": 44410 }, { "epoch": 53.58298129149065, "grad_norm": 6.856820106506348, "learning_rate": 1.999893078287901e-05, "loss": 0.0879, "step": 44420 }, { "epoch": 53.59505129752565, "grad_norm": 7.2516045570373535, "learning_rate": 1.9998930541624215e-05, "loss": 0.0838, "step": 44430 }, { "epoch": 53.60712130356065, "grad_norm": 6.95167875289917, "learning_rate": 1.999893030036942e-05, "loss": 0.0855, "step": 44440 }, { "epoch": 53.61919130959566, "grad_norm": 7.077497959136963, "learning_rate": 1.9998930059114627e-05, "loss": 0.0862, "step": 44450 }, { "epoch": 53.63126131563066, "grad_norm": 6.741082191467285, "learning_rate": 1.9998929817859833e-05, "loss": 
0.0884, "step": 44460 }, { "epoch": 53.64333132166566, "grad_norm": 6.278419494628906, "learning_rate": 1.999892957660504e-05, "loss": 0.0844, "step": 44470 }, { "epoch": 53.655401327700666, "grad_norm": 7.968720436096191, "learning_rate": 1.9998929335350246e-05, "loss": 0.087, "step": 44480 }, { "epoch": 53.66747133373567, "grad_norm": 6.945417404174805, "learning_rate": 1.9998929094095452e-05, "loss": 0.0895, "step": 44490 }, { "epoch": 53.67954133977067, "grad_norm": 6.7355828285217285, "learning_rate": 1.9998928852840658e-05, "loss": 0.0859, "step": 44500 }, { "epoch": 53.67954133977067, "eval_loss": 11.952727317810059, "eval_runtime": 8.1281, "eval_samples_per_second": 85.752, "eval_steps_per_second": 10.827, "step": 44500 }, { "epoch": 53.691611345805676, "grad_norm": 7.042218208312988, "learning_rate": 1.9998928611585865e-05, "loss": 0.0911, "step": 44510 }, { "epoch": 53.70368135184068, "grad_norm": 7.077144145965576, "learning_rate": 1.999892837033107e-05, "loss": 0.0876, "step": 44520 }, { "epoch": 53.71575135787568, "grad_norm": 6.960136890411377, "learning_rate": 1.9998928129076277e-05, "loss": 0.087, "step": 44530 }, { "epoch": 53.727821363910685, "grad_norm": 7.464016914367676, "learning_rate": 1.9998927887821483e-05, "loss": 0.0913, "step": 44540 }, { "epoch": 53.73989136994569, "grad_norm": 7.865756988525391, "learning_rate": 1.999892764656669e-05, "loss": 0.0892, "step": 44550 }, { "epoch": 53.751961375980684, "grad_norm": 7.182476997375488, "learning_rate": 1.9998927405311896e-05, "loss": 0.0874, "step": 44560 }, { "epoch": 53.76403138201569, "grad_norm": 6.787639141082764, "learning_rate": 1.9998927164057102e-05, "loss": 0.093, "step": 44570 }, { "epoch": 53.77610138805069, "grad_norm": 7.324212551116943, "learning_rate": 1.9998926922802308e-05, "loss": 0.0873, "step": 44580 }, { "epoch": 53.788171394085694, "grad_norm": 6.476593494415283, "learning_rate": 1.999892668154751e-05, "loss": 0.09, "step": 44590 }, { "epoch": 53.8002414001207, 
"grad_norm": 7.172462463378906, "learning_rate": 1.9998926440292717e-05, "loss": 0.0897, "step": 44600 }, { "epoch": 53.8123114061557, "grad_norm": 6.859938621520996, "learning_rate": 1.9998926199037923e-05, "loss": 0.0935, "step": 44610 }, { "epoch": 53.824381412190704, "grad_norm": 7.8837056159973145, "learning_rate": 1.999892595778313e-05, "loss": 0.093, "step": 44620 }, { "epoch": 53.83645141822571, "grad_norm": 7.292132377624512, "learning_rate": 1.9998925716528336e-05, "loss": 0.0941, "step": 44630 }, { "epoch": 53.84852142426071, "grad_norm": Infinity, "learning_rate": 1.9998925475273542e-05, "loss": 0.0938, "step": 44640 }, { "epoch": 53.86059143029571, "grad_norm": 7.544769763946533, "learning_rate": 1.9998925234018748e-05, "loss": 0.0909, "step": 44650 }, { "epoch": 53.872661436330716, "grad_norm": 7.677807807922363, "learning_rate": 1.9998924992763954e-05, "loss": 0.091, "step": 44660 }, { "epoch": 53.88473144236572, "grad_norm": 7.27214241027832, "learning_rate": 1.9998924751509164e-05, "loss": 0.0915, "step": 44670 }, { "epoch": 53.89680144840072, "grad_norm": 6.883355140686035, "learning_rate": 1.999892451025437e-05, "loss": 0.0895, "step": 44680 }, { "epoch": 53.908871454435726, "grad_norm": 7.431040287017822, "learning_rate": 1.9998924268999576e-05, "loss": 0.0912, "step": 44690 }, { "epoch": 53.92094146047073, "grad_norm": 7.88872766494751, "learning_rate": 1.9998924027744783e-05, "loss": 0.0919, "step": 44700 }, { "epoch": 53.93301146650573, "grad_norm": 7.618211269378662, "learning_rate": 1.999892378648999e-05, "loss": 0.0952, "step": 44710 }, { "epoch": 53.945081472540735, "grad_norm": 7.670217514038086, "learning_rate": 1.9998923545235195e-05, "loss": 0.0935, "step": 44720 }, { "epoch": 53.95715147857574, "grad_norm": 7.219125270843506, "learning_rate": 1.99989233039804e-05, "loss": 0.0943, "step": 44730 }, { "epoch": 53.96922148461074, "grad_norm": 7.0218071937561035, "learning_rate": 1.9998923062725607e-05, "loss": 0.0917, "step": 44740 }, { 
"epoch": 53.981291490645745, "grad_norm": 7.607489109039307, "learning_rate": 1.999892282147081e-05, "loss": 0.0943, "step": 44750 }, { "epoch": 53.99336149668075, "grad_norm": 7.986001491546631, "learning_rate": 1.9998922580216017e-05, "loss": 0.0976, "step": 44760 }, { "epoch": 54.004828002414, "grad_norm": 5.605995178222656, "learning_rate": 1.9998922338961223e-05, "loss": 0.0777, "step": 44770 }, { "epoch": 54.016898008449004, "grad_norm": 6.09039306640625, "learning_rate": 1.999892209770643e-05, "loss": 0.0629, "step": 44780 }, { "epoch": 54.02896801448401, "grad_norm": 6.2176103591918945, "learning_rate": 1.9998921856451635e-05, "loss": 0.0657, "step": 44790 }, { "epoch": 54.04103802051901, "grad_norm": 6.517750263214111, "learning_rate": 1.999892161519684e-05, "loss": 0.0687, "step": 44800 }, { "epoch": 54.053108026554014, "grad_norm": 6.175445079803467, "learning_rate": 1.9998921373942048e-05, "loss": 0.0707, "step": 44810 }, { "epoch": 54.06517803258902, "grad_norm": 6.336923122406006, "learning_rate": 1.9998921132687254e-05, "loss": 0.0737, "step": 44820 }, { "epoch": 54.07724803862402, "grad_norm": 6.385021209716797, "learning_rate": 1.999892089143246e-05, "loss": 0.0732, "step": 44830 }, { "epoch": 54.089318044659024, "grad_norm": 6.12568998336792, "learning_rate": 1.9998920650177666e-05, "loss": 0.0728, "step": 44840 }, { "epoch": 54.10138805069403, "grad_norm": 6.56370210647583, "learning_rate": 1.9998920408922872e-05, "loss": 0.0749, "step": 44850 }, { "epoch": 54.11345805672903, "grad_norm": 6.214873313903809, "learning_rate": 1.999892016766808e-05, "loss": 0.0753, "step": 44860 }, { "epoch": 54.12552806276403, "grad_norm": 6.3482561111450195, "learning_rate": 1.9998919926413285e-05, "loss": 0.0765, "step": 44870 }, { "epoch": 54.137598068799036, "grad_norm": 7.051959991455078, "learning_rate": 1.999891968515849e-05, "loss": 0.0763, "step": 44880 }, { "epoch": 54.14966807483404, "grad_norm": 6.012462615966797, "learning_rate": 
1.9998919443903697e-05, "loss": 0.0756, "step": 44890 }, { "epoch": 54.16173808086904, "grad_norm": 6.317353248596191, "learning_rate": 1.9998919202648904e-05, "loss": 0.075, "step": 44900 }, { "epoch": 54.173808086904046, "grad_norm": 6.16589879989624, "learning_rate": 1.999891896139411e-05, "loss": 0.0744, "step": 44910 }, { "epoch": 54.18587809293905, "grad_norm": 7.146300315856934, "learning_rate": 1.9998918720139316e-05, "loss": 0.0761, "step": 44920 }, { "epoch": 54.19794809897405, "grad_norm": 6.725696563720703, "learning_rate": 1.9998918478884522e-05, "loss": 0.077, "step": 44930 }, { "epoch": 54.210018105009055, "grad_norm": 7.320751190185547, "learning_rate": 1.999891823762973e-05, "loss": 0.0788, "step": 44940 }, { "epoch": 54.22208811104406, "grad_norm": 6.509537696838379, "learning_rate": 1.9998917996374935e-05, "loss": 0.0799, "step": 44950 }, { "epoch": 54.23415811707906, "grad_norm": 6.197441577911377, "learning_rate": 1.999891775512014e-05, "loss": 0.0749, "step": 44960 }, { "epoch": 54.246228123114065, "grad_norm": 6.357359886169434, "learning_rate": 1.9998917513865347e-05, "loss": 0.0787, "step": 44970 }, { "epoch": 54.25829812914906, "grad_norm": 7.508486747741699, "learning_rate": 1.9998917272610553e-05, "loss": 0.0801, "step": 44980 }, { "epoch": 54.270368135184064, "grad_norm": 7.3297505378723145, "learning_rate": 1.999891703135576e-05, "loss": 0.0833, "step": 44990 }, { "epoch": 54.28243814121907, "grad_norm": 7.156322479248047, "learning_rate": 1.9998916790100962e-05, "loss": 0.0816, "step": 45000 }, { "epoch": 54.28243814121907, "eval_loss": 11.964884757995605, "eval_runtime": 8.1325, "eval_samples_per_second": 85.705, "eval_steps_per_second": 10.821, "step": 45000 }, { "epoch": 54.29450814725407, "grad_norm": 6.898538112640381, "learning_rate": 1.999891654884617e-05, "loss": 0.0799, "step": 45010 }, { "epoch": 54.306578153289074, "grad_norm": 6.9805402755737305, "learning_rate": 1.9998916307591375e-05, "loss": 0.081, "step": 45020 }, { 
"epoch": 54.31864815932408, "grad_norm": 6.737439155578613, "learning_rate": 1.999891606633658e-05, "loss": 0.0803, "step": 45030 }, { "epoch": 54.33071816535908, "grad_norm": 6.1942009925842285, "learning_rate": 1.9998915825081787e-05, "loss": 0.0797, "step": 45040 }, { "epoch": 54.34278817139408, "grad_norm": 7.2402262687683105, "learning_rate": 1.9998915583826993e-05, "loss": 0.0825, "step": 45050 }, { "epoch": 54.35485817742909, "grad_norm": 6.562304973602295, "learning_rate": 1.99989153425722e-05, "loss": 0.082, "step": 45060 }, { "epoch": 54.36692818346409, "grad_norm": 6.572335720062256, "learning_rate": 1.9998915101317406e-05, "loss": 0.0805, "step": 45070 }, { "epoch": 54.37899818949909, "grad_norm": 7.386560916900635, "learning_rate": 1.9998914860062612e-05, "loss": 0.081, "step": 45080 }, { "epoch": 54.391068195534096, "grad_norm": 6.700051784515381, "learning_rate": 1.9998914618807818e-05, "loss": 0.0842, "step": 45090 }, { "epoch": 54.4031382015691, "grad_norm": 6.876665115356445, "learning_rate": 1.9998914377553024e-05, "loss": 0.0841, "step": 45100 }, { "epoch": 54.4152082076041, "grad_norm": 6.910575866699219, "learning_rate": 1.999891413629823e-05, "loss": 0.084, "step": 45110 }, { "epoch": 54.427278213639106, "grad_norm": 7.1263909339904785, "learning_rate": 1.9998913895043437e-05, "loss": 0.0851, "step": 45120 }, { "epoch": 54.43934821967411, "grad_norm": 7.813811302185059, "learning_rate": 1.9998913653788643e-05, "loss": 0.0841, "step": 45130 }, { "epoch": 54.45141822570911, "grad_norm": 6.1861982345581055, "learning_rate": 1.999891341253385e-05, "loss": 0.0836, "step": 45140 }, { "epoch": 54.463488231744115, "grad_norm": 6.700695037841797, "learning_rate": 1.9998913171279056e-05, "loss": 0.0818, "step": 45150 }, { "epoch": 54.47555823777912, "grad_norm": 6.526190757751465, "learning_rate": 1.9998912930024262e-05, "loss": 0.0845, "step": 45160 }, { "epoch": 54.48762824381412, "grad_norm": 7.239593505859375, "learning_rate": 
1.9998912688769468e-05, "loss": 0.0828, "step": 45170 }, { "epoch": 54.499698249849125, "grad_norm": 6.580841064453125, "learning_rate": 1.9998912447514674e-05, "loss": 0.0833, "step": 45180 }, { "epoch": 54.51176825588413, "grad_norm": 6.798619270324707, "learning_rate": 1.999891220625988e-05, "loss": 0.0822, "step": 45190 }, { "epoch": 54.52383826191913, "grad_norm": 6.52896785736084, "learning_rate": 1.9998911965005087e-05, "loss": 0.0817, "step": 45200 }, { "epoch": 54.535908267954134, "grad_norm": 6.924264907836914, "learning_rate": 1.9998911723750293e-05, "loss": 0.0842, "step": 45210 }, { "epoch": 54.54797827398914, "grad_norm": 6.870396137237549, "learning_rate": 1.99989114824955e-05, "loss": 0.0844, "step": 45220 }, { "epoch": 54.56004828002414, "grad_norm": 7.10415506362915, "learning_rate": 1.9998911241240705e-05, "loss": 0.0859, "step": 45230 }, { "epoch": 54.572118286059144, "grad_norm": 6.694983959197998, "learning_rate": 1.999891099998591e-05, "loss": 0.0837, "step": 45240 }, { "epoch": 54.58418829209415, "grad_norm": 6.66759729385376, "learning_rate": 1.9998910758731118e-05, "loss": 0.0841, "step": 45250 }, { "epoch": 54.59625829812915, "grad_norm": 6.45742130279541, "learning_rate": 1.9998910517476324e-05, "loss": 0.0863, "step": 45260 }, { "epoch": 54.608328304164154, "grad_norm": 6.766964912414551, "learning_rate": 1.999891027622153e-05, "loss": 0.0895, "step": 45270 }, { "epoch": 54.62039831019916, "grad_norm": 7.654740810394287, "learning_rate": 1.9998910034966736e-05, "loss": 0.0857, "step": 45280 }, { "epoch": 54.63246831623416, "grad_norm": 7.040428161621094, "learning_rate": 1.9998909793711943e-05, "loss": 0.0847, "step": 45290 }, { "epoch": 54.64453832226916, "grad_norm": 6.96520471572876, "learning_rate": 1.999890955245715e-05, "loss": 0.0853, "step": 45300 }, { "epoch": 54.656608328304166, "grad_norm": 7.760935306549072, "learning_rate": 1.9998909311202355e-05, "loss": 0.0843, "step": 45310 }, { "epoch": 54.66867833433917, "grad_norm": 
7.324920654296875, "learning_rate": 1.999890906994756e-05, "loss": 0.0872, "step": 45320 }, { "epoch": 54.68074834037417, "grad_norm": 6.679690837860107, "learning_rate": 1.9998908828692767e-05, "loss": 0.0861, "step": 45330 }, { "epoch": 54.692818346409176, "grad_norm": 7.184390068054199, "learning_rate": 1.9998908587437974e-05, "loss": 0.0855, "step": 45340 }, { "epoch": 54.70488835244418, "grad_norm": 7.016421794891357, "learning_rate": 1.999890834618318e-05, "loss": 0.0873, "step": 45350 }, { "epoch": 54.71695835847918, "grad_norm": 7.332810878753662, "learning_rate": 1.9998908104928386e-05, "loss": 0.0859, "step": 45360 }, { "epoch": 54.729028364514186, "grad_norm": 7.199855327606201, "learning_rate": 1.9998907863673592e-05, "loss": 0.0878, "step": 45370 }, { "epoch": 54.74109837054919, "grad_norm": 6.644885540008545, "learning_rate": 1.99989076224188e-05, "loss": 0.0882, "step": 45380 }, { "epoch": 54.753168376584185, "grad_norm": 6.500704288482666, "learning_rate": 1.9998907381164005e-05, "loss": 0.0905, "step": 45390 }, { "epoch": 54.76523838261919, "grad_norm": 7.7537970542907715, "learning_rate": 1.999890713990921e-05, "loss": 0.0881, "step": 45400 }, { "epoch": 54.77730838865419, "grad_norm": 6.792665481567383, "learning_rate": 1.9998906898654417e-05, "loss": 0.0915, "step": 45410 }, { "epoch": 54.789378394689194, "grad_norm": 7.205349445343018, "learning_rate": 1.999890665739962e-05, "loss": 0.0908, "step": 45420 }, { "epoch": 54.8014484007242, "grad_norm": 7.397759914398193, "learning_rate": 1.9998906416144826e-05, "loss": 0.0925, "step": 45430 }, { "epoch": 54.8135184067592, "grad_norm": 6.577895164489746, "learning_rate": 1.9998906174890032e-05, "loss": 0.0878, "step": 45440 }, { "epoch": 54.825588412794204, "grad_norm": 7.137807369232178, "learning_rate": 1.999890593363524e-05, "loss": 0.0903, "step": 45450 }, { "epoch": 54.83765841882921, "grad_norm": 7.105906963348389, "learning_rate": 1.9998905692380445e-05, "loss": 0.0894, "step": 45460 }, { 
"epoch": 54.84972842486421, "grad_norm": 7.926321983337402, "learning_rate": 1.999890545112565e-05, "loss": 0.0897, "step": 45470 }, { "epoch": 54.86179843089921, "grad_norm": 7.0323100090026855, "learning_rate": 1.9998905209870857e-05, "loss": 0.0895, "step": 45480 }, { "epoch": 54.87386843693422, "grad_norm": 7.092626094818115, "learning_rate": 1.9998904968616063e-05, "loss": 0.0916, "step": 45490 }, { "epoch": 54.88593844296922, "grad_norm": 6.816891193389893, "learning_rate": 1.999890472736127e-05, "loss": 0.0888, "step": 45500 }, { "epoch": 54.88593844296922, "eval_loss": 11.998488426208496, "eval_runtime": 8.1251, "eval_samples_per_second": 85.783, "eval_steps_per_second": 10.831, "step": 45500 }, { "epoch": 54.89800844900422, "grad_norm": 7.111242771148682, "learning_rate": 1.9998904486106476e-05, "loss": 0.0875, "step": 45510 }, { "epoch": 54.910078455039226, "grad_norm": 7.46934175491333, "learning_rate": 1.9998904244851682e-05, "loss": 0.088, "step": 45520 }, { "epoch": 54.92214846107423, "grad_norm": 6.887979030609131, "learning_rate": 1.999890400359689e-05, "loss": 0.0908, "step": 45530 }, { "epoch": 54.93421846710923, "grad_norm": 7.332968711853027, "learning_rate": 1.9998903762342095e-05, "loss": 0.0937, "step": 45540 }, { "epoch": 54.946288473144236, "grad_norm": 8.038729667663574, "learning_rate": 1.99989035210873e-05, "loss": 0.0915, "step": 45550 }, { "epoch": 54.95835847917924, "grad_norm": 7.556331634521484, "learning_rate": 1.9998903279832507e-05, "loss": 0.0924, "step": 45560 }, { "epoch": 54.97042848521424, "grad_norm": 7.162786483764648, "learning_rate": 1.9998903038577713e-05, "loss": 0.0894, "step": 45570 }, { "epoch": 54.982498491249245, "grad_norm": 6.78795051574707, "learning_rate": 1.999890279732292e-05, "loss": 0.0898, "step": 45580 }, { "epoch": 54.99456849728425, "grad_norm": 7.283093452453613, "learning_rate": 1.9998902556068126e-05, "loss": 0.0926, "step": 45590 }, { "epoch": 55.0060350030175, "grad_norm": 5.807419776916504, 
"learning_rate": 1.9998902314813332e-05, "loss": 0.0766, "step": 45600 }, { "epoch": 55.018105009052505, "grad_norm": 6.284664630889893, "learning_rate": 1.9998902073558538e-05, "loss": 0.0601, "step": 45610 }, { "epoch": 55.03017501508751, "grad_norm": 6.072381496429443, "learning_rate": 1.9998901832303744e-05, "loss": 0.0657, "step": 45620 }, { "epoch": 55.04224502112251, "grad_norm": 6.474797248840332, "learning_rate": 1.999890159104895e-05, "loss": 0.0675, "step": 45630 }, { "epoch": 55.054315027157514, "grad_norm": 6.182034492492676, "learning_rate": 1.9998901349794157e-05, "loss": 0.0676, "step": 45640 }, { "epoch": 55.06638503319252, "grad_norm": 5.962343215942383, "learning_rate": 1.9998901108539363e-05, "loss": 0.0692, "step": 45650 }, { "epoch": 55.07845503922752, "grad_norm": 6.297152996063232, "learning_rate": 1.999890086728457e-05, "loss": 0.0724, "step": 45660 }, { "epoch": 55.090525045262524, "grad_norm": 6.129759311676025, "learning_rate": 1.9998900626029772e-05, "loss": 0.0721, "step": 45670 }, { "epoch": 55.10259505129753, "grad_norm": 6.280327320098877, "learning_rate": 1.9998900384774978e-05, "loss": 0.0711, "step": 45680 }, { "epoch": 55.11466505733253, "grad_norm": 6.550265789031982, "learning_rate": 1.9998900143520184e-05, "loss": 0.0719, "step": 45690 }, { "epoch": 55.12673506336753, "grad_norm": 6.477368354797363, "learning_rate": 1.999889990226539e-05, "loss": 0.0739, "step": 45700 }, { "epoch": 55.13880506940254, "grad_norm": 6.609169960021973, "learning_rate": 1.9998899661010597e-05, "loss": 0.0749, "step": 45710 }, { "epoch": 55.15087507543754, "grad_norm": 6.80898904800415, "learning_rate": 1.9998899419755803e-05, "loss": 0.0718, "step": 45720 }, { "epoch": 55.16294508147254, "grad_norm": 5.869493007659912, "learning_rate": 1.999889917850101e-05, "loss": 0.0721, "step": 45730 }, { "epoch": 55.175015087507546, "grad_norm": 6.2465291023254395, "learning_rate": 1.9998898937246215e-05, "loss": 0.0752, "step": 45740 }, { "epoch": 
55.18708509354255, "grad_norm": 6.979106903076172, "learning_rate": 1.9998898695991425e-05, "loss": 0.075, "step": 45750 }, { "epoch": 55.19915509957755, "grad_norm": 7.024055480957031, "learning_rate": 1.999889845473663e-05, "loss": 0.0788, "step": 45760 }, { "epoch": 55.211225105612556, "grad_norm": 6.018229007720947, "learning_rate": 1.9998898213481837e-05, "loss": 0.0752, "step": 45770 }, { "epoch": 55.22329511164756, "grad_norm": 6.785912036895752, "learning_rate": 1.9998897972227044e-05, "loss": 0.0755, "step": 45780 }, { "epoch": 55.23536511768256, "grad_norm": 6.387754440307617, "learning_rate": 1.999889773097225e-05, "loss": 0.0759, "step": 45790 }, { "epoch": 55.247435123717565, "grad_norm": 6.369368553161621, "learning_rate": 1.9998897489717456e-05, "loss": 0.0784, "step": 45800 }, { "epoch": 55.25950512975256, "grad_norm": 7.498253345489502, "learning_rate": 1.9998897248462662e-05, "loss": 0.0781, "step": 45810 }, { "epoch": 55.271575135787565, "grad_norm": 6.387593746185303, "learning_rate": 1.999889700720787e-05, "loss": 0.0795, "step": 45820 }, { "epoch": 55.28364514182257, "grad_norm": 6.995980739593506, "learning_rate": 1.999889676595307e-05, "loss": 0.0805, "step": 45830 }, { "epoch": 55.29571514785757, "grad_norm": 6.475743770599365, "learning_rate": 1.9998896524698278e-05, "loss": 0.0765, "step": 45840 }, { "epoch": 55.307785153892574, "grad_norm": 6.928640365600586, "learning_rate": 1.9998896283443484e-05, "loss": 0.0834, "step": 45850 }, { "epoch": 55.31985515992758, "grad_norm": 7.258347511291504, "learning_rate": 1.999889604218869e-05, "loss": 0.0798, "step": 45860 }, { "epoch": 55.33192516596258, "grad_norm": 6.31530237197876, "learning_rate": 1.9998895800933896e-05, "loss": 0.0788, "step": 45870 }, { "epoch": 55.343995171997584, "grad_norm": 7.108455181121826, "learning_rate": 1.9998895559679102e-05, "loss": 0.0764, "step": 45880 }, { "epoch": 55.35606517803259, "grad_norm": 7.010908603668213, "learning_rate": 1.999889531842431e-05, 
"loss": 0.0811, "step": 45890 }, { "epoch": 55.36813518406759, "grad_norm": 6.632464408874512, "learning_rate": 1.9998895077169515e-05, "loss": 0.0831, "step": 45900 }, { "epoch": 55.38020519010259, "grad_norm": 6.590086460113525, "learning_rate": 1.999889483591472e-05, "loss": 0.081, "step": 45910 }, { "epoch": 55.3922751961376, "grad_norm": 7.644220352172852, "learning_rate": 1.9998894594659927e-05, "loss": 0.0856, "step": 45920 }, { "epoch": 55.4043452021726, "grad_norm": 7.152490615844727, "learning_rate": 1.9998894353405134e-05, "loss": 0.0829, "step": 45930 }, { "epoch": 55.4164152082076, "grad_norm": 7.086870193481445, "learning_rate": 1.999889411215034e-05, "loss": 0.0811, "step": 45940 }, { "epoch": 55.428485214242606, "grad_norm": 7.6456146240234375, "learning_rate": 1.9998893870895546e-05, "loss": 0.0833, "step": 45950 }, { "epoch": 55.44055522027761, "grad_norm": 7.124616622924805, "learning_rate": 1.9998893629640752e-05, "loss": 0.082, "step": 45960 }, { "epoch": 55.45262522631261, "grad_norm": 7.855136871337891, "learning_rate": 1.999889338838596e-05, "loss": 0.0815, "step": 45970 }, { "epoch": 55.464695232347616, "grad_norm": 6.946135520935059, "learning_rate": 1.9998893147131165e-05, "loss": 0.0841, "step": 45980 }, { "epoch": 55.47676523838262, "grad_norm": 6.876055717468262, "learning_rate": 1.999889290587637e-05, "loss": 0.084, "step": 45990 }, { "epoch": 55.48883524441762, "grad_norm": 6.853201866149902, "learning_rate": 1.9998892664621577e-05, "loss": 0.0828, "step": 46000 }, { "epoch": 55.48883524441762, "eval_loss": 11.990106582641602, "eval_runtime": 8.1337, "eval_samples_per_second": 85.693, "eval_steps_per_second": 10.819, "step": 46000 }, { "epoch": 55.500905250452625, "grad_norm": 7.0872344970703125, "learning_rate": 1.9998892423366783e-05, "loss": 0.0813, "step": 46010 }, { "epoch": 55.51297525648763, "grad_norm": 8.086292266845703, "learning_rate": 1.999889218211199e-05, "loss": 0.0823, "step": 46020 }, { "epoch": 55.52504526252263, 
"grad_norm": 6.478053092956543, "learning_rate": 1.9998891940857196e-05, "loss": 0.0846, "step": 46030 }, { "epoch": 55.537115268557635, "grad_norm": 6.737484931945801, "learning_rate": 1.9998891699602402e-05, "loss": 0.0805, "step": 46040 }, { "epoch": 55.54918527459264, "grad_norm": 6.573988437652588, "learning_rate": 1.9998891458347608e-05, "loss": 0.0817, "step": 46050 }, { "epoch": 55.56125528062764, "grad_norm": 6.989092826843262, "learning_rate": 1.9998891217092814e-05, "loss": 0.0839, "step": 46060 }, { "epoch": 55.573325286662644, "grad_norm": 6.873456954956055, "learning_rate": 1.999889097583802e-05, "loss": 0.0839, "step": 46070 }, { "epoch": 55.58539529269765, "grad_norm": 6.466777324676514, "learning_rate": 1.9998890734583223e-05, "loss": 0.0823, "step": 46080 }, { "epoch": 55.59746529873265, "grad_norm": 6.875060081481934, "learning_rate": 1.999889049332843e-05, "loss": 0.0842, "step": 46090 }, { "epoch": 55.609535304767654, "grad_norm": 6.951246738433838, "learning_rate": 1.9998890252073636e-05, "loss": 0.0846, "step": 46100 }, { "epoch": 55.62160531080266, "grad_norm": 7.413021564483643, "learning_rate": 1.9998890010818842e-05, "loss": 0.087, "step": 46110 }, { "epoch": 55.63367531683766, "grad_norm": 7.34155797958374, "learning_rate": 1.9998889769564048e-05, "loss": 0.0829, "step": 46120 }, { "epoch": 55.64574532287266, "grad_norm": 7.147198677062988, "learning_rate": 1.9998889528309254e-05, "loss": 0.0869, "step": 46130 }, { "epoch": 55.65781532890767, "grad_norm": 7.7059550285339355, "learning_rate": 1.999888928705446e-05, "loss": 0.0886, "step": 46140 }, { "epoch": 55.66988533494267, "grad_norm": 7.064149379730225, "learning_rate": 1.9998889045799667e-05, "loss": 0.0866, "step": 46150 }, { "epoch": 55.68195534097767, "grad_norm": 6.937314033508301, "learning_rate": 1.9998888804544873e-05, "loss": 0.0847, "step": 46160 }, { "epoch": 55.694025347012676, "grad_norm": 7.097194194793701, "learning_rate": 1.999888856329008e-05, "loss": 0.0853, "step": 
46170 }, { "epoch": 55.70609535304768, "grad_norm": 6.93108606338501, "learning_rate": 1.9998888322035286e-05, "loss": 0.084, "step": 46180 }, { "epoch": 55.71816535908268, "grad_norm": 7.010037422180176, "learning_rate": 1.9998888080780492e-05, "loss": 0.0856, "step": 46190 }, { "epoch": 55.730235365117686, "grad_norm": 7.061589241027832, "learning_rate": 1.9998887839525698e-05, "loss": 0.0851, "step": 46200 }, { "epoch": 55.74230537115269, "grad_norm": 7.137219429016113, "learning_rate": 1.9998887598270904e-05, "loss": 0.0845, "step": 46210 }, { "epoch": 55.754375377187685, "grad_norm": 6.762034893035889, "learning_rate": 1.999888735701611e-05, "loss": 0.0873, "step": 46220 }, { "epoch": 55.76644538322269, "grad_norm": 6.824985027313232, "learning_rate": 1.9998887115761317e-05, "loss": 0.0882, "step": 46230 }, { "epoch": 55.77851538925769, "grad_norm": 6.89382266998291, "learning_rate": 1.9998886874506523e-05, "loss": 0.0883, "step": 46240 }, { "epoch": 55.790585395292695, "grad_norm": 6.882702827453613, "learning_rate": 1.999888663325173e-05, "loss": 0.0887, "step": 46250 }, { "epoch": 55.8026554013277, "grad_norm": 7.277578353881836, "learning_rate": 1.9998886391996935e-05, "loss": 0.0888, "step": 46260 }, { "epoch": 55.8147254073627, "grad_norm": 7.4289350509643555, "learning_rate": 1.999888615074214e-05, "loss": 0.0854, "step": 46270 }, { "epoch": 55.826795413397704, "grad_norm": 7.2801923751831055, "learning_rate": 1.9998885909487348e-05, "loss": 0.0891, "step": 46280 }, { "epoch": 55.83886541943271, "grad_norm": 6.865201473236084, "learning_rate": 1.9998885668232554e-05, "loss": 0.087, "step": 46290 }, { "epoch": 55.85093542546771, "grad_norm": 7.589267253875732, "learning_rate": 1.999888542697776e-05, "loss": 0.0917, "step": 46300 }, { "epoch": 55.863005431502714, "grad_norm": 6.810831069946289, "learning_rate": 1.9998885185722966e-05, "loss": 0.0848, "step": 46310 }, { "epoch": 55.87507543753772, "grad_norm": 7.942761421203613, "learning_rate": 
1.9998884944468173e-05, "loss": 0.0917, "step": 46320 }, { "epoch": 55.88714544357272, "grad_norm": 7.063400745391846, "learning_rate": 1.999888470321338e-05, "loss": 0.0899, "step": 46330 }, { "epoch": 55.89921544960772, "grad_norm": 7.569224834442139, "learning_rate": 1.9998884461958585e-05, "loss": 0.0922, "step": 46340 }, { "epoch": 55.91128545564273, "grad_norm": 7.245153903961182, "learning_rate": 1.999888422070379e-05, "loss": 0.088, "step": 46350 }, { "epoch": 55.92335546167773, "grad_norm": 7.418308734893799, "learning_rate": 1.9998883979448997e-05, "loss": 0.0893, "step": 46360 }, { "epoch": 55.93542546771273, "grad_norm": 6.932007312774658, "learning_rate": 1.9998883738194204e-05, "loss": 0.0872, "step": 46370 }, { "epoch": 55.947495473747736, "grad_norm": 7.641567707061768, "learning_rate": 1.999888349693941e-05, "loss": 0.0904, "step": 46380 }, { "epoch": 55.95956547978274, "grad_norm": 6.7337965965271, "learning_rate": 1.9998883255684616e-05, "loss": 0.0908, "step": 46390 }, { "epoch": 55.97163548581774, "grad_norm": 6.927544116973877, "learning_rate": 1.9998883014429822e-05, "loss": 0.0893, "step": 46400 }, { "epoch": 55.983705491852746, "grad_norm": 7.055612564086914, "learning_rate": 1.999888277317503e-05, "loss": 0.0893, "step": 46410 }, { "epoch": 55.99577549788775, "grad_norm": 6.579563617706299, "learning_rate": 1.9998882531920235e-05, "loss": 0.0905, "step": 46420 }, { "epoch": 56.007242003621, "grad_norm": 6.008296489715576, "learning_rate": 1.999888229066544e-05, "loss": 0.0689, "step": 46430 }, { "epoch": 56.019312009656005, "grad_norm": 5.988351345062256, "learning_rate": 1.9998882049410647e-05, "loss": 0.0624, "step": 46440 }, { "epoch": 56.03138201569101, "grad_norm": 6.305265426635742, "learning_rate": 1.9998881808155853e-05, "loss": 0.0629, "step": 46450 }, { "epoch": 56.04345202172601, "grad_norm": 5.934441089630127, "learning_rate": 1.999888156690106e-05, "loss": 0.0668, "step": 46460 }, { "epoch": 56.055522027761015, "grad_norm": 
6.00499153137207, "learning_rate": 1.9998881325646266e-05, "loss": 0.0659, "step": 46470 }, { "epoch": 56.06759203379602, "grad_norm": 5.89204740524292, "learning_rate": 1.9998881084391472e-05, "loss": 0.0681, "step": 46480 }, { "epoch": 56.07966203983102, "grad_norm": 6.954041004180908, "learning_rate": 1.9998880843136678e-05, "loss": 0.069, "step": 46490 }, { "epoch": 56.091732045866024, "grad_norm": 5.850103378295898, "learning_rate": 1.999888060188188e-05, "loss": 0.0728, "step": 46500 }, { "epoch": 56.091732045866024, "eval_loss": 12.009778022766113, "eval_runtime": 8.1278, "eval_samples_per_second": 85.755, "eval_steps_per_second": 10.827, "step": 46500 }, { "epoch": 56.10380205190103, "grad_norm": 6.475535869598389, "learning_rate": 1.9998880360627087e-05, "loss": 0.0699, "step": 46510 }, { "epoch": 56.11587205793603, "grad_norm": 6.517557621002197, "learning_rate": 1.9998880119372293e-05, "loss": 0.0721, "step": 46520 }, { "epoch": 56.127942063971034, "grad_norm": 5.844270706176758, "learning_rate": 1.99988798781175e-05, "loss": 0.0747, "step": 46530 }, { "epoch": 56.14001207000604, "grad_norm": 5.833677291870117, "learning_rate": 1.9998879636862706e-05, "loss": 0.0694, "step": 46540 }, { "epoch": 56.15208207604104, "grad_norm": 5.9518232345581055, "learning_rate": 1.9998879395607912e-05, "loss": 0.0725, "step": 46550 }, { "epoch": 56.16415208207604, "grad_norm": 6.632372856140137, "learning_rate": 1.999887915435312e-05, "loss": 0.0728, "step": 46560 }, { "epoch": 56.17622208811105, "grad_norm": 6.205124378204346, "learning_rate": 1.9998878913098325e-05, "loss": 0.0736, "step": 46570 }, { "epoch": 56.18829209414605, "grad_norm": 6.647336959838867, "learning_rate": 1.999887867184353e-05, "loss": 0.0722, "step": 46580 }, { "epoch": 56.20036210018105, "grad_norm": 6.069302082061768, "learning_rate": 1.9998878430588737e-05, "loss": 0.0752, "step": 46590 }, { "epoch": 56.212432106216056, "grad_norm": 6.590200901031494, "learning_rate": 1.9998878189333943e-05, 
"loss": 0.0745, "step": 46600 }, { "epoch": 56.22450211225106, "grad_norm": 6.885687351226807, "learning_rate": 1.999887794807915e-05, "loss": 0.0736, "step": 46610 }, { "epoch": 56.23657211828606, "grad_norm": 7.062088489532471, "learning_rate": 1.9998877706824356e-05, "loss": 0.0739, "step": 46620 }, { "epoch": 56.248642124321066, "grad_norm": 7.015197277069092, "learning_rate": 1.9998877465569562e-05, "loss": 0.0756, "step": 46630 }, { "epoch": 56.26071213035606, "grad_norm": 6.766689300537109, "learning_rate": 1.9998877224314768e-05, "loss": 0.0764, "step": 46640 }, { "epoch": 56.272782136391065, "grad_norm": 6.780015468597412, "learning_rate": 1.9998876983059974e-05, "loss": 0.0789, "step": 46650 }, { "epoch": 56.28485214242607, "grad_norm": 6.513746738433838, "learning_rate": 1.999887674180518e-05, "loss": 0.0768, "step": 46660 }, { "epoch": 56.29692214846107, "grad_norm": 6.755357265472412, "learning_rate": 1.9998876500550387e-05, "loss": 0.0784, "step": 46670 }, { "epoch": 56.308992154496075, "grad_norm": 6.698294162750244, "learning_rate": 1.9998876259295593e-05, "loss": 0.0772, "step": 46680 }, { "epoch": 56.32106216053108, "grad_norm": 6.6621503829956055, "learning_rate": 1.99988760180408e-05, "loss": 0.0781, "step": 46690 }, { "epoch": 56.33313216656608, "grad_norm": 6.389278888702393, "learning_rate": 1.9998875776786005e-05, "loss": 0.0784, "step": 46700 }, { "epoch": 56.345202172601084, "grad_norm": 5.869836807250977, "learning_rate": 1.999887553553121e-05, "loss": 0.0783, "step": 46710 }, { "epoch": 56.35727217863609, "grad_norm": 6.821190357208252, "learning_rate": 1.9998875294276418e-05, "loss": 0.0792, "step": 46720 }, { "epoch": 56.36934218467109, "grad_norm": 6.86661958694458, "learning_rate": 1.9998875053021624e-05, "loss": 0.0781, "step": 46730 }, { "epoch": 56.381412190706094, "grad_norm": 6.682448863983154, "learning_rate": 1.999887481176683e-05, "loss": 0.0789, "step": 46740 }, { "epoch": 56.3934821967411, "grad_norm": 6.074601650238037, 
"learning_rate": 1.9998874570512033e-05, "loss": 0.0791, "step": 46750 }, { "epoch": 56.4055522027761, "grad_norm": 6.856705188751221, "learning_rate": 1.999887432925724e-05, "loss": 0.0797, "step": 46760 }, { "epoch": 56.4176222088111, "grad_norm": 7.175174236297607, "learning_rate": 1.9998874088002445e-05, "loss": 0.079, "step": 46770 }, { "epoch": 56.429692214846106, "grad_norm": 6.9286789894104, "learning_rate": 1.9998873846747652e-05, "loss": 0.0819, "step": 46780 }, { "epoch": 56.44176222088111, "grad_norm": 7.065433025360107, "learning_rate": 1.9998873605492858e-05, "loss": 0.081, "step": 46790 }, { "epoch": 56.45383222691611, "grad_norm": 6.0164899826049805, "learning_rate": 1.9998873364238064e-05, "loss": 0.081, "step": 46800 }, { "epoch": 56.465902232951116, "grad_norm": 6.138931751251221, "learning_rate": 1.999887312298327e-05, "loss": 0.0792, "step": 46810 }, { "epoch": 56.47797223898612, "grad_norm": 6.737030982971191, "learning_rate": 1.9998872881728477e-05, "loss": 0.0793, "step": 46820 }, { "epoch": 56.49004224502112, "grad_norm": 6.9888081550598145, "learning_rate": 1.9998872640473686e-05, "loss": 0.0827, "step": 46830 }, { "epoch": 56.502112251056126, "grad_norm": 7.076657772064209, "learning_rate": 1.9998872399218892e-05, "loss": 0.0799, "step": 46840 }, { "epoch": 56.51418225709113, "grad_norm": 7.101436614990234, "learning_rate": 1.99988721579641e-05, "loss": 0.0824, "step": 46850 }, { "epoch": 56.52625226312613, "grad_norm": 6.887584686279297, "learning_rate": 1.9998871916709305e-05, "loss": 0.0822, "step": 46860 }, { "epoch": 56.538322269161135, "grad_norm": 6.891708850860596, "learning_rate": 1.999887167545451e-05, "loss": 0.0821, "step": 46870 }, { "epoch": 56.55039227519614, "grad_norm": 7.065457344055176, "learning_rate": 1.9998871434199717e-05, "loss": 0.0819, "step": 46880 }, { "epoch": 56.56246228123114, "grad_norm": 6.708883762359619, "learning_rate": 1.9998871192944923e-05, "loss": 0.0811, "step": 46890 }, { "epoch": 
56.574532287266145, "grad_norm": 6.738123893737793, "learning_rate": 1.999887095169013e-05, "loss": 0.0836, "step": 46900 }, { "epoch": 56.58660229330115, "grad_norm": 7.079942226409912, "learning_rate": 1.9998870710435332e-05, "loss": 0.0858, "step": 46910 }, { "epoch": 56.59867229933615, "grad_norm": 6.967862129211426, "learning_rate": 1.999887046918054e-05, "loss": 0.0841, "step": 46920 }, { "epoch": 56.610742305371154, "grad_norm": 7.277012825012207, "learning_rate": 1.9998870227925745e-05, "loss": 0.0825, "step": 46930 }, { "epoch": 56.62281231140616, "grad_norm": 7.530037879943848, "learning_rate": 1.999886998667095e-05, "loss": 0.0836, "step": 46940 }, { "epoch": 56.63488231744116, "grad_norm": 6.448859214782715, "learning_rate": 1.9998869745416157e-05, "loss": 0.084, "step": 46950 }, { "epoch": 56.646952323476164, "grad_norm": 7.243363857269287, "learning_rate": 1.9998869504161364e-05, "loss": 0.0815, "step": 46960 }, { "epoch": 56.65902232951117, "grad_norm": 6.495532512664795, "learning_rate": 1.999886926290657e-05, "loss": 0.0803, "step": 46970 }, { "epoch": 56.67109233554617, "grad_norm": 6.942462921142578, "learning_rate": 1.9998869021651776e-05, "loss": 0.0827, "step": 46980 }, { "epoch": 56.68316234158117, "grad_norm": 6.974822044372559, "learning_rate": 1.9998868780396982e-05, "loss": 0.0817, "step": 46990 }, { "epoch": 56.69523234761618, "grad_norm": 7.323622703552246, "learning_rate": 1.999886853914219e-05, "loss": 0.0863, "step": 47000 }, { "epoch": 56.69523234761618, "eval_loss": 12.036081314086914, "eval_runtime": 8.1242, "eval_samples_per_second": 85.794, "eval_steps_per_second": 10.832, "step": 47000 }, { "epoch": 56.70730235365118, "grad_norm": 6.870702743530273, "learning_rate": 1.9998868297887395e-05, "loss": 0.0866, "step": 47010 }, { "epoch": 56.71937235968618, "grad_norm": 7.307497501373291, "learning_rate": 1.99988680566326e-05, "loss": 0.0853, "step": 47020 }, { "epoch": 56.731442365721186, "grad_norm": 6.893991947174072, 
"learning_rate": 1.9998867815377807e-05, "loss": 0.0842, "step": 47030 }, { "epoch": 56.74351237175619, "grad_norm": 6.440523624420166, "learning_rate": 1.9998867574123013e-05, "loss": 0.0827, "step": 47040 }, { "epoch": 56.755582377791185, "grad_norm": 6.88933801651001, "learning_rate": 1.999886733286822e-05, "loss": 0.0846, "step": 47050 }, { "epoch": 56.76765238382619, "grad_norm": 6.238834857940674, "learning_rate": 1.9998867091613426e-05, "loss": 0.0812, "step": 47060 }, { "epoch": 56.77972238986119, "grad_norm": 6.303887367248535, "learning_rate": 1.9998866850358632e-05, "loss": 0.0862, "step": 47070 }, { "epoch": 56.791792395896195, "grad_norm": 6.2289581298828125, "learning_rate": 1.9998866609103838e-05, "loss": 0.0854, "step": 47080 }, { "epoch": 56.8038624019312, "grad_norm": 7.142195224761963, "learning_rate": 1.9998866367849044e-05, "loss": 0.0865, "step": 47090 }, { "epoch": 56.8159324079662, "grad_norm": 6.61541223526001, "learning_rate": 1.999886612659425e-05, "loss": 0.0848, "step": 47100 }, { "epoch": 56.828002414001205, "grad_norm": 6.768244743347168, "learning_rate": 1.9998865885339457e-05, "loss": 0.0913, "step": 47110 }, { "epoch": 56.84007242003621, "grad_norm": 6.874731063842773, "learning_rate": 1.9998865644084663e-05, "loss": 0.09, "step": 47120 }, { "epoch": 56.85214242607121, "grad_norm": 7.022034168243408, "learning_rate": 1.999886540282987e-05, "loss": 0.0866, "step": 47130 }, { "epoch": 56.864212432106214, "grad_norm": 7.579927921295166, "learning_rate": 1.9998865161575075e-05, "loss": 0.0873, "step": 47140 }, { "epoch": 56.87628243814122, "grad_norm": 7.162403583526611, "learning_rate": 1.999886492032028e-05, "loss": 0.086, "step": 47150 }, { "epoch": 56.88835244417622, "grad_norm": 6.6309943199157715, "learning_rate": 1.9998864679065484e-05, "loss": 0.0867, "step": 47160 }, { "epoch": 56.900422450211224, "grad_norm": 7.3544135093688965, "learning_rate": 1.999886443781069e-05, "loss": 0.0845, "step": 47170 }, { "epoch": 
56.91249245624623, "grad_norm": 7.600104808807373, "learning_rate": 1.9998864196555897e-05, "loss": 0.0867, "step": 47180 }, { "epoch": 56.92456246228123, "grad_norm": 6.77623987197876, "learning_rate": 1.9998863955301103e-05, "loss": 0.0892, "step": 47190 }, { "epoch": 56.93663246831623, "grad_norm": 6.9741010665893555, "learning_rate": 1.999886371404631e-05, "loss": 0.0886, "step": 47200 }, { "epoch": 56.948702474351236, "grad_norm": 6.794913291931152, "learning_rate": 1.9998863472791516e-05, "loss": 0.0869, "step": 47210 }, { "epoch": 56.96077248038624, "grad_norm": 7.0547919273376465, "learning_rate": 1.9998863231536722e-05, "loss": 0.0875, "step": 47220 }, { "epoch": 56.97284248642124, "grad_norm": 7.661638259887695, "learning_rate": 1.9998862990281928e-05, "loss": 0.0914, "step": 47230 }, { "epoch": 56.984912492456246, "grad_norm": 7.78618049621582, "learning_rate": 1.9998862749027134e-05, "loss": 0.0879, "step": 47240 }, { "epoch": 56.99698249849125, "grad_norm": 7.022690773010254, "learning_rate": 1.999886250777234e-05, "loss": 0.0905, "step": 47250 }, { "epoch": 57.0084490042245, "grad_norm": 5.879391670227051, "learning_rate": 1.9998862266517547e-05, "loss": 0.0698, "step": 47260 }, { "epoch": 57.020519010259505, "grad_norm": 6.4538187980651855, "learning_rate": 1.9998862025262753e-05, "loss": 0.0616, "step": 47270 }, { "epoch": 57.03258901629451, "grad_norm": 5.957611560821533, "learning_rate": 1.999886178400796e-05, "loss": 0.0648, "step": 47280 }, { "epoch": 57.04465902232951, "grad_norm": 5.357992649078369, "learning_rate": 1.9998861542753165e-05, "loss": 0.0663, "step": 47290 }, { "epoch": 57.056729028364515, "grad_norm": 6.037294387817383, "learning_rate": 1.999886130149837e-05, "loss": 0.068, "step": 47300 }, { "epoch": 57.06879903439952, "grad_norm": 6.173604965209961, "learning_rate": 1.9998861060243578e-05, "loss": 0.0654, "step": 47310 }, { "epoch": 57.08086904043452, "grad_norm": 6.014338970184326, "learning_rate": 1.9998860818988784e-05, 
"loss": 0.0676, "step": 47320 }, { "epoch": 57.092939046469525, "grad_norm": 6.215699195861816, "learning_rate": 1.999886057773399e-05, "loss": 0.0706, "step": 47330 }, { "epoch": 57.10500905250453, "grad_norm": 6.38226842880249, "learning_rate": 1.9998860336479196e-05, "loss": 0.0713, "step": 47340 }, { "epoch": 57.11707905853953, "grad_norm": 6.320430755615234, "learning_rate": 1.9998860095224403e-05, "loss": 0.0723, "step": 47350 }, { "epoch": 57.129149064574534, "grad_norm": 6.566775321960449, "learning_rate": 1.999885985396961e-05, "loss": 0.0721, "step": 47360 }, { "epoch": 57.14121907060954, "grad_norm": 6.125016689300537, "learning_rate": 1.9998859612714815e-05, "loss": 0.072, "step": 47370 }, { "epoch": 57.15328907664454, "grad_norm": 6.204831123352051, "learning_rate": 1.999885937146002e-05, "loss": 0.0732, "step": 47380 }, { "epoch": 57.165359082679544, "grad_norm": 6.724245071411133, "learning_rate": 1.9998859130205227e-05, "loss": 0.073, "step": 47390 }, { "epoch": 57.17742908871455, "grad_norm": 6.290757656097412, "learning_rate": 1.9998858888950434e-05, "loss": 0.0723, "step": 47400 }, { "epoch": 57.18949909474955, "grad_norm": 6.403512001037598, "learning_rate": 1.999885864769564e-05, "loss": 0.0748, "step": 47410 }, { "epoch": 57.20156910078455, "grad_norm": 7.0052995681762695, "learning_rate": 1.9998858406440846e-05, "loss": 0.0741, "step": 47420 }, { "epoch": 57.213639106819556, "grad_norm": 6.725490093231201, "learning_rate": 1.9998858165186052e-05, "loss": 0.0745, "step": 47430 }, { "epoch": 57.22570911285456, "grad_norm": 6.458691120147705, "learning_rate": 1.999885792393126e-05, "loss": 0.0768, "step": 47440 }, { "epoch": 57.23777911888956, "grad_norm": 6.382293701171875, "learning_rate": 1.9998857682676465e-05, "loss": 0.075, "step": 47450 }, { "epoch": 57.249849124924566, "grad_norm": 6.930854320526123, "learning_rate": 1.999885744142167e-05, "loss": 0.0754, "step": 47460 }, { "epoch": 57.26191913095956, "grad_norm": 7.21970796585083, 
"learning_rate": 1.9998857200166877e-05, "loss": 0.0746, "step": 47470 }, { "epoch": 57.273989136994565, "grad_norm": 6.38486385345459, "learning_rate": 1.9998856958912083e-05, "loss": 0.0737, "step": 47480 }, { "epoch": 57.28605914302957, "grad_norm": 6.267906188964844, "learning_rate": 1.999885671765729e-05, "loss": 0.0751, "step": 47490 }, { "epoch": 57.29812914906457, "grad_norm": 6.188350200653076, "learning_rate": 1.9998856476402496e-05, "loss": 0.0746, "step": 47500 }, { "epoch": 57.29812914906457, "eval_loss": 12.047042846679688, "eval_runtime": 8.1523, "eval_samples_per_second": 85.497, "eval_steps_per_second": 10.794, "step": 47500 }, { "epoch": 57.310199155099575, "grad_norm": 6.614111423492432, "learning_rate": 1.9998856235147702e-05, "loss": 0.0754, "step": 47510 }, { "epoch": 57.32226916113458, "grad_norm": 6.472341537475586, "learning_rate": 1.9998855993892908e-05, "loss": 0.0746, "step": 47520 }, { "epoch": 57.33433916716958, "grad_norm": 6.155925750732422, "learning_rate": 1.9998855752638114e-05, "loss": 0.0759, "step": 47530 }, { "epoch": 57.346409173204584, "grad_norm": 6.794980525970459, "learning_rate": 1.999885551138332e-05, "loss": 0.0744, "step": 47540 }, { "epoch": 57.35847917923959, "grad_norm": 6.20490026473999, "learning_rate": 1.9998855270128527e-05, "loss": 0.0751, "step": 47550 }, { "epoch": 57.37054918527459, "grad_norm": 5.7641987800598145, "learning_rate": 1.9998855028873733e-05, "loss": 0.074, "step": 47560 }, { "epoch": 57.382619191309594, "grad_norm": 7.1773481369018555, "learning_rate": 1.9998854787618936e-05, "loss": 0.0764, "step": 47570 }, { "epoch": 57.3946891973446, "grad_norm": 6.4199066162109375, "learning_rate": 1.9998854546364142e-05, "loss": 0.0787, "step": 47580 }, { "epoch": 57.4067592033796, "grad_norm": 7.049473285675049, "learning_rate": 1.999885430510935e-05, "loss": 0.0769, "step": 47590 }, { "epoch": 57.418829209414604, "grad_norm": 7.214880466461182, "learning_rate": 1.9998854063854555e-05, "loss": 0.08, 
"step": 47600 }, { "epoch": 57.43089921544961, "grad_norm": 6.801029682159424, "learning_rate": 1.999885382259976e-05, "loss": 0.0788, "step": 47610 }, { "epoch": 57.44296922148461, "grad_norm": 6.094153881072998, "learning_rate": 1.9998853581344967e-05, "loss": 0.0808, "step": 47620 }, { "epoch": 57.45503922751961, "grad_norm": 6.250072479248047, "learning_rate": 1.9998853340090173e-05, "loss": 0.08, "step": 47630 }, { "epoch": 57.467109233554616, "grad_norm": 7.159348011016846, "learning_rate": 1.999885309883538e-05, "loss": 0.077, "step": 47640 }, { "epoch": 57.47917923958962, "grad_norm": 6.671236038208008, "learning_rate": 1.9998852857580586e-05, "loss": 0.0805, "step": 47650 }, { "epoch": 57.49124924562462, "grad_norm": 6.7146897315979, "learning_rate": 1.9998852616325792e-05, "loss": 0.0798, "step": 47660 }, { "epoch": 57.503319251659626, "grad_norm": 6.947461128234863, "learning_rate": 1.9998852375070998e-05, "loss": 0.0843, "step": 47670 }, { "epoch": 57.51538925769463, "grad_norm": 6.806591033935547, "learning_rate": 1.9998852133816204e-05, "loss": 0.0814, "step": 47680 }, { "epoch": 57.52745926372963, "grad_norm": 6.615697860717773, "learning_rate": 1.999885189256141e-05, "loss": 0.0801, "step": 47690 }, { "epoch": 57.539529269764635, "grad_norm": 6.54819393157959, "learning_rate": 1.9998851651306617e-05, "loss": 0.0808, "step": 47700 }, { "epoch": 57.55159927579964, "grad_norm": 6.3051018714904785, "learning_rate": 1.9998851410051823e-05, "loss": 0.0771, "step": 47710 }, { "epoch": 57.56366928183464, "grad_norm": 6.862579345703125, "learning_rate": 1.999885116879703e-05, "loss": 0.0804, "step": 47720 }, { "epoch": 57.575739287869645, "grad_norm": 6.550202369689941, "learning_rate": 1.9998850927542235e-05, "loss": 0.0807, "step": 47730 }, { "epoch": 57.58780929390465, "grad_norm": 6.725715637207031, "learning_rate": 1.999885068628744e-05, "loss": 0.0833, "step": 47740 }, { "epoch": 57.59987929993965, "grad_norm": 7.00395393371582, "learning_rate": 
1.9998850445032648e-05, "loss": 0.0817, "step": 47750 }, { "epoch": 57.611949305974655, "grad_norm": 7.28579568862915, "learning_rate": 1.9998850203777854e-05, "loss": 0.0819, "step": 47760 }, { "epoch": 57.62401931200966, "grad_norm": 7.173821449279785, "learning_rate": 1.999884996252306e-05, "loss": 0.0816, "step": 47770 }, { "epoch": 57.63608931804466, "grad_norm": 6.719356060028076, "learning_rate": 1.9998849721268266e-05, "loss": 0.0825, "step": 47780 }, { "epoch": 57.648159324079664, "grad_norm": 6.767759799957275, "learning_rate": 1.9998849480013473e-05, "loss": 0.0828, "step": 47790 }, { "epoch": 57.66022933011467, "grad_norm": 6.655294418334961, "learning_rate": 1.999884923875868e-05, "loss": 0.0841, "step": 47800 }, { "epoch": 57.67229933614967, "grad_norm": 6.5449700355529785, "learning_rate": 1.9998848997503885e-05, "loss": 0.0818, "step": 47810 }, { "epoch": 57.684369342184674, "grad_norm": 7.181440353393555, "learning_rate": 1.9998848756249088e-05, "loss": 0.0852, "step": 47820 }, { "epoch": 57.69643934821968, "grad_norm": 6.515029430389404, "learning_rate": 1.9998848514994294e-05, "loss": 0.0807, "step": 47830 }, { "epoch": 57.70850935425468, "grad_norm": 7.434506893157959, "learning_rate": 1.99988482737395e-05, "loss": 0.0825, "step": 47840 }, { "epoch": 57.72057936028968, "grad_norm": 7.199892044067383, "learning_rate": 1.9998848032484707e-05, "loss": 0.0856, "step": 47850 }, { "epoch": 57.73264936632469, "grad_norm": 6.515588283538818, "learning_rate": 1.9998847791229913e-05, "loss": 0.0834, "step": 47860 }, { "epoch": 57.74471937235969, "grad_norm": 6.402098655700684, "learning_rate": 1.999884754997512e-05, "loss": 0.0829, "step": 47870 }, { "epoch": 57.756789378394686, "grad_norm": 7.133780479431152, "learning_rate": 1.9998847308720325e-05, "loss": 0.084, "step": 47880 }, { "epoch": 57.76885938442969, "grad_norm": 6.520052909851074, "learning_rate": 1.999884706746553e-05, "loss": 0.0828, "step": 47890 }, { "epoch": 57.78092939046469, 
"grad_norm": 7.0803070068359375, "learning_rate": 1.9998846826210738e-05, "loss": 0.0848, "step": 47900 }, { "epoch": 57.792999396499695, "grad_norm": 7.311006546020508, "learning_rate": 1.9998846584955947e-05, "loss": 0.0828, "step": 47910 }, { "epoch": 57.8050694025347, "grad_norm": 7.478081226348877, "learning_rate": 1.9998846343701153e-05, "loss": 0.0845, "step": 47920 }, { "epoch": 57.8171394085697, "grad_norm": 6.270883560180664, "learning_rate": 1.999884610244636e-05, "loss": 0.0822, "step": 47930 }, { "epoch": 57.829209414604705, "grad_norm": 7.425621032714844, "learning_rate": 1.9998845861191566e-05, "loss": 0.0831, "step": 47940 }, { "epoch": 57.84127942063971, "grad_norm": 7.238455772399902, "learning_rate": 1.9998845619936772e-05, "loss": 0.0821, "step": 47950 }, { "epoch": 57.85334942667471, "grad_norm": 6.897167682647705, "learning_rate": 1.999884537868198e-05, "loss": 0.0866, "step": 47960 }, { "epoch": 57.865419432709714, "grad_norm": 6.330563068389893, "learning_rate": 1.9998845137427185e-05, "loss": 0.0861, "step": 47970 }, { "epoch": 57.87748943874472, "grad_norm": 7.1732964515686035, "learning_rate": 1.999884489617239e-05, "loss": 0.0864, "step": 47980 }, { "epoch": 57.88955944477972, "grad_norm": 6.9474101066589355, "learning_rate": 1.9998844654917594e-05, "loss": 0.0832, "step": 47990 }, { "epoch": 57.901629450814724, "grad_norm": 6.987394332885742, "learning_rate": 1.99988444136628e-05, "loss": 0.0867, "step": 48000 }, { "epoch": 57.901629450814724, "eval_loss": 12.05471134185791, "eval_runtime": 8.1379, "eval_samples_per_second": 85.649, "eval_steps_per_second": 10.814, "step": 48000 }, { "epoch": 57.91369945684973, "grad_norm": 6.585873603820801, "learning_rate": 1.9998844172408006e-05, "loss": 0.0865, "step": 48010 }, { "epoch": 57.92576946288473, "grad_norm": 6.927563190460205, "learning_rate": 1.9998843931153212e-05, "loss": 0.0874, "step": 48020 }, { "epoch": 57.937839468919734, "grad_norm": 7.411097049713135, "learning_rate": 
1.999884368989842e-05, "loss": 0.0849, "step": 48030 }, { "epoch": 57.94990947495474, "grad_norm": 7.371321201324463, "learning_rate": 1.9998843448643625e-05, "loss": 0.0872, "step": 48040 }, { "epoch": 57.96197948098974, "grad_norm": 7.71838903427124, "learning_rate": 1.999884320738883e-05, "loss": 0.0867, "step": 48050 }, { "epoch": 57.97404948702474, "grad_norm": 7.437282562255859, "learning_rate": 1.9998842966134037e-05, "loss": 0.0891, "step": 48060 }, { "epoch": 57.986119493059746, "grad_norm": 7.127756118774414, "learning_rate": 1.9998842724879243e-05, "loss": 0.0834, "step": 48070 }, { "epoch": 57.99818949909475, "grad_norm": 6.9979248046875, "learning_rate": 1.999884248362445e-05, "loss": 0.0898, "step": 48080 }, { "epoch": 58.009656004828, "grad_norm": 5.819109916687012, "learning_rate": 1.9998842242369656e-05, "loss": 0.0659, "step": 48090 }, { "epoch": 58.021726010863006, "grad_norm": 6.479397296905518, "learning_rate": 1.9998842001114862e-05, "loss": 0.0593, "step": 48100 }, { "epoch": 58.03379601689801, "grad_norm": 5.517875671386719, "learning_rate": 1.9998841759860068e-05, "loss": 0.066, "step": 48110 }, { "epoch": 58.04586602293301, "grad_norm": 5.648434638977051, "learning_rate": 1.9998841518605274e-05, "loss": 0.0615, "step": 48120 }, { "epoch": 58.057936028968015, "grad_norm": 5.345137596130371, "learning_rate": 1.999884127735048e-05, "loss": 0.0644, "step": 48130 }, { "epoch": 58.07000603500302, "grad_norm": 5.457415580749512, "learning_rate": 1.9998841036095687e-05, "loss": 0.068, "step": 48140 }, { "epoch": 58.08207604103802, "grad_norm": 6.295966148376465, "learning_rate": 1.9998840794840893e-05, "loss": 0.0662, "step": 48150 }, { "epoch": 58.094146047073025, "grad_norm": 5.982841968536377, "learning_rate": 1.99988405535861e-05, "loss": 0.0674, "step": 48160 }, { "epoch": 58.10621605310803, "grad_norm": 6.0876665115356445, "learning_rate": 1.9998840312331305e-05, "loss": 0.0635, "step": 48170 }, { "epoch": 58.11828605914303, "grad_norm": 
6.173526763916016, "learning_rate": 1.999884007107651e-05, "loss": 0.069, "step": 48180 }, { "epoch": 58.130356065178034, "grad_norm": 6.189559459686279, "learning_rate": 1.9998839829821718e-05, "loss": 0.0687, "step": 48190 }, { "epoch": 58.14242607121304, "grad_norm": 6.708794116973877, "learning_rate": 1.9998839588566924e-05, "loss": 0.0699, "step": 48200 }, { "epoch": 58.15449607724804, "grad_norm": 6.214231014251709, "learning_rate": 1.999883934731213e-05, "loss": 0.0696, "step": 48210 }, { "epoch": 58.166566083283044, "grad_norm": 6.1660661697387695, "learning_rate": 1.9998839106057337e-05, "loss": 0.0697, "step": 48220 }, { "epoch": 58.17863608931805, "grad_norm": 6.196230888366699, "learning_rate": 1.9998838864802543e-05, "loss": 0.0707, "step": 48230 }, { "epoch": 58.19070609535305, "grad_norm": 6.262844085693359, "learning_rate": 1.9998838623547746e-05, "loss": 0.0692, "step": 48240 }, { "epoch": 58.202776101388054, "grad_norm": 6.492417335510254, "learning_rate": 1.9998838382292952e-05, "loss": 0.0712, "step": 48250 }, { "epoch": 58.21484610742306, "grad_norm": 6.752836227416992, "learning_rate": 1.9998838141038158e-05, "loss": 0.0701, "step": 48260 }, { "epoch": 58.22691611345806, "grad_norm": 6.2872796058654785, "learning_rate": 1.9998837899783364e-05, "loss": 0.0707, "step": 48270 }, { "epoch": 58.23898611949306, "grad_norm": 7.012877464294434, "learning_rate": 1.999883765852857e-05, "loss": 0.0731, "step": 48280 }, { "epoch": 58.251056125528066, "grad_norm": 5.608834266662598, "learning_rate": 1.9998837417273777e-05, "loss": 0.0746, "step": 48290 }, { "epoch": 58.26312613156306, "grad_norm": 5.7780680656433105, "learning_rate": 1.9998837176018983e-05, "loss": 0.0706, "step": 48300 }, { "epoch": 58.275196137598066, "grad_norm": 6.437652111053467, "learning_rate": 1.999883693476419e-05, "loss": 0.0754, "step": 48310 }, { "epoch": 58.28726614363307, "grad_norm": 6.733063697814941, "learning_rate": 1.9998836693509395e-05, "loss": 0.0748, "step": 48320 }, 
{ "epoch": 58.29933614966807, "grad_norm": 6.482746601104736, "learning_rate": 1.99988364522546e-05, "loss": 0.0711, "step": 48330 }, { "epoch": 58.311406155703075, "grad_norm": 6.4365129470825195, "learning_rate": 1.9998836210999808e-05, "loss": 0.0757, "step": 48340 }, { "epoch": 58.32347616173808, "grad_norm": 6.866331577301025, "learning_rate": 1.9998835969745014e-05, "loss": 0.0763, "step": 48350 }, { "epoch": 58.33554616777308, "grad_norm": 6.8773884773254395, "learning_rate": 1.999883572849022e-05, "loss": 0.076, "step": 48360 }, { "epoch": 58.347616173808085, "grad_norm": 6.987476825714111, "learning_rate": 1.9998835487235426e-05, "loss": 0.0804, "step": 48370 }, { "epoch": 58.35968617984309, "grad_norm": 6.215305805206299, "learning_rate": 1.9998835245980633e-05, "loss": 0.0745, "step": 48380 }, { "epoch": 58.37175618587809, "grad_norm": 6.467689514160156, "learning_rate": 1.999883500472584e-05, "loss": 0.0746, "step": 48390 }, { "epoch": 58.383826191913094, "grad_norm": 6.476369380950928, "learning_rate": 1.9998834763471045e-05, "loss": 0.0768, "step": 48400 }, { "epoch": 58.3958961979481, "grad_norm": 5.7465314865112305, "learning_rate": 1.999883452221625e-05, "loss": 0.078, "step": 48410 }, { "epoch": 58.4079662039831, "grad_norm": 6.188863754272461, "learning_rate": 1.9998834280961457e-05, "loss": 0.0748, "step": 48420 }, { "epoch": 58.420036210018104, "grad_norm": 5.923492908477783, "learning_rate": 1.9998834039706664e-05, "loss": 0.073, "step": 48430 }, { "epoch": 58.43210621605311, "grad_norm": 7.018369197845459, "learning_rate": 1.999883379845187e-05, "loss": 0.0789, "step": 48440 }, { "epoch": 58.44417622208811, "grad_norm": 6.459028720855713, "learning_rate": 1.9998833557197076e-05, "loss": 0.0802, "step": 48450 }, { "epoch": 58.45624622812311, "grad_norm": 6.544118404388428, "learning_rate": 1.9998833315942282e-05, "loss": 0.0801, "step": 48460 }, { "epoch": 58.46831623415812, "grad_norm": 6.707405090332031, "learning_rate": 
1.999883307468749e-05, "loss": 0.0766, "step": 48470 }, { "epoch": 58.48038624019312, "grad_norm": 6.194316864013672, "learning_rate": 1.9998832833432695e-05, "loss": 0.0756, "step": 48480 }, { "epoch": 58.49245624622812, "grad_norm": 6.2134222984313965, "learning_rate": 1.99988325921779e-05, "loss": 0.0781, "step": 48490 }, { "epoch": 58.504526252263126, "grad_norm": 6.354086399078369, "learning_rate": 1.9998832350923107e-05, "loss": 0.0795, "step": 48500 }, { "epoch": 58.504526252263126, "eval_loss": 12.06265926361084, "eval_runtime": 8.1398, "eval_samples_per_second": 85.628, "eval_steps_per_second": 10.811, "step": 48500 }, { "epoch": 58.51659625829813, "grad_norm": 7.113185405731201, "learning_rate": 1.9998832109668313e-05, "loss": 0.0796, "step": 48510 }, { "epoch": 58.52866626433313, "grad_norm": 7.165708065032959, "learning_rate": 1.999883186841352e-05, "loss": 0.0785, "step": 48520 }, { "epoch": 58.540736270368136, "grad_norm": 6.273062229156494, "learning_rate": 1.9998831627158726e-05, "loss": 0.0788, "step": 48530 }, { "epoch": 58.55280627640314, "grad_norm": 6.39658260345459, "learning_rate": 1.9998831385903932e-05, "loss": 0.0801, "step": 48540 }, { "epoch": 58.56487628243814, "grad_norm": 6.541940212249756, "learning_rate": 1.9998831144649138e-05, "loss": 0.0777, "step": 48550 }, { "epoch": 58.576946288473145, "grad_norm": 6.595402240753174, "learning_rate": 1.9998830903394344e-05, "loss": 0.078, "step": 48560 }, { "epoch": 58.58901629450815, "grad_norm": 6.078832149505615, "learning_rate": 1.999883066213955e-05, "loss": 0.0821, "step": 48570 }, { "epoch": 58.60108630054315, "grad_norm": 6.669593334197998, "learning_rate": 1.9998830420884757e-05, "loss": 0.081, "step": 48580 }, { "epoch": 58.613156306578155, "grad_norm": 6.373267650604248, "learning_rate": 1.9998830179629963e-05, "loss": 0.0769, "step": 48590 }, { "epoch": 58.62522631261316, "grad_norm": 7.306614398956299, "learning_rate": 1.999882993837517e-05, "loss": 0.0795, "step": 48600 }, { 
"epoch": 58.63729631864816, "grad_norm": 6.372115612030029, "learning_rate": 1.9998829697120376e-05, "loss": 0.081, "step": 48610 }, { "epoch": 58.649366324683164, "grad_norm": 6.913067817687988, "learning_rate": 1.9998829455865582e-05, "loss": 0.0816, "step": 48620 }, { "epoch": 58.66143633071817, "grad_norm": 6.942342758178711, "learning_rate": 1.9998829214610788e-05, "loss": 0.0822, "step": 48630 }, { "epoch": 58.67350633675317, "grad_norm": 6.708662033081055, "learning_rate": 1.9998828973355994e-05, "loss": 0.0832, "step": 48640 }, { "epoch": 58.685576342788174, "grad_norm": 6.4877610206604, "learning_rate": 1.9998828732101197e-05, "loss": 0.0824, "step": 48650 }, { "epoch": 58.69764634882318, "grad_norm": 7.038614749908447, "learning_rate": 1.9998828490846403e-05, "loss": 0.0806, "step": 48660 }, { "epoch": 58.70971635485818, "grad_norm": 6.439931392669678, "learning_rate": 1.999882824959161e-05, "loss": 0.0808, "step": 48670 }, { "epoch": 58.721786360893184, "grad_norm": 7.180122375488281, "learning_rate": 1.9998828008336816e-05, "loss": 0.0819, "step": 48680 }, { "epoch": 58.73385636692819, "grad_norm": 7.105950832366943, "learning_rate": 1.9998827767082022e-05, "loss": 0.0833, "step": 48690 }, { "epoch": 58.74592637296319, "grad_norm": 7.070135593414307, "learning_rate": 1.9998827525827228e-05, "loss": 0.0808, "step": 48700 }, { "epoch": 58.757996378998186, "grad_norm": 6.524704456329346, "learning_rate": 1.9998827284572434e-05, "loss": 0.0794, "step": 48710 }, { "epoch": 58.77006638503319, "grad_norm": 6.191720962524414, "learning_rate": 1.999882704331764e-05, "loss": 0.0797, "step": 48720 }, { "epoch": 58.78213639106819, "grad_norm": 6.500565052032471, "learning_rate": 1.9998826802062847e-05, "loss": 0.0804, "step": 48730 }, { "epoch": 58.794206397103196, "grad_norm": 7.08833122253418, "learning_rate": 1.9998826560808053e-05, "loss": 0.0838, "step": 48740 }, { "epoch": 58.8062764031382, "grad_norm": 7.329761505126953, "learning_rate": 
1.999882631955326e-05, "loss": 0.0822, "step": 48750 }, { "epoch": 58.8183464091732, "grad_norm": 7.27813196182251, "learning_rate": 1.9998826078298465e-05, "loss": 0.0801, "step": 48760 }, { "epoch": 58.830416415208205, "grad_norm": 6.701543807983398, "learning_rate": 1.999882583704367e-05, "loss": 0.0855, "step": 48770 }, { "epoch": 58.84248642124321, "grad_norm": 7.643983840942383, "learning_rate": 1.9998825595788878e-05, "loss": 0.0826, "step": 48780 }, { "epoch": 58.85455642727821, "grad_norm": 6.674863815307617, "learning_rate": 1.9998825354534084e-05, "loss": 0.0841, "step": 48790 }, { "epoch": 58.866626433313215, "grad_norm": 6.624147415161133, "learning_rate": 1.999882511327929e-05, "loss": 0.0848, "step": 48800 }, { "epoch": 58.87869643934822, "grad_norm": 7.146792411804199, "learning_rate": 1.9998824872024496e-05, "loss": 0.082, "step": 48810 }, { "epoch": 58.89076644538322, "grad_norm": 7.26441764831543, "learning_rate": 1.9998824630769703e-05, "loss": 0.0822, "step": 48820 }, { "epoch": 58.902836451418224, "grad_norm": 7.128889083862305, "learning_rate": 1.999882438951491e-05, "loss": 0.0853, "step": 48830 }, { "epoch": 58.91490645745323, "grad_norm": 7.650243282318115, "learning_rate": 1.9998824148260115e-05, "loss": 0.0853, "step": 48840 }, { "epoch": 58.92697646348823, "grad_norm": 6.653583526611328, "learning_rate": 1.999882390700532e-05, "loss": 0.0868, "step": 48850 }, { "epoch": 58.939046469523234, "grad_norm": 6.765328407287598, "learning_rate": 1.9998823665750528e-05, "loss": 0.0834, "step": 48860 }, { "epoch": 58.95111647555824, "grad_norm": 6.69940710067749, "learning_rate": 1.9998823424495734e-05, "loss": 0.0836, "step": 48870 }, { "epoch": 58.96318648159324, "grad_norm": 6.5928192138671875, "learning_rate": 1.999882318324094e-05, "loss": 0.0869, "step": 48880 }, { "epoch": 58.97525648762824, "grad_norm": 6.428217887878418, "learning_rate": 1.9998822941986146e-05, "loss": 0.0831, "step": 48890 }, { "epoch": 58.98732649366325, "grad_norm": 
6.512378692626953, "learning_rate": 1.999882270073135e-05, "loss": 0.0854, "step": 48900 }, { "epoch": 58.99939649969825, "grad_norm": 6.546364784240723, "learning_rate": 1.9998822459476555e-05, "loss": 0.0855, "step": 48910 }, { "epoch": 59.0108630054315, "grad_norm": 6.2703351974487305, "learning_rate": 1.999882221822176e-05, "loss": 0.0613, "step": 48920 }, { "epoch": 59.022933011466506, "grad_norm": 6.040076732635498, "learning_rate": 1.9998821976966968e-05, "loss": 0.0613, "step": 48930 }, { "epoch": 59.03500301750151, "grad_norm": 5.643435478210449, "learning_rate": 1.9998821735712174e-05, "loss": 0.0621, "step": 48940 }, { "epoch": 59.04707302353651, "grad_norm": 6.00700569152832, "learning_rate": 1.999882149445738e-05, "loss": 0.0597, "step": 48950 }, { "epoch": 59.059143029571516, "grad_norm": 6.3729329109191895, "learning_rate": 1.9998821253202586e-05, "loss": 0.0618, "step": 48960 }, { "epoch": 59.07121303560652, "grad_norm": 6.376018524169922, "learning_rate": 1.9998821011947793e-05, "loss": 0.0656, "step": 48970 }, { "epoch": 59.08328304164152, "grad_norm": 6.146434307098389, "learning_rate": 1.9998820770693e-05, "loss": 0.0662, "step": 48980 }, { "epoch": 59.095353047676525, "grad_norm": 5.5965986251831055, "learning_rate": 1.999882052943821e-05, "loss": 0.0671, "step": 48990 }, { "epoch": 59.10742305371153, "grad_norm": 6.100488185882568, "learning_rate": 1.9998820288183415e-05, "loss": 0.0676, "step": 49000 }, { "epoch": 59.10742305371153, "eval_loss": 12.076872825622559, "eval_runtime": 8.1233, "eval_samples_per_second": 85.803, "eval_steps_per_second": 10.833, "step": 49000 }, { "epoch": 59.11949305974653, "grad_norm": 6.14398193359375, "learning_rate": 1.999882004692862e-05, "loss": 0.0687, "step": 49010 }, { "epoch": 59.131563065781535, "grad_norm": 6.4818115234375, "learning_rate": 1.9998819805673827e-05, "loss": 0.0677, "step": 49020 }, { "epoch": 59.14363307181654, "grad_norm": 6.0789618492126465, "learning_rate": 1.9998819564419033e-05, 
"loss": 0.0704, "step": 49030 }, { "epoch": 59.15570307785154, "grad_norm": 6.276764392852783, "learning_rate": 1.999881932316424e-05, "loss": 0.0685, "step": 49040 }, { "epoch": 59.167773083886544, "grad_norm": 6.145805835723877, "learning_rate": 1.9998819081909446e-05, "loss": 0.0687, "step": 49050 }, { "epoch": 59.17984308992155, "grad_norm": 6.347751140594482, "learning_rate": 1.9998818840654652e-05, "loss": 0.0712, "step": 49060 }, { "epoch": 59.19191309595655, "grad_norm": 6.360344409942627, "learning_rate": 1.9998818599399855e-05, "loss": 0.0732, "step": 49070 }, { "epoch": 59.203983101991554, "grad_norm": 6.420252323150635, "learning_rate": 1.999881835814506e-05, "loss": 0.0708, "step": 49080 }, { "epoch": 59.21605310802656, "grad_norm": 6.283386707305908, "learning_rate": 1.9998818116890267e-05, "loss": 0.0715, "step": 49090 }, { "epoch": 59.22812311406156, "grad_norm": 5.944460391998291, "learning_rate": 1.9998817875635473e-05, "loss": 0.0709, "step": 49100 }, { "epoch": 59.24019312009656, "grad_norm": 6.255035400390625, "learning_rate": 1.999881763438068e-05, "loss": 0.0702, "step": 49110 }, { "epoch": 59.25226312613156, "grad_norm": 6.422883987426758, "learning_rate": 1.9998817393125886e-05, "loss": 0.0733, "step": 49120 }, { "epoch": 59.26433313216656, "grad_norm": 5.847757816314697, "learning_rate": 1.9998817151871092e-05, "loss": 0.069, "step": 49130 }, { "epoch": 59.276403138201566, "grad_norm": 6.699558734893799, "learning_rate": 1.9998816910616298e-05, "loss": 0.0729, "step": 49140 }, { "epoch": 59.28847314423657, "grad_norm": 6.051847457885742, "learning_rate": 1.9998816669361504e-05, "loss": 0.0735, "step": 49150 }, { "epoch": 59.30054315027157, "grad_norm": 6.731078624725342, "learning_rate": 1.999881642810671e-05, "loss": 0.0745, "step": 49160 }, { "epoch": 59.312613156306575, "grad_norm": 6.4356889724731445, "learning_rate": 1.9998816186851917e-05, "loss": 0.0735, "step": 49170 }, { "epoch": 59.32468316234158, "grad_norm": 6.281242370605469, 
"learning_rate": 1.9998815945597123e-05, "loss": 0.073, "step": 49180 }, { "epoch": 59.33675316837658, "grad_norm": 7.532817840576172, "learning_rate": 1.999881570434233e-05, "loss": 0.0749, "step": 49190 }, { "epoch": 59.348823174411585, "grad_norm": 6.807511329650879, "learning_rate": 1.9998815463087535e-05, "loss": 0.073, "step": 49200 }, { "epoch": 59.36089318044659, "grad_norm": 6.152385234832764, "learning_rate": 1.9998815221832742e-05, "loss": 0.0753, "step": 49210 }, { "epoch": 59.37296318648159, "grad_norm": 6.547994613647461, "learning_rate": 1.9998814980577948e-05, "loss": 0.0746, "step": 49220 }, { "epoch": 59.385033192516595, "grad_norm": 6.08713960647583, "learning_rate": 1.9998814739323154e-05, "loss": 0.077, "step": 49230 }, { "epoch": 59.3971031985516, "grad_norm": 6.351731300354004, "learning_rate": 1.999881449806836e-05, "loss": 0.077, "step": 49240 }, { "epoch": 59.4091732045866, "grad_norm": 6.172459602355957, "learning_rate": 1.9998814256813567e-05, "loss": 0.0758, "step": 49250 }, { "epoch": 59.421243210621604, "grad_norm": 6.52259635925293, "learning_rate": 1.9998814015558773e-05, "loss": 0.073, "step": 49260 }, { "epoch": 59.43331321665661, "grad_norm": 6.736072540283203, "learning_rate": 1.999881377430398e-05, "loss": 0.0734, "step": 49270 }, { "epoch": 59.44538322269161, "grad_norm": 6.507744312286377, "learning_rate": 1.9998813533049185e-05, "loss": 0.0777, "step": 49280 }, { "epoch": 59.457453228726614, "grad_norm": 7.035240650177002, "learning_rate": 1.999881329179439e-05, "loss": 0.0758, "step": 49290 }, { "epoch": 59.46952323476162, "grad_norm": 7.3035054206848145, "learning_rate": 1.9998813050539598e-05, "loss": 0.0786, "step": 49300 }, { "epoch": 59.48159324079662, "grad_norm": 6.565069198608398, "learning_rate": 1.9998812809284804e-05, "loss": 0.0779, "step": 49310 }, { "epoch": 59.49366324683162, "grad_norm": 6.5182623863220215, "learning_rate": 1.9998812568030007e-05, "loss": 0.0777, "step": 49320 }, { "epoch": 
59.50573325286663, "grad_norm": 5.965514183044434, "learning_rate": 1.9998812326775213e-05, "loss": 0.0755, "step": 49330 }, { "epoch": 59.51780325890163, "grad_norm": 6.236616134643555, "learning_rate": 1.999881208552042e-05, "loss": 0.0753, "step": 49340 }, { "epoch": 59.52987326493663, "grad_norm": 6.300272464752197, "learning_rate": 1.9998811844265625e-05, "loss": 0.0761, "step": 49350 }, { "epoch": 59.541943270971636, "grad_norm": 5.953551292419434, "learning_rate": 1.999881160301083e-05, "loss": 0.0772, "step": 49360 }, { "epoch": 59.55401327700664, "grad_norm": 6.368104457855225, "learning_rate": 1.9998811361756038e-05, "loss": 0.0777, "step": 49370 }, { "epoch": 59.56608328304164, "grad_norm": 7.167659282684326, "learning_rate": 1.9998811120501244e-05, "loss": 0.0758, "step": 49380 }, { "epoch": 59.578153289076646, "grad_norm": 5.920461177825928, "learning_rate": 1.999881087924645e-05, "loss": 0.0763, "step": 49390 }, { "epoch": 59.59022329511165, "grad_norm": 6.394115924835205, "learning_rate": 1.9998810637991656e-05, "loss": 0.0776, "step": 49400 }, { "epoch": 59.60229330114665, "grad_norm": 6.116269588470459, "learning_rate": 1.9998810396736863e-05, "loss": 0.0763, "step": 49410 }, { "epoch": 59.614363307181655, "grad_norm": 5.985475540161133, "learning_rate": 1.999881015548207e-05, "loss": 0.075, "step": 49420 }, { "epoch": 59.62643331321666, "grad_norm": 7.007053852081299, "learning_rate": 1.9998809914227275e-05, "loss": 0.0808, "step": 49430 }, { "epoch": 59.63850331925166, "grad_norm": 6.246617317199707, "learning_rate": 1.999880967297248e-05, "loss": 0.0801, "step": 49440 }, { "epoch": 59.650573325286665, "grad_norm": 6.574655055999756, "learning_rate": 1.9998809431717687e-05, "loss": 0.0793, "step": 49450 }, { "epoch": 59.66264333132167, "grad_norm": 6.651595115661621, "learning_rate": 1.9998809190462894e-05, "loss": 0.0796, "step": 49460 }, { "epoch": 59.67471333735667, "grad_norm": 6.813037395477295, "learning_rate": 1.99988089492081e-05, "loss": 
0.0809, "step": 49470 }, { "epoch": 59.686783343391674, "grad_norm": 7.081841945648193, "learning_rate": 1.9998808707953306e-05, "loss": 0.0797, "step": 49480 }, { "epoch": 59.69885334942668, "grad_norm": 6.3472089767456055, "learning_rate": 1.9998808466698512e-05, "loss": 0.0797, "step": 49490 }, { "epoch": 59.71092335546168, "grad_norm": 6.485531330108643, "learning_rate": 1.999880822544372e-05, "loss": 0.0788, "step": 49500 }, { "epoch": 59.71092335546168, "eval_loss": 12.103787422180176, "eval_runtime": 8.1455, "eval_samples_per_second": 85.569, "eval_steps_per_second": 10.804, "step": 49500 }, { "epoch": 59.722993361496684, "grad_norm": 6.840046405792236, "learning_rate": 1.9998807984188925e-05, "loss": 0.082, "step": 49510 }, { "epoch": 59.73506336753169, "grad_norm": 6.303740978240967, "learning_rate": 1.999880774293413e-05, "loss": 0.0813, "step": 49520 }, { "epoch": 59.74713337356668, "grad_norm": 6.909379959106445, "learning_rate": 1.9998807501679337e-05, "loss": 0.0834, "step": 49530 }, { "epoch": 59.759203379601686, "grad_norm": 6.772411346435547, "learning_rate": 1.9998807260424543e-05, "loss": 0.0768, "step": 49540 }, { "epoch": 59.77127338563669, "grad_norm": 6.908807277679443, "learning_rate": 1.999880701916975e-05, "loss": 0.0778, "step": 49550 }, { "epoch": 59.78334339167169, "grad_norm": 6.564243793487549, "learning_rate": 1.9998806777914956e-05, "loss": 0.0767, "step": 49560 }, { "epoch": 59.795413397706696, "grad_norm": 6.916593551635742, "learning_rate": 1.9998806536660162e-05, "loss": 0.0798, "step": 49570 }, { "epoch": 59.8074834037417, "grad_norm": 6.294717311859131, "learning_rate": 1.9998806295405368e-05, "loss": 0.0816, "step": 49580 }, { "epoch": 59.8195534097767, "grad_norm": 6.631183624267578, "learning_rate": 1.9998806054150575e-05, "loss": 0.0806, "step": 49590 }, { "epoch": 59.831623415811706, "grad_norm": 6.998116970062256, "learning_rate": 1.999880581289578e-05, "loss": 0.0835, "step": 49600 }, { "epoch": 59.84369342184671, 
"grad_norm": 6.216552257537842, "learning_rate": 1.9998805571640987e-05, "loss": 0.081, "step": 49610 }, { "epoch": 59.85576342788171, "grad_norm": 7.3694353103637695, "learning_rate": 1.9998805330386193e-05, "loss": 0.0821, "step": 49620 }, { "epoch": 59.867833433916715, "grad_norm": 7.1520256996154785, "learning_rate": 1.99988050891314e-05, "loss": 0.0808, "step": 49630 }, { "epoch": 59.87990343995172, "grad_norm": 6.410555362701416, "learning_rate": 1.9998804847876606e-05, "loss": 0.0849, "step": 49640 }, { "epoch": 59.89197344598672, "grad_norm": 6.652497291564941, "learning_rate": 1.9998804606621812e-05, "loss": 0.0854, "step": 49650 }, { "epoch": 59.904043452021725, "grad_norm": 6.744071960449219, "learning_rate": 1.9998804365367018e-05, "loss": 0.0803, "step": 49660 }, { "epoch": 59.91611345805673, "grad_norm": 6.435975074768066, "learning_rate": 1.9998804124112224e-05, "loss": 0.0825, "step": 49670 }, { "epoch": 59.92818346409173, "grad_norm": 7.116602420806885, "learning_rate": 1.999880388285743e-05, "loss": 0.0869, "step": 49680 }, { "epoch": 59.940253470126734, "grad_norm": 6.060074329376221, "learning_rate": 1.9998803641602637e-05, "loss": 0.0811, "step": 49690 }, { "epoch": 59.95232347616174, "grad_norm": 6.2991414070129395, "learning_rate": 1.9998803400347843e-05, "loss": 0.0834, "step": 49700 }, { "epoch": 59.96439348219674, "grad_norm": 7.084702014923096, "learning_rate": 1.999880315909305e-05, "loss": 0.0832, "step": 49710 }, { "epoch": 59.976463488231744, "grad_norm": 6.3010053634643555, "learning_rate": 1.9998802917838255e-05, "loss": 0.0818, "step": 49720 }, { "epoch": 59.98853349426675, "grad_norm": 6.747771263122559, "learning_rate": 1.9998802676583458e-05, "loss": 0.0853, "step": 49730 }, { "epoch": 60.0, "grad_norm": 9.3095703125, "learning_rate": 1.9998802435328664e-05, "loss": 0.0811, "step": 49740 }, { "epoch": 60.012070006035, "grad_norm": 5.326333522796631, "learning_rate": 1.999880219407387e-05, "loss": 0.0579, "step": 49750 }, { 
"epoch": 60.024140012070006, "grad_norm": 6.108403205871582, "learning_rate": 1.9998801952819077e-05, "loss": 0.0617, "step": 49760 }, { "epoch": 60.03621001810501, "grad_norm": 5.812510967254639, "learning_rate": 1.9998801711564283e-05, "loss": 0.0598, "step": 49770 }, { "epoch": 60.04828002414001, "grad_norm": 5.715899467468262, "learning_rate": 1.999880147030949e-05, "loss": 0.0609, "step": 49780 }, { "epoch": 60.060350030175016, "grad_norm": 6.1094160079956055, "learning_rate": 1.9998801229054695e-05, "loss": 0.0641, "step": 49790 }, { "epoch": 60.07242003621002, "grad_norm": 6.27839469909668, "learning_rate": 1.99988009877999e-05, "loss": 0.0637, "step": 49800 }, { "epoch": 60.08449004224502, "grad_norm": 5.819544792175293, "learning_rate": 1.9998800746545108e-05, "loss": 0.0637, "step": 49810 }, { "epoch": 60.096560048280026, "grad_norm": 5.457511901855469, "learning_rate": 1.9998800505290314e-05, "loss": 0.0643, "step": 49820 }, { "epoch": 60.10863005431503, "grad_norm": 6.519197940826416, "learning_rate": 1.999880026403552e-05, "loss": 0.0659, "step": 49830 }, { "epoch": 60.12070006035003, "grad_norm": 6.3116583824157715, "learning_rate": 1.9998800022780727e-05, "loss": 0.0678, "step": 49840 }, { "epoch": 60.132770066385035, "grad_norm": 6.496368885040283, "learning_rate": 1.9998799781525933e-05, "loss": 0.0673, "step": 49850 }, { "epoch": 60.14484007242004, "grad_norm": 5.87309455871582, "learning_rate": 1.999879954027114e-05, "loss": 0.0678, "step": 49860 }, { "epoch": 60.15691007845504, "grad_norm": 5.704241752624512, "learning_rate": 1.9998799299016345e-05, "loss": 0.0671, "step": 49870 }, { "epoch": 60.168980084490045, "grad_norm": 6.091196060180664, "learning_rate": 1.999879905776155e-05, "loss": 0.0657, "step": 49880 }, { "epoch": 60.18105009052505, "grad_norm": 6.3372626304626465, "learning_rate": 1.9998798816506758e-05, "loss": 0.067, "step": 49890 }, { "epoch": 60.19312009656005, "grad_norm": 6.16288948059082, "learning_rate": 
1.9998798575251964e-05, "loss": 0.0708, "step": 49900 }, { "epoch": 60.205190102595054, "grad_norm": 6.035231113433838, "learning_rate": 1.999879833399717e-05, "loss": 0.0691, "step": 49910 }, { "epoch": 60.21726010863006, "grad_norm": 6.674966335296631, "learning_rate": 1.9998798092742376e-05, "loss": 0.0675, "step": 49920 }, { "epoch": 60.22933011466506, "grad_norm": 5.800307750701904, "learning_rate": 1.9998797851487582e-05, "loss": 0.0708, "step": 49930 }, { "epoch": 60.241400120700064, "grad_norm": 6.23104190826416, "learning_rate": 1.999879761023279e-05, "loss": 0.0696, "step": 49940 }, { "epoch": 60.25347012673506, "grad_norm": 6.141819000244141, "learning_rate": 1.9998797368977995e-05, "loss": 0.0695, "step": 49950 }, { "epoch": 60.26554013277006, "grad_norm": 6.353591442108154, "learning_rate": 1.99987971277232e-05, "loss": 0.072, "step": 49960 }, { "epoch": 60.277610138805066, "grad_norm": 6.341884613037109, "learning_rate": 1.9998796886468407e-05, "loss": 0.0723, "step": 49970 }, { "epoch": 60.28968014484007, "grad_norm": 6.212210655212402, "learning_rate": 1.999879664521361e-05, "loss": 0.0699, "step": 49980 }, { "epoch": 60.30175015087507, "grad_norm": 6.716305732727051, "learning_rate": 1.9998796403958816e-05, "loss": 0.0718, "step": 49990 }, { "epoch": 60.313820156910076, "grad_norm": 6.066877841949463, "learning_rate": 1.9998796162704023e-05, "loss": 0.0732, "step": 50000 }, { "epoch": 60.313820156910076, "eval_loss": 12.11163330078125, "eval_runtime": 8.1314, "eval_samples_per_second": 85.718, "eval_steps_per_second": 10.822, "step": 50000 }, { "epoch": 60.32589016294508, "grad_norm": 7.17020845413208, "learning_rate": 1.999879592144923e-05, "loss": 0.073, "step": 50010 }, { "epoch": 60.33796016898008, "grad_norm": 6.104100704193115, "learning_rate": 1.9998795680194435e-05, "loss": 0.0715, "step": 50020 }, { "epoch": 60.350030175015085, "grad_norm": 5.772665500640869, "learning_rate": 1.999879543893964e-05, "loss": 0.0707, "step": 50030 }, { 
"epoch": 60.36210018105009, "grad_norm": 6.527912139892578, "learning_rate": 1.9998795197684847e-05, "loss": 0.0746, "step": 50040 }, { "epoch": 60.37417018708509, "grad_norm": 6.619725704193115, "learning_rate": 1.9998794956430054e-05, "loss": 0.0735, "step": 50050 }, { "epoch": 60.386240193120095, "grad_norm": 6.087477207183838, "learning_rate": 1.999879471517526e-05, "loss": 0.0714, "step": 50060 }, { "epoch": 60.3983101991551, "grad_norm": 7.338958740234375, "learning_rate": 1.999879447392047e-05, "loss": 0.0755, "step": 50070 }, { "epoch": 60.4103802051901, "grad_norm": 6.972471237182617, "learning_rate": 1.9998794232665676e-05, "loss": 0.0758, "step": 50080 }, { "epoch": 60.422450211225105, "grad_norm": 6.783930778503418, "learning_rate": 1.9998793991410882e-05, "loss": 0.0731, "step": 50090 }, { "epoch": 60.43452021726011, "grad_norm": 6.29701042175293, "learning_rate": 1.9998793750156088e-05, "loss": 0.0747, "step": 50100 }, { "epoch": 60.44659022329511, "grad_norm": 6.74688196182251, "learning_rate": 1.9998793508901294e-05, "loss": 0.0764, "step": 50110 }, { "epoch": 60.458660229330114, "grad_norm": 6.752423286437988, "learning_rate": 1.99987932676465e-05, "loss": 0.0752, "step": 50120 }, { "epoch": 60.47073023536512, "grad_norm": 6.117467403411865, "learning_rate": 1.9998793026391707e-05, "loss": 0.0753, "step": 50130 }, { "epoch": 60.48280024140012, "grad_norm": 6.2216410636901855, "learning_rate": 1.999879278513691e-05, "loss": 0.0769, "step": 50140 }, { "epoch": 60.494870247435124, "grad_norm": 6.366013050079346, "learning_rate": 1.9998792543882116e-05, "loss": 0.0753, "step": 50150 }, { "epoch": 60.50694025347013, "grad_norm": 6.639504909515381, "learning_rate": 1.9998792302627322e-05, "loss": 0.0739, "step": 50160 }, { "epoch": 60.51901025950513, "grad_norm": 6.686197757720947, "learning_rate": 1.9998792061372528e-05, "loss": 0.0743, "step": 50170 }, { "epoch": 60.53108026554013, "grad_norm": 6.621576309204102, "learning_rate": 
1.9998791820117734e-05, "loss": 0.0772, "step": 50180 }, { "epoch": 60.543150271575136, "grad_norm": 6.597472190856934, "learning_rate": 1.999879157886294e-05, "loss": 0.0774, "step": 50190 }, { "epoch": 60.55522027761014, "grad_norm": 6.2515411376953125, "learning_rate": 1.9998791337608147e-05, "loss": 0.0751, "step": 50200 }, { "epoch": 60.56729028364514, "grad_norm": 6.209715843200684, "learning_rate": 1.9998791096353353e-05, "loss": 0.0797, "step": 50210 }, { "epoch": 60.579360289680146, "grad_norm": 6.669098854064941, "learning_rate": 1.999879085509856e-05, "loss": 0.08, "step": 50220 }, { "epoch": 60.59143029571515, "grad_norm": 6.3807454109191895, "learning_rate": 1.9998790613843766e-05, "loss": 0.0762, "step": 50230 }, { "epoch": 60.60350030175015, "grad_norm": 6.608810901641846, "learning_rate": 1.9998790372588972e-05, "loss": 0.0763, "step": 50240 }, { "epoch": 60.615570307785156, "grad_norm": 6.615596294403076, "learning_rate": 1.9998790131334178e-05, "loss": 0.0763, "step": 50250 }, { "epoch": 60.62764031382016, "grad_norm": 6.525839805603027, "learning_rate": 1.9998789890079384e-05, "loss": 0.0797, "step": 50260 }, { "epoch": 60.63971031985516, "grad_norm": 6.663147926330566, "learning_rate": 1.999878964882459e-05, "loss": 0.0791, "step": 50270 }, { "epoch": 60.651780325890165, "grad_norm": 6.623457431793213, "learning_rate": 1.9998789407569797e-05, "loss": 0.0797, "step": 50280 }, { "epoch": 60.66385033192517, "grad_norm": 7.312437534332275, "learning_rate": 1.9998789166315003e-05, "loss": 0.0801, "step": 50290 }, { "epoch": 60.67592033796017, "grad_norm": 7.060746192932129, "learning_rate": 1.999878892506021e-05, "loss": 0.0812, "step": 50300 }, { "epoch": 60.687990343995175, "grad_norm": 6.759237289428711, "learning_rate": 1.9998788683805415e-05, "loss": 0.0754, "step": 50310 }, { "epoch": 60.70006035003018, "grad_norm": 7.392429351806641, "learning_rate": 1.999878844255062e-05, "loss": 0.0809, "step": 50320 }, { "epoch": 60.71213035606518, 
"grad_norm": 6.38303279876709, "learning_rate": 1.9998788201295828e-05, "loss": 0.0795, "step": 50330 }, { "epoch": 60.724200362100184, "grad_norm": 6.206445693969727, "learning_rate": 1.9998787960041034e-05, "loss": 0.0797, "step": 50340 }, { "epoch": 60.73627036813519, "grad_norm": 6.069600582122803, "learning_rate": 1.999878771878624e-05, "loss": 0.0836, "step": 50350 }, { "epoch": 60.74834037417018, "grad_norm": 6.733497619628906, "learning_rate": 1.9998787477531446e-05, "loss": 0.0796, "step": 50360 }, { "epoch": 60.76041038020519, "grad_norm": 7.067437648773193, "learning_rate": 1.9998787236276653e-05, "loss": 0.0779, "step": 50370 }, { "epoch": 60.77248038624019, "grad_norm": 6.915349006652832, "learning_rate": 1.999878699502186e-05, "loss": 0.0797, "step": 50380 }, { "epoch": 60.78455039227519, "grad_norm": 7.052371025085449, "learning_rate": 1.999878675376706e-05, "loss": 0.0852, "step": 50390 }, { "epoch": 60.796620398310196, "grad_norm": 6.5924577713012695, "learning_rate": 1.9998786512512268e-05, "loss": 0.0808, "step": 50400 }, { "epoch": 60.8086904043452, "grad_norm": 6.947057247161865, "learning_rate": 1.9998786271257474e-05, "loss": 0.0796, "step": 50410 }, { "epoch": 60.8207604103802, "grad_norm": 7.259156227111816, "learning_rate": 1.999878603000268e-05, "loss": 0.082, "step": 50420 }, { "epoch": 60.832830416415206, "grad_norm": 5.987196445465088, "learning_rate": 1.9998785788747886e-05, "loss": 0.0816, "step": 50430 }, { "epoch": 60.84490042245021, "grad_norm": 6.894785404205322, "learning_rate": 1.9998785547493093e-05, "loss": 0.0797, "step": 50440 }, { "epoch": 60.85697042848521, "grad_norm": 6.411137580871582, "learning_rate": 1.99987853062383e-05, "loss": 0.0798, "step": 50450 }, { "epoch": 60.869040434520215, "grad_norm": 6.43325138092041, "learning_rate": 1.9998785064983505e-05, "loss": 0.0795, "step": 50460 }, { "epoch": 60.88111044055522, "grad_norm": 6.648281574249268, "learning_rate": 1.999878482372871e-05, "loss": 0.0795, "step": 50470 
}, { "epoch": 60.89318044659022, "grad_norm": 6.225253582000732, "learning_rate": 1.9998784582473918e-05, "loss": 0.0791, "step": 50480 }, { "epoch": 60.905250452625225, "grad_norm": 6.50905704498291, "learning_rate": 1.9998784341219124e-05, "loss": 0.0797, "step": 50490 }, { "epoch": 60.91732045866023, "grad_norm": 7.121996879577637, "learning_rate": 1.999878409996433e-05, "loss": 0.0811, "step": 50500 }, { "epoch": 60.91732045866023, "eval_loss": 12.133992195129395, "eval_runtime": 8.1295, "eval_samples_per_second": 85.737, "eval_steps_per_second": 10.825, "step": 50500 }, { "epoch": 60.92939046469523, "grad_norm": 6.2604660987854, "learning_rate": 1.9998783858709536e-05, "loss": 0.0786, "step": 50510 }, { "epoch": 60.941460470730235, "grad_norm": 6.709085941314697, "learning_rate": 1.9998783617454742e-05, "loss": 0.0822, "step": 50520 }, { "epoch": 60.95353047676524, "grad_norm": 7.3836870193481445, "learning_rate": 1.999878337619995e-05, "loss": 0.0826, "step": 50530 }, { "epoch": 60.96560048280024, "grad_norm": 6.801754474639893, "learning_rate": 1.9998783134945155e-05, "loss": 0.0841, "step": 50540 }, { "epoch": 60.977670488835244, "grad_norm": 6.78330135345459, "learning_rate": 1.999878289369036e-05, "loss": 0.084, "step": 50550 }, { "epoch": 60.98974049487025, "grad_norm": 6.769004821777344, "learning_rate": 1.9998782652435567e-05, "loss": 0.0832, "step": 50560 }, { "epoch": 61.0012070006035, "grad_norm": 5.425097942352295, "learning_rate": 1.9998782411180773e-05, "loss": 0.0781, "step": 50570 }, { "epoch": 61.0132770066385, "grad_norm": 5.793607234954834, "learning_rate": 1.999878216992598e-05, "loss": 0.0561, "step": 50580 }, { "epoch": 61.02534701267351, "grad_norm": 5.679696083068848, "learning_rate": 1.9998781928671186e-05, "loss": 0.0556, "step": 50590 }, { "epoch": 61.03741701870851, "grad_norm": 5.494134902954102, "learning_rate": 1.9998781687416392e-05, "loss": 0.0588, "step": 50600 }, { "epoch": 61.04948702474351, "grad_norm": 5.826220989227295, 
"learning_rate": 1.99987814461616e-05, "loss": 0.0594, "step": 50610 }, { "epoch": 61.061557030778516, "grad_norm": 6.825466632843018, "learning_rate": 1.9998781204906805e-05, "loss": 0.0627, "step": 50620 }, { "epoch": 61.07362703681352, "grad_norm": 6.0566487312316895, "learning_rate": 1.999878096365201e-05, "loss": 0.0615, "step": 50630 }, { "epoch": 61.08569704284852, "grad_norm": 5.532888412475586, "learning_rate": 1.9998780722397217e-05, "loss": 0.0638, "step": 50640 }, { "epoch": 61.097767048883526, "grad_norm": 6.036364555358887, "learning_rate": 1.9998780481142423e-05, "loss": 0.0633, "step": 50650 }, { "epoch": 61.10983705491853, "grad_norm": 6.261865615844727, "learning_rate": 1.999878023988763e-05, "loss": 0.0647, "step": 50660 }, { "epoch": 61.12190706095353, "grad_norm": 6.012265205383301, "learning_rate": 1.9998779998632836e-05, "loss": 0.0649, "step": 50670 }, { "epoch": 61.133977066988535, "grad_norm": 6.2051615715026855, "learning_rate": 1.9998779757378042e-05, "loss": 0.0656, "step": 50680 }, { "epoch": 61.14604707302354, "grad_norm": 6.001950263977051, "learning_rate": 1.9998779516123248e-05, "loss": 0.0675, "step": 50690 }, { "epoch": 61.15811707905854, "grad_norm": 6.479168891906738, "learning_rate": 1.9998779274868454e-05, "loss": 0.0651, "step": 50700 }, { "epoch": 61.170187085093545, "grad_norm": 6.177745342254639, "learning_rate": 1.999877903361366e-05, "loss": 0.0672, "step": 50710 }, { "epoch": 61.18225709112855, "grad_norm": 6.091300010681152, "learning_rate": 1.9998778792358867e-05, "loss": 0.0696, "step": 50720 }, { "epoch": 61.19432709716355, "grad_norm": 5.9211578369140625, "learning_rate": 1.9998778551104073e-05, "loss": 0.068, "step": 50730 }, { "epoch": 61.206397103198555, "grad_norm": 6.086590766906738, "learning_rate": 1.999877830984928e-05, "loss": 0.0688, "step": 50740 }, { "epoch": 61.21846710923356, "grad_norm": 5.576704025268555, "learning_rate": 1.9998778068594485e-05, "loss": 0.0685, "step": 50750 }, { "epoch": 
61.23053711526856, "grad_norm": 6.0575127601623535, "learning_rate": 1.999877782733969e-05, "loss": 0.0691, "step": 50760 }, { "epoch": 61.242607121303564, "grad_norm": 5.633757591247559, "learning_rate": 1.9998777586084898e-05, "loss": 0.0669, "step": 50770 }, { "epoch": 61.25467712733856, "grad_norm": 6.065426349639893, "learning_rate": 1.9998777344830104e-05, "loss": 0.0677, "step": 50780 }, { "epoch": 61.26674713337356, "grad_norm": 6.290153980255127, "learning_rate": 1.999877710357531e-05, "loss": 0.0693, "step": 50790 }, { "epoch": 61.27881713940857, "grad_norm": 5.405603408813477, "learning_rate": 1.9998776862320516e-05, "loss": 0.0697, "step": 50800 }, { "epoch": 61.29088714544357, "grad_norm": 5.936741352081299, "learning_rate": 1.999877662106572e-05, "loss": 0.0685, "step": 50810 }, { "epoch": 61.30295715147857, "grad_norm": 6.447365760803223, "learning_rate": 1.9998776379810925e-05, "loss": 0.0693, "step": 50820 }, { "epoch": 61.315027157513576, "grad_norm": 6.525780200958252, "learning_rate": 1.999877613855613e-05, "loss": 0.0707, "step": 50830 }, { "epoch": 61.32709716354858, "grad_norm": 6.485653877258301, "learning_rate": 1.9998775897301338e-05, "loss": 0.0699, "step": 50840 }, { "epoch": 61.33916716958358, "grad_norm": 6.246899127960205, "learning_rate": 1.9998775656046544e-05, "loss": 0.0721, "step": 50850 }, { "epoch": 61.351237175618586, "grad_norm": 6.37040901184082, "learning_rate": 1.999877541479175e-05, "loss": 0.0713, "step": 50860 }, { "epoch": 61.36330718165359, "grad_norm": 6.927926063537598, "learning_rate": 1.9998775173536957e-05, "loss": 0.0715, "step": 50870 }, { "epoch": 61.37537718768859, "grad_norm": 6.6465911865234375, "learning_rate": 1.9998774932282163e-05, "loss": 0.0748, "step": 50880 }, { "epoch": 61.387447193723595, "grad_norm": 5.952831745147705, "learning_rate": 1.999877469102737e-05, "loss": 0.0717, "step": 50890 }, { "epoch": 61.3995171997586, "grad_norm": 6.191461563110352, "learning_rate": 1.9998774449772575e-05, 
"loss": 0.0723, "step": 50900 }, { "epoch": 61.4115872057936, "grad_norm": 6.388754367828369, "learning_rate": 1.999877420851778e-05, "loss": 0.0766, "step": 50910 }, { "epoch": 61.423657211828605, "grad_norm": 5.852834701538086, "learning_rate": 1.9998773967262988e-05, "loss": 0.0717, "step": 50920 }, { "epoch": 61.43572721786361, "grad_norm": 6.781495094299316, "learning_rate": 1.9998773726008194e-05, "loss": 0.072, "step": 50930 }, { "epoch": 61.44779722389861, "grad_norm": 6.160452365875244, "learning_rate": 1.99987734847534e-05, "loss": 0.0723, "step": 50940 }, { "epoch": 61.459867229933614, "grad_norm": 6.408629417419434, "learning_rate": 1.9998773243498606e-05, "loss": 0.0728, "step": 50950 }, { "epoch": 61.47193723596862, "grad_norm": 6.372729301452637, "learning_rate": 1.9998773002243812e-05, "loss": 0.073, "step": 50960 }, { "epoch": 61.48400724200362, "grad_norm": 6.860113143920898, "learning_rate": 1.999877276098902e-05, "loss": 0.0752, "step": 50970 }, { "epoch": 61.496077248038624, "grad_norm": 6.0853071212768555, "learning_rate": 1.9998772519734225e-05, "loss": 0.0769, "step": 50980 }, { "epoch": 61.50814725407363, "grad_norm": 6.660415172576904, "learning_rate": 1.999877227847943e-05, "loss": 0.0756, "step": 50990 }, { "epoch": 61.52021726010863, "grad_norm": 6.270865440368652, "learning_rate": 1.9998772037224637e-05, "loss": 0.0755, "step": 51000 }, { "epoch": 61.52021726010863, "eval_loss": 12.154770851135254, "eval_runtime": 8.1359, "eval_samples_per_second": 85.67, "eval_steps_per_second": 10.816, "step": 51000 }, { "epoch": 61.53228726614363, "grad_norm": 5.548401355743408, "learning_rate": 1.9998771795969844e-05, "loss": 0.0733, "step": 51010 }, { "epoch": 61.54435727217864, "grad_norm": 6.708763599395752, "learning_rate": 1.999877155471505e-05, "loss": 0.0746, "step": 51020 }, { "epoch": 61.55642727821364, "grad_norm": 6.317752838134766, "learning_rate": 1.9998771313460256e-05, "loss": 0.0775, "step": 51030 }, { "epoch": 61.56849728424864, 
"grad_norm": 6.048826217651367, "learning_rate": 1.9998771072205462e-05, "loss": 0.0754, "step": 51040 }, { "epoch": 61.580567290283646, "grad_norm": 6.567038536071777, "learning_rate": 1.999877083095067e-05, "loss": 0.075, "step": 51050 }, { "epoch": 61.59263729631865, "grad_norm": 6.2485833168029785, "learning_rate": 1.999877058969587e-05, "loss": 0.0734, "step": 51060 }, { "epoch": 61.60470730235365, "grad_norm": 6.311077117919922, "learning_rate": 1.9998770348441077e-05, "loss": 0.0769, "step": 51070 }, { "epoch": 61.616777308388656, "grad_norm": 6.975301265716553, "learning_rate": 1.9998770107186284e-05, "loss": 0.0789, "step": 51080 }, { "epoch": 61.62884731442366, "grad_norm": 6.847969055175781, "learning_rate": 1.999876986593149e-05, "loss": 0.0806, "step": 51090 }, { "epoch": 61.64091732045866, "grad_norm": 6.55341911315918, "learning_rate": 1.9998769624676696e-05, "loss": 0.0765, "step": 51100 }, { "epoch": 61.652987326493665, "grad_norm": 6.476667404174805, "learning_rate": 1.9998769383421902e-05, "loss": 0.0786, "step": 51110 }, { "epoch": 61.66505733252867, "grad_norm": 6.562695503234863, "learning_rate": 1.999876914216711e-05, "loss": 0.0765, "step": 51120 }, { "epoch": 61.67712733856367, "grad_norm": 6.363085746765137, "learning_rate": 1.9998768900912315e-05, "loss": 0.0764, "step": 51130 }, { "epoch": 61.689197344598675, "grad_norm": 6.3509626388549805, "learning_rate": 1.999876865965752e-05, "loss": 0.0771, "step": 51140 }, { "epoch": 61.70126735063368, "grad_norm": 6.941645622253418, "learning_rate": 1.999876841840273e-05, "loss": 0.0768, "step": 51150 }, { "epoch": 61.71333735666868, "grad_norm": 5.867564678192139, "learning_rate": 1.9998768177147937e-05, "loss": 0.0779, "step": 51160 }, { "epoch": 61.725407362703685, "grad_norm": 6.143188953399658, "learning_rate": 1.9998767935893143e-05, "loss": 0.0789, "step": 51170 }, { "epoch": 61.73747736873869, "grad_norm": 6.054654598236084, "learning_rate": 1.999876769463835e-05, "loss": 0.0755, "step": 
51180 }, { "epoch": 61.749547374773684, "grad_norm": 6.155220031738281, "learning_rate": 1.9998767453383555e-05, "loss": 0.0772, "step": 51190 }, { "epoch": 61.76161738080869, "grad_norm": 6.875672817230225, "learning_rate": 1.999876721212876e-05, "loss": 0.0766, "step": 51200 }, { "epoch": 61.77368738684369, "grad_norm": 6.5312042236328125, "learning_rate": 1.9998766970873968e-05, "loss": 0.077, "step": 51210 }, { "epoch": 61.78575739287869, "grad_norm": 6.193559646606445, "learning_rate": 1.999876672961917e-05, "loss": 0.0763, "step": 51220 }, { "epoch": 61.7978273989137, "grad_norm": 6.277567386627197, "learning_rate": 1.9998766488364377e-05, "loss": 0.0762, "step": 51230 }, { "epoch": 61.8098974049487, "grad_norm": 6.42506742477417, "learning_rate": 1.9998766247109583e-05, "loss": 0.0794, "step": 51240 }, { "epoch": 61.8219674109837, "grad_norm": 6.496801853179932, "learning_rate": 1.999876600585479e-05, "loss": 0.0774, "step": 51250 }, { "epoch": 61.834037417018706, "grad_norm": 7.007970809936523, "learning_rate": 1.9998765764599996e-05, "loss": 0.0794, "step": 51260 }, { "epoch": 61.84610742305371, "grad_norm": 6.628303050994873, "learning_rate": 1.9998765523345202e-05, "loss": 0.0783, "step": 51270 }, { "epoch": 61.85817742908871, "grad_norm": 5.816625118255615, "learning_rate": 1.9998765282090408e-05, "loss": 0.079, "step": 51280 }, { "epoch": 61.870247435123716, "grad_norm": 6.95757532119751, "learning_rate": 1.9998765040835614e-05, "loss": 0.0799, "step": 51290 }, { "epoch": 61.88231744115872, "grad_norm": 6.950145721435547, "learning_rate": 1.999876479958082e-05, "loss": 0.0802, "step": 51300 }, { "epoch": 61.89438744719372, "grad_norm": 6.416000843048096, "learning_rate": 1.9998764558326027e-05, "loss": 0.0775, "step": 51310 }, { "epoch": 61.906457453228725, "grad_norm": 6.802143096923828, "learning_rate": 1.9998764317071233e-05, "loss": 0.0761, "step": 51320 }, { "epoch": 61.91852745926373, "grad_norm": 6.743236541748047, "learning_rate": 
1.999876407581644e-05, "loss": 0.0783, "step": 51330 }, { "epoch": 61.93059746529873, "grad_norm": 6.170754432678223, "learning_rate": 1.9998763834561645e-05, "loss": 0.0772, "step": 51340 }, { "epoch": 61.942667471333735, "grad_norm": 6.006774425506592, "learning_rate": 1.999876359330685e-05, "loss": 0.0795, "step": 51350 }, { "epoch": 61.95473747736874, "grad_norm": 6.653646945953369, "learning_rate": 1.9998763352052058e-05, "loss": 0.0803, "step": 51360 }, { "epoch": 61.96680748340374, "grad_norm": 7.58525276184082, "learning_rate": 1.9998763110797264e-05, "loss": 0.0815, "step": 51370 }, { "epoch": 61.978877489438744, "grad_norm": 6.5828166007995605, "learning_rate": 1.999876286954247e-05, "loss": 0.0829, "step": 51380 }, { "epoch": 61.99094749547375, "grad_norm": 7.018863677978516, "learning_rate": 1.9998762628287676e-05, "loss": 0.0815, "step": 51390 }, { "epoch": 62.002414001207, "grad_norm": 5.277304172515869, "learning_rate": 1.9998762387032883e-05, "loss": 0.0761, "step": 51400 }, { "epoch": 62.014484007242004, "grad_norm": 5.3947296142578125, "learning_rate": 1.999876214577809e-05, "loss": 0.0562, "step": 51410 }, { "epoch": 62.02655401327701, "grad_norm": 5.909031867980957, "learning_rate": 1.9998761904523295e-05, "loss": 0.0598, "step": 51420 }, { "epoch": 62.03862401931201, "grad_norm": 5.327704429626465, "learning_rate": 1.99987616632685e-05, "loss": 0.0597, "step": 51430 }, { "epoch": 62.05069402534701, "grad_norm": 5.5216240882873535, "learning_rate": 1.9998761422013707e-05, "loss": 0.0558, "step": 51440 }, { "epoch": 62.06276403138202, "grad_norm": 5.662876129150391, "learning_rate": 1.9998761180758914e-05, "loss": 0.0599, "step": 51450 }, { "epoch": 62.07483403741702, "grad_norm": 5.473897933959961, "learning_rate": 1.999876093950412e-05, "loss": 0.0625, "step": 51460 }, { "epoch": 62.08690404345202, "grad_norm": 6.126105785369873, "learning_rate": 1.9998760698249323e-05, "loss": 0.0609, "step": 51470 }, { "epoch": 62.098974049487026, 
"grad_norm": 6.131624698638916, "learning_rate": 1.999876045699453e-05, "loss": 0.0632, "step": 51480 }, { "epoch": 62.11104405552203, "grad_norm": 5.788077354431152, "learning_rate": 1.9998760215739735e-05, "loss": 0.0615, "step": 51490 }, { "epoch": 62.12311406155703, "grad_norm": 5.6449480056762695, "learning_rate": 1.999875997448494e-05, "loss": 0.0614, "step": 51500 }, { "epoch": 62.12311406155703, "eval_loss": 12.151297569274902, "eval_runtime": 8.135, "eval_samples_per_second": 85.68, "eval_steps_per_second": 10.818, "step": 51500 }, { "epoch": 62.135184067592036, "grad_norm": 5.617641925811768, "learning_rate": 1.9998759733230148e-05, "loss": 0.0622, "step": 51510 }, { "epoch": 62.14725407362704, "grad_norm": 6.195265293121338, "learning_rate": 1.9998759491975354e-05, "loss": 0.0671, "step": 51520 }, { "epoch": 62.15932407966204, "grad_norm": 5.76193904876709, "learning_rate": 1.999875925072056e-05, "loss": 0.0654, "step": 51530 }, { "epoch": 62.171394085697045, "grad_norm": 5.876688480377197, "learning_rate": 1.9998759009465766e-05, "loss": 0.0649, "step": 51540 }, { "epoch": 62.18346409173205, "grad_norm": 5.917112350463867, "learning_rate": 1.9998758768210972e-05, "loss": 0.0652, "step": 51550 }, { "epoch": 62.19553409776705, "grad_norm": 5.712214946746826, "learning_rate": 1.999875852695618e-05, "loss": 0.0663, "step": 51560 }, { "epoch": 62.207604103802055, "grad_norm": 5.860784530639648, "learning_rate": 1.9998758285701385e-05, "loss": 0.0699, "step": 51570 }, { "epoch": 62.21967410983706, "grad_norm": 5.796151638031006, "learning_rate": 1.999875804444659e-05, "loss": 0.0664, "step": 51580 }, { "epoch": 62.23174411587206, "grad_norm": 6.236091136932373, "learning_rate": 1.9998757803191797e-05, "loss": 0.0695, "step": 51590 }, { "epoch": 62.243814121907064, "grad_norm": 6.071693420410156, "learning_rate": 1.9998757561937003e-05, "loss": 0.0741, "step": 51600 }, { "epoch": 62.25588412794206, "grad_norm": 6.265758514404297, "learning_rate": 
1.999875732068221e-05, "loss": 0.0702, "step": 51610 }, { "epoch": 62.267954133977064, "grad_norm": 6.032050609588623, "learning_rate": 1.9998757079427416e-05, "loss": 0.0725, "step": 51620 }, { "epoch": 62.28002414001207, "grad_norm": 6.5025739669799805, "learning_rate": 1.9998756838172622e-05, "loss": 0.0693, "step": 51630 }, { "epoch": 62.29209414604707, "grad_norm": 6.158576965332031, "learning_rate": 1.999875659691783e-05, "loss": 0.0697, "step": 51640 }, { "epoch": 62.30416415208207, "grad_norm": 5.795437335968018, "learning_rate": 1.9998756355663035e-05, "loss": 0.0693, "step": 51650 }, { "epoch": 62.316234158117076, "grad_norm": 6.339942932128906, "learning_rate": 1.999875611440824e-05, "loss": 0.0709, "step": 51660 }, { "epoch": 62.32830416415208, "grad_norm": 5.2871880531311035, "learning_rate": 1.9998755873153447e-05, "loss": 0.0685, "step": 51670 }, { "epoch": 62.34037417018708, "grad_norm": 6.372450828552246, "learning_rate": 1.9998755631898653e-05, "loss": 0.0708, "step": 51680 }, { "epoch": 62.352444176222086, "grad_norm": 6.260430335998535, "learning_rate": 1.999875539064386e-05, "loss": 0.0715, "step": 51690 }, { "epoch": 62.36451418225709, "grad_norm": 6.3511176109313965, "learning_rate": 1.9998755149389066e-05, "loss": 0.0727, "step": 51700 }, { "epoch": 62.37658418829209, "grad_norm": 5.9953389167785645, "learning_rate": 1.9998754908134272e-05, "loss": 0.0755, "step": 51710 }, { "epoch": 62.388654194327096, "grad_norm": 6.733605861663818, "learning_rate": 1.9998754666879478e-05, "loss": 0.0705, "step": 51720 }, { "epoch": 62.4007242003621, "grad_norm": 6.7275872230529785, "learning_rate": 1.9998754425624684e-05, "loss": 0.0717, "step": 51730 }, { "epoch": 62.4127942063971, "grad_norm": 6.313235282897949, "learning_rate": 1.999875418436989e-05, "loss": 0.0737, "step": 51740 }, { "epoch": 62.424864212432105, "grad_norm": 6.066461086273193, "learning_rate": 1.9998753943115097e-05, "loss": 0.0749, "step": 51750 }, { "epoch": 62.43693421846711, 
"grad_norm": 6.103087902069092, "learning_rate": 1.9998753701860303e-05, "loss": 0.073, "step": 51760 }, { "epoch": 62.44900422450211, "grad_norm": 6.343201160430908, "learning_rate": 1.999875346060551e-05, "loss": 0.07, "step": 51770 }, { "epoch": 62.461074230537115, "grad_norm": 5.531157970428467, "learning_rate": 1.9998753219350715e-05, "loss": 0.0719, "step": 51780 }, { "epoch": 62.47314423657212, "grad_norm": 6.030037879943848, "learning_rate": 1.999875297809592e-05, "loss": 0.0718, "step": 51790 }, { "epoch": 62.48521424260712, "grad_norm": 7.075046539306641, "learning_rate": 1.9998752736841128e-05, "loss": 0.0725, "step": 51800 }, { "epoch": 62.497284248642124, "grad_norm": 6.648205280303955, "learning_rate": 1.9998752495586334e-05, "loss": 0.0746, "step": 51810 }, { "epoch": 62.50935425467713, "grad_norm": 6.007531642913818, "learning_rate": 1.999875225433154e-05, "loss": 0.0741, "step": 51820 }, { "epoch": 62.52142426071213, "grad_norm": 5.663097381591797, "learning_rate": 1.9998752013076746e-05, "loss": 0.0704, "step": 51830 }, { "epoch": 62.533494266747134, "grad_norm": 6.366771221160889, "learning_rate": 1.9998751771821953e-05, "loss": 0.0732, "step": 51840 }, { "epoch": 62.54556427278214, "grad_norm": 6.782674789428711, "learning_rate": 1.999875153056716e-05, "loss": 0.0726, "step": 51850 }, { "epoch": 62.55763427881714, "grad_norm": 6.244121551513672, "learning_rate": 1.9998751289312365e-05, "loss": 0.0703, "step": 51860 }, { "epoch": 62.56970428485214, "grad_norm": 6.4774346351623535, "learning_rate": 1.999875104805757e-05, "loss": 0.0769, "step": 51870 }, { "epoch": 62.58177429088715, "grad_norm": 6.0912275314331055, "learning_rate": 1.9998750806802774e-05, "loss": 0.0754, "step": 51880 }, { "epoch": 62.59384429692215, "grad_norm": 6.466669082641602, "learning_rate": 1.999875056554798e-05, "loss": 0.0758, "step": 51890 }, { "epoch": 62.60591430295715, "grad_norm": 5.705288887023926, "learning_rate": 1.9998750324293187e-05, "loss": 0.0725, "step": 
51900 }, { "epoch": 62.617984308992156, "grad_norm": 6.801649570465088, "learning_rate": 1.9998750083038393e-05, "loss": 0.0719, "step": 51910 }, { "epoch": 62.63005431502716, "grad_norm": 6.8308916091918945, "learning_rate": 1.99987498417836e-05, "loss": 0.0753, "step": 51920 }, { "epoch": 62.64212432106216, "grad_norm": 6.7044782638549805, "learning_rate": 1.9998749600528805e-05, "loss": 0.0774, "step": 51930 }, { "epoch": 62.654194327097166, "grad_norm": 6.380041122436523, "learning_rate": 1.999874935927401e-05, "loss": 0.0759, "step": 51940 }, { "epoch": 62.66626433313217, "grad_norm": 5.9558305740356445, "learning_rate": 1.9998749118019218e-05, "loss": 0.0767, "step": 51950 }, { "epoch": 62.67833433916717, "grad_norm": 6.740294456481934, "learning_rate": 1.9998748876764424e-05, "loss": 0.0755, "step": 51960 }, { "epoch": 62.690404345202175, "grad_norm": 7.016548156738281, "learning_rate": 1.999874863550963e-05, "loss": 0.0762, "step": 51970 }, { "epoch": 62.70247435123718, "grad_norm": 7.02538537979126, "learning_rate": 1.9998748394254836e-05, "loss": 0.0767, "step": 51980 }, { "epoch": 62.71454435727218, "grad_norm": 6.335303783416748, "learning_rate": 1.9998748153000042e-05, "loss": 0.0776, "step": 51990 }, { "epoch": 62.726614363307185, "grad_norm": 6.452943325042725, "learning_rate": 1.999874791174525e-05, "loss": 0.0781, "step": 52000 }, { "epoch": 62.726614363307185, "eval_loss": 12.177423477172852, "eval_runtime": 8.1431, "eval_samples_per_second": 85.594, "eval_steps_per_second": 10.807, "step": 52000 }, { "epoch": 62.73868436934219, "grad_norm": 6.757974147796631, "learning_rate": 1.9998747670490455e-05, "loss": 0.0766, "step": 52010 }, { "epoch": 62.750754375377184, "grad_norm": 6.47194766998291, "learning_rate": 1.999874742923566e-05, "loss": 0.079, "step": 52020 }, { "epoch": 62.76282438141219, "grad_norm": 6.489194393157959, "learning_rate": 1.9998747187980867e-05, "loss": 0.0768, "step": 52030 }, { "epoch": 62.77489438744719, "grad_norm": 
7.070462703704834, "learning_rate": 1.9998746946726074e-05, "loss": 0.0735, "step": 52040 }, { "epoch": 62.786964393482194, "grad_norm": 6.3723649978637695, "learning_rate": 1.999874670547128e-05, "loss": 0.08, "step": 52050 }, { "epoch": 62.7990343995172, "grad_norm": 6.266329288482666, "learning_rate": 1.9998746464216486e-05, "loss": 0.0798, "step": 52060 }, { "epoch": 62.8111044055522, "grad_norm": 6.595778465270996, "learning_rate": 1.9998746222961692e-05, "loss": 0.0802, "step": 52070 }, { "epoch": 62.8231744115872, "grad_norm": 6.570096015930176, "learning_rate": 1.99987459817069e-05, "loss": 0.078, "step": 52080 }, { "epoch": 62.83524441762221, "grad_norm": 6.4898810386657715, "learning_rate": 1.9998745740452105e-05, "loss": 0.0768, "step": 52090 }, { "epoch": 62.84731442365721, "grad_norm": 6.467265605926514, "learning_rate": 1.999874549919731e-05, "loss": 0.0776, "step": 52100 }, { "epoch": 62.85938442969221, "grad_norm": 6.633350372314453, "learning_rate": 1.9998745257942517e-05, "loss": 0.0788, "step": 52110 }, { "epoch": 62.871454435727216, "grad_norm": 6.3134589195251465, "learning_rate": 1.9998745016687723e-05, "loss": 0.0782, "step": 52120 }, { "epoch": 62.88352444176222, "grad_norm": 6.557009696960449, "learning_rate": 1.9998744775432926e-05, "loss": 0.0783, "step": 52130 }, { "epoch": 62.89559444779722, "grad_norm": 6.42658805847168, "learning_rate": 1.9998744534178132e-05, "loss": 0.0768, "step": 52140 }, { "epoch": 62.907664453832226, "grad_norm": 6.697299480438232, "learning_rate": 1.999874429292334e-05, "loss": 0.0792, "step": 52150 }, { "epoch": 62.91973445986723, "grad_norm": 6.791019916534424, "learning_rate": 1.9998744051668545e-05, "loss": 0.0791, "step": 52160 }, { "epoch": 62.93180446590223, "grad_norm": 7.138272762298584, "learning_rate": 1.999874381041375e-05, "loss": 0.0802, "step": 52170 }, { "epoch": 62.943874471937235, "grad_norm": 6.272778034210205, "learning_rate": 1.9998743569158957e-05, "loss": 0.0781, "step": 52180 }, { 
"epoch": 62.95594447797224, "grad_norm": 6.35160493850708, "learning_rate": 1.9998743327904163e-05, "loss": 0.0794, "step": 52190 }, { "epoch": 62.96801448400724, "grad_norm": 6.384340286254883, "learning_rate": 1.999874308664937e-05, "loss": 0.0787, "step": 52200 }, { "epoch": 62.980084490042245, "grad_norm": 6.850888252258301, "learning_rate": 1.9998742845394576e-05, "loss": 0.08, "step": 52210 }, { "epoch": 62.99215449607725, "grad_norm": 6.332371711730957, "learning_rate": 1.9998742604139782e-05, "loss": 0.0774, "step": 52220 }, { "epoch": 63.0036210018105, "grad_norm": 5.355305194854736, "learning_rate": 1.999874236288499e-05, "loss": 0.07, "step": 52230 }, { "epoch": 63.015691007845504, "grad_norm": 5.396366119384766, "learning_rate": 1.9998742121630198e-05, "loss": 0.0518, "step": 52240 }, { "epoch": 63.02776101388051, "grad_norm": 5.437967777252197, "learning_rate": 1.9998741880375404e-05, "loss": 0.06, "step": 52250 }, { "epoch": 63.03983101991551, "grad_norm": 5.802509307861328, "learning_rate": 1.999874163912061e-05, "loss": 0.0578, "step": 52260 }, { "epoch": 63.051901025950514, "grad_norm": 5.677207946777344, "learning_rate": 1.9998741397865817e-05, "loss": 0.0603, "step": 52270 }, { "epoch": 63.06397103198552, "grad_norm": 5.376163959503174, "learning_rate": 1.9998741156611023e-05, "loss": 0.0574, "step": 52280 }, { "epoch": 63.07604103802052, "grad_norm": 5.681808948516846, "learning_rate": 1.999874091535623e-05, "loss": 0.0604, "step": 52290 }, { "epoch": 63.08811104405552, "grad_norm": 5.471595764160156, "learning_rate": 1.9998740674101432e-05, "loss": 0.0629, "step": 52300 }, { "epoch": 63.10018105009053, "grad_norm": 5.969762802124023, "learning_rate": 1.9998740432846638e-05, "loss": 0.064, "step": 52310 }, { "epoch": 63.11225105612553, "grad_norm": 5.926026344299316, "learning_rate": 1.9998740191591844e-05, "loss": 0.0661, "step": 52320 }, { "epoch": 63.12432106216053, "grad_norm": 5.528351306915283, "learning_rate": 1.999873995033705e-05, 
"loss": 0.0645, "step": 52330 }, { "epoch": 63.136391068195536, "grad_norm": 6.179070949554443, "learning_rate": 1.9998739709082257e-05, "loss": 0.0655, "step": 52340 }, { "epoch": 63.14846107423054, "grad_norm": 5.264646053314209, "learning_rate": 1.9998739467827463e-05, "loss": 0.0638, "step": 52350 }, { "epoch": 63.16053108026554, "grad_norm": 5.775887489318848, "learning_rate": 1.999873922657267e-05, "loss": 0.064, "step": 52360 }, { "epoch": 63.172601086300546, "grad_norm": 5.643163681030273, "learning_rate": 1.9998738985317875e-05, "loss": 0.0676, "step": 52370 }, { "epoch": 63.18467109233555, "grad_norm": 5.786967754364014, "learning_rate": 1.999873874406308e-05, "loss": 0.0653, "step": 52380 }, { "epoch": 63.19674109837055, "grad_norm": 5.969313621520996, "learning_rate": 1.9998738502808288e-05, "loss": 0.0648, "step": 52390 }, { "epoch": 63.208811104405555, "grad_norm": 6.371257305145264, "learning_rate": 1.9998738261553494e-05, "loss": 0.0694, "step": 52400 }, { "epoch": 63.22088111044056, "grad_norm": 5.6411943435668945, "learning_rate": 1.99987380202987e-05, "loss": 0.0663, "step": 52410 }, { "epoch": 63.23295111647556, "grad_norm": 5.972850322723389, "learning_rate": 1.9998737779043906e-05, "loss": 0.0669, "step": 52420 }, { "epoch": 63.245021122510565, "grad_norm": 6.534405708312988, "learning_rate": 1.9998737537789113e-05, "loss": 0.0655, "step": 52430 }, { "epoch": 63.25709112854556, "grad_norm": 5.397893905639648, "learning_rate": 1.999873729653432e-05, "loss": 0.0655, "step": 52440 }, { "epoch": 63.269161134580564, "grad_norm": 6.016814708709717, "learning_rate": 1.9998737055279525e-05, "loss": 0.066, "step": 52450 }, { "epoch": 63.28123114061557, "grad_norm": 5.9818902015686035, "learning_rate": 1.999873681402473e-05, "loss": 0.0697, "step": 52460 }, { "epoch": 63.29330114665057, "grad_norm": 6.037510395050049, "learning_rate": 1.9998736572769937e-05, "loss": 0.0677, "step": 52470 }, { "epoch": 63.305371152685574, "grad_norm": 6.352215766906738, 
"learning_rate": 1.9998736331515144e-05, "loss": 0.0688, "step": 52480 }, { "epoch": 63.31744115872058, "grad_norm": 5.681643486022949, "learning_rate": 1.999873609026035e-05, "loss": 0.0665, "step": 52490 }, { "epoch": 63.32951116475558, "grad_norm": 6.0315022468566895, "learning_rate": 1.9998735849005556e-05, "loss": 0.0676, "step": 52500 }, { "epoch": 63.32951116475558, "eval_loss": 12.188133239746094, "eval_runtime": 8.1575, "eval_samples_per_second": 85.443, "eval_steps_per_second": 10.788, "step": 52500 }, { "epoch": 63.34158117079058, "grad_norm": 5.988542556762695, "learning_rate": 1.9998735607750762e-05, "loss": 0.072, "step": 52510 }, { "epoch": 63.353651176825586, "grad_norm": 6.744535446166992, "learning_rate": 1.999873536649597e-05, "loss": 0.07, "step": 52520 }, { "epoch": 63.36572118286059, "grad_norm": 6.40999174118042, "learning_rate": 1.9998735125241175e-05, "loss": 0.0723, "step": 52530 }, { "epoch": 63.37779118889559, "grad_norm": 6.5710978507995605, "learning_rate": 1.999873488398638e-05, "loss": 0.0685, "step": 52540 }, { "epoch": 63.389861194930596, "grad_norm": 5.749763011932373, "learning_rate": 1.9998734642731584e-05, "loss": 0.0716, "step": 52550 }, { "epoch": 63.4019312009656, "grad_norm": 6.658754348754883, "learning_rate": 1.999873440147679e-05, "loss": 0.0732, "step": 52560 }, { "epoch": 63.4140012070006, "grad_norm": 6.615808963775635, "learning_rate": 1.9998734160221996e-05, "loss": 0.0733, "step": 52570 }, { "epoch": 63.426071213035605, "grad_norm": 5.925971031188965, "learning_rate": 1.9998733918967202e-05, "loss": 0.0704, "step": 52580 }, { "epoch": 63.43814121907061, "grad_norm": 6.0478291511535645, "learning_rate": 1.999873367771241e-05, "loss": 0.0708, "step": 52590 }, { "epoch": 63.45021122510561, "grad_norm": 6.3954949378967285, "learning_rate": 1.9998733436457615e-05, "loss": 0.0701, "step": 52600 }, { "epoch": 63.462281231140615, "grad_norm": 6.086205005645752, "learning_rate": 1.999873319520282e-05, "loss": 0.0764, 
"step": 52610 }, { "epoch": 63.47435123717562, "grad_norm": 6.280473232269287, "learning_rate": 1.9998732953948027e-05, "loss": 0.0745, "step": 52620 }, { "epoch": 63.48642124321062, "grad_norm": 6.127275466918945, "learning_rate": 1.9998732712693233e-05, "loss": 0.0752, "step": 52630 }, { "epoch": 63.498491249245625, "grad_norm": 6.084734916687012, "learning_rate": 1.999873247143844e-05, "loss": 0.0714, "step": 52640 }, { "epoch": 63.51056125528063, "grad_norm": 5.96805477142334, "learning_rate": 1.9998732230183646e-05, "loss": 0.0721, "step": 52650 }, { "epoch": 63.52263126131563, "grad_norm": 6.402387619018555, "learning_rate": 1.9998731988928852e-05, "loss": 0.0701, "step": 52660 }, { "epoch": 63.534701267350634, "grad_norm": 6.62894344329834, "learning_rate": 1.999873174767406e-05, "loss": 0.072, "step": 52670 }, { "epoch": 63.54677127338564, "grad_norm": 6.623753070831299, "learning_rate": 1.9998731506419265e-05, "loss": 0.0706, "step": 52680 }, { "epoch": 63.55884127942064, "grad_norm": 6.60914945602417, "learning_rate": 1.999873126516447e-05, "loss": 0.0754, "step": 52690 }, { "epoch": 63.570911285455644, "grad_norm": 7.382273197174072, "learning_rate": 1.9998731023909677e-05, "loss": 0.0736, "step": 52700 }, { "epoch": 63.58298129149065, "grad_norm": 6.808924198150635, "learning_rate": 1.9998730782654883e-05, "loss": 0.0752, "step": 52710 }, { "epoch": 63.59505129752565, "grad_norm": 5.928431510925293, "learning_rate": 1.999873054140009e-05, "loss": 0.0745, "step": 52720 }, { "epoch": 63.60712130356065, "grad_norm": 6.603503704071045, "learning_rate": 1.9998730300145296e-05, "loss": 0.0751, "step": 52730 }, { "epoch": 63.61919130959566, "grad_norm": 6.706510066986084, "learning_rate": 1.9998730058890502e-05, "loss": 0.0729, "step": 52740 }, { "epoch": 63.63126131563066, "grad_norm": 6.817453384399414, "learning_rate": 1.9998729817635708e-05, "loss": 0.0745, "step": 52750 }, { "epoch": 63.64333132166566, "grad_norm": 6.814085960388184, "learning_rate": 
1.9998729576380914e-05, "loss": 0.0757, "step": 52760 }, { "epoch": 63.655401327700666, "grad_norm": 7.036065578460693, "learning_rate": 1.999872933512612e-05, "loss": 0.0761, "step": 52770 }, { "epoch": 63.66747133373567, "grad_norm": 6.850988864898682, "learning_rate": 1.9998729093871327e-05, "loss": 0.0766, "step": 52780 }, { "epoch": 63.67954133977067, "grad_norm": 6.492572784423828, "learning_rate": 1.9998728852616533e-05, "loss": 0.0745, "step": 52790 }, { "epoch": 63.691611345805676, "grad_norm": 6.397115707397461, "learning_rate": 1.999872861136174e-05, "loss": 0.0721, "step": 52800 }, { "epoch": 63.70368135184068, "grad_norm": 6.7146687507629395, "learning_rate": 1.9998728370106945e-05, "loss": 0.0742, "step": 52810 }, { "epoch": 63.71575135787568, "grad_norm": 6.374525547027588, "learning_rate": 1.999872812885215e-05, "loss": 0.0748, "step": 52820 }, { "epoch": 63.727821363910685, "grad_norm": 6.47796106338501, "learning_rate": 1.9998727887597358e-05, "loss": 0.075, "step": 52830 }, { "epoch": 63.73989136994569, "grad_norm": 6.645823001861572, "learning_rate": 1.9998727646342564e-05, "loss": 0.0738, "step": 52840 }, { "epoch": 63.751961375980684, "grad_norm": 6.860070705413818, "learning_rate": 1.999872740508777e-05, "loss": 0.0741, "step": 52850 }, { "epoch": 63.76403138201569, "grad_norm": 6.333770751953125, "learning_rate": 1.9998727163832976e-05, "loss": 0.0761, "step": 52860 }, { "epoch": 63.77610138805069, "grad_norm": 7.277071475982666, "learning_rate": 1.9998726922578183e-05, "loss": 0.0773, "step": 52870 }, { "epoch": 63.788171394085694, "grad_norm": 5.997180461883545, "learning_rate": 1.999872668132339e-05, "loss": 0.0745, "step": 52880 }, { "epoch": 63.8002414001207, "grad_norm": 6.722307205200195, "learning_rate": 1.9998726440068595e-05, "loss": 0.0774, "step": 52890 }, { "epoch": 63.8123114061557, "grad_norm": 6.75794792175293, "learning_rate": 1.99987261988138e-05, "loss": 0.0771, "step": 52900 }, { "epoch": 63.824381412190704, "grad_norm": 
6.174970626831055, "learning_rate": 1.9998725957559008e-05, "loss": 0.0741, "step": 52910 }, { "epoch": 63.83645141822571, "grad_norm": 6.502376556396484, "learning_rate": 1.9998725716304214e-05, "loss": 0.0776, "step": 52920 }, { "epoch": 63.84852142426071, "grad_norm": 6.645086765289307, "learning_rate": 1.999872547504942e-05, "loss": 0.0754, "step": 52930 }, { "epoch": 63.86059143029571, "grad_norm": 6.5736188888549805, "learning_rate": 1.9998725233794626e-05, "loss": 0.0763, "step": 52940 }, { "epoch": 63.872661436330716, "grad_norm": 7.508078575134277, "learning_rate": 1.9998724992539832e-05, "loss": 0.0759, "step": 52950 }, { "epoch": 63.88473144236572, "grad_norm": 6.422980785369873, "learning_rate": 1.9998724751285035e-05, "loss": 0.0776, "step": 52960 }, { "epoch": 63.89680144840072, "grad_norm": 6.297300815582275, "learning_rate": 1.999872451003024e-05, "loss": 0.0792, "step": 52970 }, { "epoch": 63.908871454435726, "grad_norm": 6.581125259399414, "learning_rate": 1.9998724268775448e-05, "loss": 0.077, "step": 52980 }, { "epoch": 63.92094146047073, "grad_norm": 6.787035942077637, "learning_rate": 1.9998724027520654e-05, "loss": 0.0762, "step": 52990 }, { "epoch": 63.93301146650573, "grad_norm": 6.314748287200928, "learning_rate": 1.999872378626586e-05, "loss": 0.0772, "step": 53000 }, { "epoch": 63.93301146650573, "eval_loss": 12.19707202911377, "eval_runtime": 10.9034, "eval_samples_per_second": 63.925, "eval_steps_per_second": 8.071, "step": 53000 }, { "epoch": 63.945081472540735, "grad_norm": 5.98057222366333, "learning_rate": 1.9998723545011066e-05, "loss": 0.0771, "step": 53010 }, { "epoch": 63.95715147857574, "grad_norm": 6.349828243255615, "learning_rate": 1.9998723303756273e-05, "loss": 0.0754, "step": 53020 }, { "epoch": 63.96922148461074, "grad_norm": 6.5886406898498535, "learning_rate": 1.999872306250148e-05, "loss": 0.0771, "step": 53030 }, { "epoch": 63.981291490645745, "grad_norm": 6.594023704528809, "learning_rate": 1.9998722821246685e-05, 
"loss": 0.0801, "step": 53040 }, { "epoch": 63.99336149668075, "grad_norm": 6.227415084838867, "learning_rate": 1.999872257999189e-05, "loss": 0.0798, "step": 53050 }, { "epoch": 64.004828002414, "grad_norm": 5.50883150100708, "learning_rate": 1.9998722338737097e-05, "loss": 0.0629, "step": 53060 }, { "epoch": 64.016898008449, "grad_norm": 5.619184494018555, "learning_rate": 1.9998722097482304e-05, "loss": 0.052, "step": 53070 }, { "epoch": 64.02896801448401, "grad_norm": 5.293949604034424, "learning_rate": 1.999872185622751e-05, "loss": 0.0548, "step": 53080 }, { "epoch": 64.04103802051901, "grad_norm": 5.532398700714111, "learning_rate": 1.9998721614972716e-05, "loss": 0.0581, "step": 53090 }, { "epoch": 64.05310802655401, "grad_norm": 5.832209587097168, "learning_rate": 1.9998721373717922e-05, "loss": 0.0588, "step": 53100 }, { "epoch": 64.06517803258902, "grad_norm": 5.269985675811768, "learning_rate": 1.999872113246313e-05, "loss": 0.0592, "step": 53110 }, { "epoch": 64.07724803862402, "grad_norm": 5.416989326477051, "learning_rate": 1.9998720891208335e-05, "loss": 0.0597, "step": 53120 }, { "epoch": 64.08931804465902, "grad_norm": 5.537458419799805, "learning_rate": 1.999872064995354e-05, "loss": 0.0591, "step": 53130 }, { "epoch": 64.10138805069403, "grad_norm": 5.5969390869140625, "learning_rate": 1.9998720408698747e-05, "loss": 0.0612, "step": 53140 }, { "epoch": 64.11345805672903, "grad_norm": 5.770853042602539, "learning_rate": 1.9998720167443953e-05, "loss": 0.0614, "step": 53150 }, { "epoch": 64.12552806276403, "grad_norm": 5.377837181091309, "learning_rate": 1.999871992618916e-05, "loss": 0.0619, "step": 53160 }, { "epoch": 64.13759806879904, "grad_norm": 6.040500164031982, "learning_rate": 1.9998719684934366e-05, "loss": 0.0604, "step": 53170 }, { "epoch": 64.14966807483404, "grad_norm": 5.457491874694824, "learning_rate": 1.9998719443679572e-05, "loss": 0.0596, "step": 53180 }, { "epoch": 64.16173808086904, "grad_norm": 5.389424800872803, 
"learning_rate": 1.9998719202424778e-05, "loss": 0.0629, "step": 53190 }, { "epoch": 64.17380808690405, "grad_norm": 6.239108085632324, "learning_rate": 1.9998718961169984e-05, "loss": 0.0643, "step": 53200 }, { "epoch": 64.18587809293905, "grad_norm": 5.416787147521973, "learning_rate": 1.9998718719915187e-05, "loss": 0.0637, "step": 53210 }, { "epoch": 64.19794809897405, "grad_norm": 6.164012908935547, "learning_rate": 1.9998718478660393e-05, "loss": 0.0659, "step": 53220 }, { "epoch": 64.21001810500906, "grad_norm": 6.309813499450684, "learning_rate": 1.99987182374056e-05, "loss": 0.0638, "step": 53230 }, { "epoch": 64.22208811104406, "grad_norm": 5.783747673034668, "learning_rate": 1.9998717996150806e-05, "loss": 0.0658, "step": 53240 }, { "epoch": 64.23415811707906, "grad_norm": 5.87152099609375, "learning_rate": 1.9998717754896012e-05, "loss": 0.0656, "step": 53250 }, { "epoch": 64.24622812311407, "grad_norm": 6.072438716888428, "learning_rate": 1.9998717513641218e-05, "loss": 0.0644, "step": 53260 }, { "epoch": 64.25829812914907, "grad_norm": 5.8912434577941895, "learning_rate": 1.9998717272386425e-05, "loss": 0.0655, "step": 53270 }, { "epoch": 64.27036813518407, "grad_norm": 5.09296989440918, "learning_rate": 1.999871703113163e-05, "loss": 0.0679, "step": 53280 }, { "epoch": 64.28243814121907, "grad_norm": 6.110800266265869, "learning_rate": 1.9998716789876837e-05, "loss": 0.0644, "step": 53290 }, { "epoch": 64.29450814725408, "grad_norm": 5.561822891235352, "learning_rate": 1.9998716548622043e-05, "loss": 0.0673, "step": 53300 }, { "epoch": 64.30657815328908, "grad_norm": 6.028954029083252, "learning_rate": 1.9998716307367253e-05, "loss": 0.067, "step": 53310 }, { "epoch": 64.31864815932408, "grad_norm": 5.688758373260498, "learning_rate": 1.999871606611246e-05, "loss": 0.0658, "step": 53320 }, { "epoch": 64.33071816535909, "grad_norm": 5.459908485412598, "learning_rate": 1.9998715824857665e-05, "loss": 0.0644, "step": 53330 }, { "epoch": 
64.34278817139409, "grad_norm": 5.972171783447266, "learning_rate": 1.999871558360287e-05, "loss": 0.0679, "step": 53340 }, { "epoch": 64.3548581774291, "grad_norm": 6.096590518951416, "learning_rate": 1.9998715342348078e-05, "loss": 0.0666, "step": 53350 }, { "epoch": 64.3669281834641, "grad_norm": 6.321018695831299, "learning_rate": 1.9998715101093284e-05, "loss": 0.0706, "step": 53360 }, { "epoch": 64.3789981894991, "grad_norm": 6.588199615478516, "learning_rate": 1.999871485983849e-05, "loss": 0.0722, "step": 53370 }, { "epoch": 64.3910681955341, "grad_norm": 6.118592262268066, "learning_rate": 1.9998714618583693e-05, "loss": 0.0701, "step": 53380 }, { "epoch": 64.4031382015691, "grad_norm": 6.08465051651001, "learning_rate": 1.99987143773289e-05, "loss": 0.0694, "step": 53390 }, { "epoch": 64.41520820760411, "grad_norm": 6.195014476776123, "learning_rate": 1.9998714136074105e-05, "loss": 0.0689, "step": 53400 }, { "epoch": 64.42727821363911, "grad_norm": 6.223791599273682, "learning_rate": 1.999871389481931e-05, "loss": 0.0686, "step": 53410 }, { "epoch": 64.43934821967412, "grad_norm": 6.49624490737915, "learning_rate": 1.9998713653564518e-05, "loss": 0.0715, "step": 53420 }, { "epoch": 64.45141822570912, "grad_norm": 6.370952606201172, "learning_rate": 1.9998713412309724e-05, "loss": 0.072, "step": 53430 }, { "epoch": 64.46348823174412, "grad_norm": 6.605062007904053, "learning_rate": 1.999871317105493e-05, "loss": 0.0733, "step": 53440 }, { "epoch": 64.47555823777913, "grad_norm": 5.7833733558654785, "learning_rate": 1.9998712929800136e-05, "loss": 0.0725, "step": 53450 }, { "epoch": 64.48762824381413, "grad_norm": 6.409944534301758, "learning_rate": 1.9998712688545343e-05, "loss": 0.0741, "step": 53460 }, { "epoch": 64.49969824984913, "grad_norm": 6.617911338806152, "learning_rate": 1.999871244729055e-05, "loss": 0.0715, "step": 53470 }, { "epoch": 64.51176825588412, "grad_norm": 5.702047824859619, "learning_rate": 1.9998712206035755e-05, "loss": 0.0687, 
"step": 53480 }, { "epoch": 64.52383826191912, "grad_norm": 6.052584171295166, "learning_rate": 1.999871196478096e-05, "loss": 0.0719, "step": 53490 }, { "epoch": 64.53590826795413, "grad_norm": 6.315539360046387, "learning_rate": 1.9998711723526167e-05, "loss": 0.0726, "step": 53500 }, { "epoch": 64.53590826795413, "eval_loss": 12.204874038696289, "eval_runtime": 8.2532, "eval_samples_per_second": 84.452, "eval_steps_per_second": 10.663, "step": 53500 }, { "epoch": 64.54797827398913, "grad_norm": 5.882059097290039, "learning_rate": 1.9998711482271374e-05, "loss": 0.0744, "step": 53510 }, { "epoch": 64.56004828002413, "grad_norm": 6.143609046936035, "learning_rate": 1.999871124101658e-05, "loss": 0.0714, "step": 53520 }, { "epoch": 64.57211828605914, "grad_norm": 5.868282794952393, "learning_rate": 1.9998710999761786e-05, "loss": 0.0716, "step": 53530 }, { "epoch": 64.58418829209414, "grad_norm": 5.88337516784668, "learning_rate": 1.9998710758506992e-05, "loss": 0.0731, "step": 53540 }, { "epoch": 64.59625829812914, "grad_norm": 6.221279144287109, "learning_rate": 1.99987105172522e-05, "loss": 0.074, "step": 53550 }, { "epoch": 64.60832830416415, "grad_norm": 6.455683708190918, "learning_rate": 1.9998710275997405e-05, "loss": 0.0761, "step": 53560 }, { "epoch": 64.62039831019915, "grad_norm": 6.457929611206055, "learning_rate": 1.999871003474261e-05, "loss": 0.0707, "step": 53570 }, { "epoch": 64.63246831623415, "grad_norm": 6.880386829376221, "learning_rate": 1.9998709793487817e-05, "loss": 0.0716, "step": 53580 }, { "epoch": 64.64453832226916, "grad_norm": 6.057446002960205, "learning_rate": 1.9998709552233023e-05, "loss": 0.0726, "step": 53590 }, { "epoch": 64.65660832830416, "grad_norm": 6.294514179229736, "learning_rate": 1.999870931097823e-05, "loss": 0.0689, "step": 53600 }, { "epoch": 64.66867833433916, "grad_norm": 6.327899932861328, "learning_rate": 1.9998709069723436e-05, "loss": 0.0757, "step": 53610 }, { "epoch": 64.68074834037417, "grad_norm": 
6.8767476081848145, "learning_rate": 1.9998708828468642e-05, "loss": 0.0738, "step": 53620 }, { "epoch": 64.69281834640917, "grad_norm": 6.271035671234131, "learning_rate": 1.9998708587213845e-05, "loss": 0.0763, "step": 53630 }, { "epoch": 64.70488835244417, "grad_norm": 6.758405685424805, "learning_rate": 1.999870834595905e-05, "loss": 0.0764, "step": 53640 }, { "epoch": 64.71695835847918, "grad_norm": 6.250345706939697, "learning_rate": 1.9998708104704257e-05, "loss": 0.0737, "step": 53650 }, { "epoch": 64.72902836451418, "grad_norm": 6.0571722984313965, "learning_rate": 1.9998707863449464e-05, "loss": 0.0719, "step": 53660 }, { "epoch": 64.74109837054918, "grad_norm": 6.3546037673950195, "learning_rate": 1.999870762219467e-05, "loss": 0.073, "step": 53670 }, { "epoch": 64.75316837658418, "grad_norm": 6.600006103515625, "learning_rate": 1.9998707380939876e-05, "loss": 0.0727, "step": 53680 }, { "epoch": 64.76523838261919, "grad_norm": 6.878722190856934, "learning_rate": 1.9998707139685082e-05, "loss": 0.0755, "step": 53690 }, { "epoch": 64.77730838865419, "grad_norm": 6.658950328826904, "learning_rate": 1.999870689843029e-05, "loss": 0.0772, "step": 53700 }, { "epoch": 64.7893783946892, "grad_norm": 6.21690034866333, "learning_rate": 1.9998706657175495e-05, "loss": 0.0729, "step": 53710 }, { "epoch": 64.8014484007242, "grad_norm": 6.151093006134033, "learning_rate": 1.99987064159207e-05, "loss": 0.0752, "step": 53720 }, { "epoch": 64.8135184067592, "grad_norm": 6.473703861236572, "learning_rate": 1.9998706174665907e-05, "loss": 0.0755, "step": 53730 }, { "epoch": 64.8255884127942, "grad_norm": 6.500491619110107, "learning_rate": 1.9998705933411113e-05, "loss": 0.0741, "step": 53740 }, { "epoch": 64.83765841882921, "grad_norm": 6.362079620361328, "learning_rate": 1.999870569215632e-05, "loss": 0.0758, "step": 53750 }, { "epoch": 64.84972842486421, "grad_norm": 6.342347145080566, "learning_rate": 1.9998705450901526e-05, "loss": 0.0748, "step": 53760 }, { "epoch": 
64.86179843089921, "grad_norm": 6.154901027679443, "learning_rate": 1.9998705209646732e-05, "loss": 0.0741, "step": 53770 }, { "epoch": 64.87386843693422, "grad_norm": 7.033958911895752, "learning_rate": 1.9998704968391938e-05, "loss": 0.0757, "step": 53780 }, { "epoch": 64.88593844296922, "grad_norm": 6.254695415496826, "learning_rate": 1.9998704727137144e-05, "loss": 0.0749, "step": 53790 }, { "epoch": 64.89800844900422, "grad_norm": 6.806692600250244, "learning_rate": 1.999870448588235e-05, "loss": 0.0778, "step": 53800 }, { "epoch": 64.91007845503923, "grad_norm": 6.320444583892822, "learning_rate": 1.9998704244627557e-05, "loss": 0.0765, "step": 53810 }, { "epoch": 64.92214846107423, "grad_norm": 6.435061454772949, "learning_rate": 1.9998704003372763e-05, "loss": 0.0789, "step": 53820 }, { "epoch": 64.93421846710923, "grad_norm": 6.2396931648254395, "learning_rate": 1.999870376211797e-05, "loss": 0.0739, "step": 53830 }, { "epoch": 64.94628847314424, "grad_norm": 6.360030174255371, "learning_rate": 1.9998703520863175e-05, "loss": 0.0721, "step": 53840 }, { "epoch": 64.95835847917924, "grad_norm": 6.47424840927124, "learning_rate": 1.999870327960838e-05, "loss": 0.0729, "step": 53850 }, { "epoch": 64.97042848521424, "grad_norm": 6.010934829711914, "learning_rate": 1.9998703038353588e-05, "loss": 0.0744, "step": 53860 }, { "epoch": 64.98249849124925, "grad_norm": 6.38721227645874, "learning_rate": 1.9998702797098794e-05, "loss": 0.0769, "step": 53870 }, { "epoch": 64.99456849728425, "grad_norm": 6.097358226776123, "learning_rate": 1.9998702555844e-05, "loss": 0.0766, "step": 53880 }, { "epoch": 65.0060350030175, "grad_norm": 5.091323375701904, "learning_rate": 1.9998702314589206e-05, "loss": 0.0612, "step": 53890 }, { "epoch": 65.0181050090525, "grad_norm": 5.663048267364502, "learning_rate": 1.9998702073334413e-05, "loss": 0.0516, "step": 53900 }, { "epoch": 65.03017501508751, "grad_norm": 5.693656921386719, "learning_rate": 1.999870183207962e-05, "loss": 
0.0591, "step": 53910 }, { "epoch": 65.04224502112251, "grad_norm": 6.2347893714904785, "learning_rate": 1.9998701590824825e-05, "loss": 0.059, "step": 53920 }, { "epoch": 65.05431502715751, "grad_norm": 5.179647445678711, "learning_rate": 1.999870134957003e-05, "loss": 0.0589, "step": 53930 }, { "epoch": 65.06638503319252, "grad_norm": 5.25006628036499, "learning_rate": 1.9998701108315238e-05, "loss": 0.058, "step": 53940 }, { "epoch": 65.07845503922752, "grad_norm": 5.5557475090026855, "learning_rate": 1.9998700867060444e-05, "loss": 0.0624, "step": 53950 }, { "epoch": 65.09052504526252, "grad_norm": 5.977875709533691, "learning_rate": 1.999870062580565e-05, "loss": 0.0618, "step": 53960 }, { "epoch": 65.10259505129753, "grad_norm": 6.144732475280762, "learning_rate": 1.9998700384550856e-05, "loss": 0.0606, "step": 53970 }, { "epoch": 65.11466505733253, "grad_norm": 5.6120429039001465, "learning_rate": 1.9998700143296062e-05, "loss": 0.0585, "step": 53980 }, { "epoch": 65.12673506336753, "grad_norm": 5.34677267074585, "learning_rate": 1.999869990204127e-05, "loss": 0.0609, "step": 53990 }, { "epoch": 65.13880506940254, "grad_norm": 6.213379859924316, "learning_rate": 1.9998699660786475e-05, "loss": 0.0614, "step": 54000 }, { "epoch": 65.13880506940254, "eval_loss": 12.195273399353027, "eval_runtime": 8.1377, "eval_samples_per_second": 85.651, "eval_steps_per_second": 10.814, "step": 54000 }, { "epoch": 65.15087507543754, "grad_norm": 5.973742485046387, "learning_rate": 1.999869941953168e-05, "loss": 0.0634, "step": 54010 }, { "epoch": 65.16294508147254, "grad_norm": 5.884477138519287, "learning_rate": 1.9998699178276887e-05, "loss": 0.0637, "step": 54020 }, { "epoch": 65.17501508750755, "grad_norm": 5.35136079788208, "learning_rate": 1.9998698937022093e-05, "loss": 0.0625, "step": 54030 }, { "epoch": 65.18708509354255, "grad_norm": 5.386416912078857, "learning_rate": 1.9998698695767296e-05, "loss": 0.0641, "step": 54040 }, { "epoch": 65.19915509957755, 
"grad_norm": 5.664528846740723, "learning_rate": 1.9998698454512503e-05, "loss": 0.0631, "step": 54050 }, { "epoch": 65.21122510561256, "grad_norm": 5.997976303100586, "learning_rate": 1.999869821325771e-05, "loss": 0.0654, "step": 54060 }, { "epoch": 65.22329511164756, "grad_norm": 5.749844074249268, "learning_rate": 1.9998697972002915e-05, "loss": 0.0647, "step": 54070 }, { "epoch": 65.23536511768256, "grad_norm": 5.976388454437256, "learning_rate": 1.999869773074812e-05, "loss": 0.0664, "step": 54080 }, { "epoch": 65.24743512371757, "grad_norm": 6.795783996582031, "learning_rate": 1.9998697489493327e-05, "loss": 0.0693, "step": 54090 }, { "epoch": 65.25950512975257, "grad_norm": 6.409842014312744, "learning_rate": 1.9998697248238534e-05, "loss": 0.068, "step": 54100 }, { "epoch": 65.27157513578757, "grad_norm": 6.180502891540527, "learning_rate": 1.999869700698374e-05, "loss": 0.0684, "step": 54110 }, { "epoch": 65.28364514182257, "grad_norm": 5.878511905670166, "learning_rate": 1.9998696765728946e-05, "loss": 0.0662, "step": 54120 }, { "epoch": 65.29571514785758, "grad_norm": 6.194437503814697, "learning_rate": 1.9998696524474152e-05, "loss": 0.0644, "step": 54130 }, { "epoch": 65.30778515389258, "grad_norm": 6.388954162597656, "learning_rate": 1.999869628321936e-05, "loss": 0.0682, "step": 54140 }, { "epoch": 65.31985515992758, "grad_norm": 5.769654750823975, "learning_rate": 1.9998696041964565e-05, "loss": 0.0689, "step": 54150 }, { "epoch": 65.33192516596259, "grad_norm": 6.209526062011719, "learning_rate": 1.999869580070977e-05, "loss": 0.0676, "step": 54160 }, { "epoch": 65.34399517199759, "grad_norm": 5.831426620483398, "learning_rate": 1.9998695559454977e-05, "loss": 0.0683, "step": 54170 }, { "epoch": 65.3560651780326, "grad_norm": 5.870965957641602, "learning_rate": 1.9998695318200183e-05, "loss": 0.0678, "step": 54180 }, { "epoch": 65.3681351840676, "grad_norm": 5.728525161743164, "learning_rate": 1.999869507694539e-05, "loss": 0.0663, "step": 54190 
}, { "epoch": 65.3802051901026, "grad_norm": 5.972133159637451, "learning_rate": 1.9998694835690596e-05, "loss": 0.0655, "step": 54200 }, { "epoch": 65.3922751961376, "grad_norm": 6.124404430389404, "learning_rate": 1.9998694594435802e-05, "loss": 0.0668, "step": 54210 }, { "epoch": 65.4043452021726, "grad_norm": 5.8511810302734375, "learning_rate": 1.9998694353181008e-05, "loss": 0.0678, "step": 54220 }, { "epoch": 65.41641520820761, "grad_norm": 6.053267002105713, "learning_rate": 1.9998694111926214e-05, "loss": 0.0682, "step": 54230 }, { "epoch": 65.42848521424261, "grad_norm": 5.710860252380371, "learning_rate": 1.999869387067142e-05, "loss": 0.0695, "step": 54240 }, { "epoch": 65.44055522027762, "grad_norm": 5.702756881713867, "learning_rate": 1.9998693629416627e-05, "loss": 0.0677, "step": 54250 }, { "epoch": 65.45262522631262, "grad_norm": 6.615759372711182, "learning_rate": 1.9998693388161833e-05, "loss": 0.0657, "step": 54260 }, { "epoch": 65.46469523234762, "grad_norm": 6.5554656982421875, "learning_rate": 1.999869314690704e-05, "loss": 0.0695, "step": 54270 }, { "epoch": 65.47676523838263, "grad_norm": 6.821277618408203, "learning_rate": 1.9998692905652245e-05, "loss": 0.0702, "step": 54280 }, { "epoch": 65.48883524441763, "grad_norm": 6.220122814178467, "learning_rate": 1.999869266439745e-05, "loss": 0.0692, "step": 54290 }, { "epoch": 65.50090525045263, "grad_norm": 6.98976993560791, "learning_rate": 1.9998692423142655e-05, "loss": 0.0711, "step": 54300 }, { "epoch": 65.51297525648762, "grad_norm": 6.280595779418945, "learning_rate": 1.999869218188786e-05, "loss": 0.0725, "step": 54310 }, { "epoch": 65.52504526252262, "grad_norm": 6.211183071136475, "learning_rate": 1.9998691940633067e-05, "loss": 0.0662, "step": 54320 }, { "epoch": 65.53711526855763, "grad_norm": 6.734350681304932, "learning_rate": 1.9998691699378273e-05, "loss": 0.073, "step": 54330 }, { "epoch": 65.54918527459263, "grad_norm": 6.765491962432861, "learning_rate": 
1.999869145812348e-05, "loss": 0.0695, "step": 54340 }, { "epoch": 65.56125528062763, "grad_norm": 6.459558010101318, "learning_rate": 1.9998691216868686e-05, "loss": 0.0763, "step": 54350 }, { "epoch": 65.57332528666264, "grad_norm": 6.174942493438721, "learning_rate": 1.9998690975613892e-05, "loss": 0.0687, "step": 54360 }, { "epoch": 65.58539529269764, "grad_norm": 5.820307731628418, "learning_rate": 1.9998690734359098e-05, "loss": 0.0682, "step": 54370 }, { "epoch": 65.59746529873264, "grad_norm": 5.98231840133667, "learning_rate": 1.9998690493104304e-05, "loss": 0.0695, "step": 54380 }, { "epoch": 65.60953530476765, "grad_norm": 6.125372409820557, "learning_rate": 1.9998690251849514e-05, "loss": 0.0721, "step": 54390 }, { "epoch": 65.62160531080265, "grad_norm": 6.2620954513549805, "learning_rate": 1.999869001059472e-05, "loss": 0.0685, "step": 54400 }, { "epoch": 65.63367531683765, "grad_norm": 6.019007205963135, "learning_rate": 1.9998689769339926e-05, "loss": 0.0699, "step": 54410 }, { "epoch": 65.64574532287266, "grad_norm": 6.305918216705322, "learning_rate": 1.9998689528085133e-05, "loss": 0.0731, "step": 54420 }, { "epoch": 65.65781532890766, "grad_norm": 5.705585479736328, "learning_rate": 1.999868928683034e-05, "loss": 0.0706, "step": 54430 }, { "epoch": 65.66988533494266, "grad_norm": 6.233631610870361, "learning_rate": 1.9998689045575545e-05, "loss": 0.0714, "step": 54440 }, { "epoch": 65.68195534097767, "grad_norm": 5.773248672485352, "learning_rate": 1.9998688804320748e-05, "loss": 0.0743, "step": 54450 }, { "epoch": 65.69402534701267, "grad_norm": 6.253625392913818, "learning_rate": 1.9998688563065954e-05, "loss": 0.0722, "step": 54460 }, { "epoch": 65.70609535304767, "grad_norm": 6.101328372955322, "learning_rate": 1.999868832181116e-05, "loss": 0.0686, "step": 54470 }, { "epoch": 65.71816535908268, "grad_norm": 6.763965606689453, "learning_rate": 1.9998688080556366e-05, "loss": 0.0733, "step": 54480 }, { "epoch": 65.73023536511768, "grad_norm": 
6.381480693817139, "learning_rate": 1.9998687839301573e-05, "loss": 0.0724, "step": 54490 }, { "epoch": 65.74230537115268, "grad_norm": 6.424703598022461, "learning_rate": 1.999868759804678e-05, "loss": 0.0725, "step": 54500 }, { "epoch": 65.74230537115268, "eval_loss": 12.237269401550293, "eval_runtime": 8.4156, "eval_samples_per_second": 82.822, "eval_steps_per_second": 10.457, "step": 54500 }, { "epoch": 65.75437537718769, "grad_norm": 6.226495265960693, "learning_rate": 1.9998687356791985e-05, "loss": 0.0734, "step": 54510 }, { "epoch": 65.76644538322269, "grad_norm": 6.465895175933838, "learning_rate": 1.999868711553719e-05, "loss": 0.0727, "step": 54520 }, { "epoch": 65.77851538925769, "grad_norm": 6.24713134765625, "learning_rate": 1.9998686874282397e-05, "loss": 0.0732, "step": 54530 }, { "epoch": 65.7905853952927, "grad_norm": 6.22451639175415, "learning_rate": 1.9998686633027604e-05, "loss": 0.0747, "step": 54540 }, { "epoch": 65.8026554013277, "grad_norm": 6.921995162963867, "learning_rate": 1.999868639177281e-05, "loss": 0.0735, "step": 54550 }, { "epoch": 65.8147254073627, "grad_norm": 6.046572208404541, "learning_rate": 1.9998686150518016e-05, "loss": 0.0721, "step": 54560 }, { "epoch": 65.8267954133977, "grad_norm": 6.15524435043335, "learning_rate": 1.9998685909263222e-05, "loss": 0.0728, "step": 54570 }, { "epoch": 65.83886541943271, "grad_norm": 6.707143783569336, "learning_rate": 1.999868566800843e-05, "loss": 0.074, "step": 54580 }, { "epoch": 65.85093542546771, "grad_norm": 6.946089744567871, "learning_rate": 1.9998685426753635e-05, "loss": 0.0734, "step": 54590 }, { "epoch": 65.86300543150271, "grad_norm": 7.009385585784912, "learning_rate": 1.999868518549884e-05, "loss": 0.0733, "step": 54600 }, { "epoch": 65.87507543753772, "grad_norm": 5.492537021636963, "learning_rate": 1.9998684944244047e-05, "loss": 0.0742, "step": 54610 }, { "epoch": 65.88714544357272, "grad_norm": 6.403148651123047, "learning_rate": 1.9998684702989253e-05, "loss": 
0.0738, "step": 54620 }, { "epoch": 65.89921544960772, "grad_norm": 5.784932613372803, "learning_rate": 1.999868446173446e-05, "loss": 0.072, "step": 54630 }, { "epoch": 65.91128545564273, "grad_norm": 6.8038859367370605, "learning_rate": 1.9998684220479666e-05, "loss": 0.0744, "step": 54640 }, { "epoch": 65.92335546167773, "grad_norm": 6.901299476623535, "learning_rate": 1.9998683979224872e-05, "loss": 0.0755, "step": 54650 }, { "epoch": 65.93542546771273, "grad_norm": 6.109143257141113, "learning_rate": 1.9998683737970078e-05, "loss": 0.0745, "step": 54660 }, { "epoch": 65.94749547374774, "grad_norm": 6.732188701629639, "learning_rate": 1.9998683496715284e-05, "loss": 0.0757, "step": 54670 }, { "epoch": 65.95956547978274, "grad_norm": 5.819955825805664, "learning_rate": 1.999868325546049e-05, "loss": 0.0753, "step": 54680 }, { "epoch": 65.97163548581774, "grad_norm": 6.44964599609375, "learning_rate": 1.9998683014205697e-05, "loss": 0.0741, "step": 54690 }, { "epoch": 65.98370549185275, "grad_norm": 6.924170970916748, "learning_rate": 1.99986827729509e-05, "loss": 0.0764, "step": 54700 }, { "epoch": 65.99577549788775, "grad_norm": 6.0808515548706055, "learning_rate": 1.9998682531696106e-05, "loss": 0.0777, "step": 54710 }, { "epoch": 66.007242003621, "grad_norm": 5.644708156585693, "learning_rate": 1.9998682290441312e-05, "loss": 0.0594, "step": 54720 }, { "epoch": 66.019312009656, "grad_norm": 6.169862747192383, "learning_rate": 1.999868204918652e-05, "loss": 0.0544, "step": 54730 }, { "epoch": 66.03138201569101, "grad_norm": 6.080380916595459, "learning_rate": 1.9998681807931725e-05, "loss": 0.0552, "step": 54740 }, { "epoch": 66.04345202172601, "grad_norm": 5.018310546875, "learning_rate": 1.999868156667693e-05, "loss": 0.0556, "step": 54750 }, { "epoch": 66.05552202776101, "grad_norm": 5.470941543579102, "learning_rate": 1.9998681325422137e-05, "loss": 0.057, "step": 54760 }, { "epoch": 66.06759203379602, "grad_norm": 6.2448344230651855, "learning_rate": 
1.9998681084167343e-05, "loss": 0.0591, "step": 54770 }, { "epoch": 66.07966203983102, "grad_norm": 5.545588970184326, "learning_rate": 1.999868084291255e-05, "loss": 0.0602, "step": 54780 }, { "epoch": 66.09173204586602, "grad_norm": 6.245748519897461, "learning_rate": 1.9998680601657756e-05, "loss": 0.0603, "step": 54790 }, { "epoch": 66.10380205190103, "grad_norm": 5.948812007904053, "learning_rate": 1.9998680360402962e-05, "loss": 0.0584, "step": 54800 }, { "epoch": 66.11587205793603, "grad_norm": 5.526739597320557, "learning_rate": 1.9998680119148168e-05, "loss": 0.0609, "step": 54810 }, { "epoch": 66.12794206397103, "grad_norm": 5.164377689361572, "learning_rate": 1.9998679877893374e-05, "loss": 0.0606, "step": 54820 }, { "epoch": 66.14001207000604, "grad_norm": 5.641938209533691, "learning_rate": 1.999867963663858e-05, "loss": 0.0601, "step": 54830 }, { "epoch": 66.15208207604104, "grad_norm": 5.435865879058838, "learning_rate": 1.9998679395383787e-05, "loss": 0.0593, "step": 54840 }, { "epoch": 66.16415208207604, "grad_norm": 5.253557205200195, "learning_rate": 1.9998679154128993e-05, "loss": 0.0608, "step": 54850 }, { "epoch": 66.17622208811105, "grad_norm": 5.8967604637146, "learning_rate": 1.99986789128742e-05, "loss": 0.0635, "step": 54860 }, { "epoch": 66.18829209414605, "grad_norm": 5.9011969566345215, "learning_rate": 1.9998678671619405e-05, "loss": 0.0608, "step": 54870 }, { "epoch": 66.20036210018105, "grad_norm": 5.811470985412598, "learning_rate": 1.999867843036461e-05, "loss": 0.0651, "step": 54880 }, { "epoch": 66.21243210621606, "grad_norm": 5.668938159942627, "learning_rate": 1.9998678189109818e-05, "loss": 0.0638, "step": 54890 }, { "epoch": 66.22450211225106, "grad_norm": 5.939190864562988, "learning_rate": 1.9998677947855024e-05, "loss": 0.0624, "step": 54900 }, { "epoch": 66.23657211828606, "grad_norm": 6.0085344314575195, "learning_rate": 1.999867770660023e-05, "loss": 0.0649, "step": 54910 }, { "epoch": 66.24864212432107, "grad_norm": 
5.551784992218018, "learning_rate": 1.9998677465345436e-05, "loss": 0.0632, "step": 54920 }, { "epoch": 66.26071213035607, "grad_norm": 6.374574661254883, "learning_rate": 1.9998677224090643e-05, "loss": 0.066, "step": 54930 }, { "epoch": 66.27278213639107, "grad_norm": 5.48739767074585, "learning_rate": 1.999867698283585e-05, "loss": 0.0654, "step": 54940 }, { "epoch": 66.28485214242608, "grad_norm": 5.8330488204956055, "learning_rate": 1.9998676741581055e-05, "loss": 0.0654, "step": 54950 }, { "epoch": 66.29692214846108, "grad_norm": 5.53649377822876, "learning_rate": 1.999867650032626e-05, "loss": 0.0638, "step": 54960 }, { "epoch": 66.30899215449608, "grad_norm": 6.176201820373535, "learning_rate": 1.9998676259071468e-05, "loss": 0.063, "step": 54970 }, { "epoch": 66.32106216053108, "grad_norm": 5.589507579803467, "learning_rate": 1.9998676017816674e-05, "loss": 0.0655, "step": 54980 }, { "epoch": 66.33313216656609, "grad_norm": 5.726750373840332, "learning_rate": 1.999867577656188e-05, "loss": 0.0657, "step": 54990 }, { "epoch": 66.34520217260109, "grad_norm": 6.356719017028809, "learning_rate": 1.9998675535307086e-05, "loss": 0.0655, "step": 55000 }, { "epoch": 66.34520217260109, "eval_loss": 12.237953186035156, "eval_runtime": 8.1435, "eval_samples_per_second": 85.59, "eval_steps_per_second": 10.806, "step": 55000 }, { "epoch": 66.3572721786361, "grad_norm": 6.130847454071045, "learning_rate": 1.9998675294052292e-05, "loss": 0.0655, "step": 55010 }, { "epoch": 66.3693421846711, "grad_norm": 5.836452484130859, "learning_rate": 1.99986750527975e-05, "loss": 0.0659, "step": 55020 }, { "epoch": 66.3814121907061, "grad_norm": 4.969335079193115, "learning_rate": 1.9998674811542705e-05, "loss": 0.0648, "step": 55030 }, { "epoch": 66.3934821967411, "grad_norm": 5.557551860809326, "learning_rate": 1.999867457028791e-05, "loss": 0.0644, "step": 55040 }, { "epoch": 66.40555220277611, "grad_norm": 5.930431842803955, "learning_rate": 1.9998674329033117e-05, "loss": 
0.065, "step": 55050 }, { "epoch": 66.41762220881111, "grad_norm": 6.179157257080078, "learning_rate": 1.9998674087778324e-05, "loss": 0.0665, "step": 55060 }, { "epoch": 66.42969221484611, "grad_norm": 6.439099311828613, "learning_rate": 1.999867384652353e-05, "loss": 0.0663, "step": 55070 }, { "epoch": 66.44176222088112, "grad_norm": 6.269090175628662, "learning_rate": 1.9998673605268736e-05, "loss": 0.0669, "step": 55080 }, { "epoch": 66.45383222691612, "grad_norm": 6.255475997924805, "learning_rate": 1.9998673364013942e-05, "loss": 0.0694, "step": 55090 }, { "epoch": 66.46590223295112, "grad_norm": 6.077669143676758, "learning_rate": 1.999867312275915e-05, "loss": 0.072, "step": 55100 }, { "epoch": 66.47797223898613, "grad_norm": 6.344482421875, "learning_rate": 1.9998672881504355e-05, "loss": 0.0705, "step": 55110 }, { "epoch": 66.49004224502113, "grad_norm": 6.111152648925781, "learning_rate": 1.9998672640249557e-05, "loss": 0.0706, "step": 55120 }, { "epoch": 66.50211225105613, "grad_norm": 6.052796363830566, "learning_rate": 1.9998672398994764e-05, "loss": 0.0683, "step": 55130 }, { "epoch": 66.51418225709112, "grad_norm": 6.016850471496582, "learning_rate": 1.999867215773997e-05, "loss": 0.0655, "step": 55140 }, { "epoch": 66.52625226312612, "grad_norm": 5.768632411956787, "learning_rate": 1.9998671916485176e-05, "loss": 0.069, "step": 55150 }, { "epoch": 66.53832226916113, "grad_norm": 6.094193458557129, "learning_rate": 1.9998671675230382e-05, "loss": 0.0691, "step": 55160 }, { "epoch": 66.55039227519613, "grad_norm": 6.209371089935303, "learning_rate": 1.999867143397559e-05, "loss": 0.0674, "step": 55170 }, { "epoch": 66.56246228123113, "grad_norm": 6.154359817504883, "learning_rate": 1.9998671192720795e-05, "loss": 0.07, "step": 55180 }, { "epoch": 66.57453228726614, "grad_norm": 6.089198112487793, "learning_rate": 1.9998670951466e-05, "loss": 0.0693, "step": 55190 }, { "epoch": 66.58660229330114, "grad_norm": 6.295828342437744, "learning_rate": 
1.9998670710211207e-05, "loss": 0.0672, "step": 55200 }, { "epoch": 66.59867229933614, "grad_norm": 6.800573825836182, "learning_rate": 1.9998670468956413e-05, "loss": 0.0716, "step": 55210 }, { "epoch": 66.61074230537115, "grad_norm": 5.738896369934082, "learning_rate": 1.999867022770162e-05, "loss": 0.0715, "step": 55220 }, { "epoch": 66.62281231140615, "grad_norm": 6.151996612548828, "learning_rate": 1.9998669986446826e-05, "loss": 0.0687, "step": 55230 }, { "epoch": 66.63488231744115, "grad_norm": 6.341254711151123, "learning_rate": 1.9998669745192032e-05, "loss": 0.0702, "step": 55240 }, { "epoch": 66.64695232347616, "grad_norm": 6.718547821044922, "learning_rate": 1.9998669503937238e-05, "loss": 0.0735, "step": 55250 }, { "epoch": 66.65902232951116, "grad_norm": 6.336780071258545, "learning_rate": 1.9998669262682444e-05, "loss": 0.0736, "step": 55260 }, { "epoch": 66.67109233554616, "grad_norm": 5.834489345550537, "learning_rate": 1.999866902142765e-05, "loss": 0.0703, "step": 55270 }, { "epoch": 66.68316234158117, "grad_norm": 6.234102249145508, "learning_rate": 1.9998668780172857e-05, "loss": 0.0702, "step": 55280 }, { "epoch": 66.69523234761617, "grad_norm": 6.225027561187744, "learning_rate": 1.9998668538918063e-05, "loss": 0.073, "step": 55290 }, { "epoch": 66.70730235365117, "grad_norm": 5.830474853515625, "learning_rate": 1.999866829766327e-05, "loss": 0.0726, "step": 55300 }, { "epoch": 66.71937235968618, "grad_norm": 6.279987812042236, "learning_rate": 1.9998668056408476e-05, "loss": 0.0683, "step": 55310 }, { "epoch": 66.73144236572118, "grad_norm": 6.33789587020874, "learning_rate": 1.9998667815153682e-05, "loss": 0.0745, "step": 55320 }, { "epoch": 66.74351237175618, "grad_norm": 6.65038537979126, "learning_rate": 1.9998667573898888e-05, "loss": 0.0721, "step": 55330 }, { "epoch": 66.75558237779119, "grad_norm": 5.627010345458984, "learning_rate": 1.9998667332644094e-05, "loss": 0.066, "step": 55340 }, { "epoch": 66.76765238382619, "grad_norm": 
6.798277378082275, "learning_rate": 1.99986670913893e-05, "loss": 0.0723, "step": 55350 }, { "epoch": 66.77972238986119, "grad_norm": 6.413732528686523, "learning_rate": 1.9998666850134507e-05, "loss": 0.0758, "step": 55360 }, { "epoch": 66.7917923958962, "grad_norm": 6.684319019317627, "learning_rate": 1.999866660887971e-05, "loss": 0.0759, "step": 55370 }, { "epoch": 66.8038624019312, "grad_norm": 5.903954029083252, "learning_rate": 1.9998666367624916e-05, "loss": 0.0733, "step": 55380 }, { "epoch": 66.8159324079662, "grad_norm": 6.226583480834961, "learning_rate": 1.9998666126370122e-05, "loss": 0.0735, "step": 55390 }, { "epoch": 66.8280024140012, "grad_norm": 5.80227518081665, "learning_rate": 1.9998665885115328e-05, "loss": 0.0712, "step": 55400 }, { "epoch": 66.84007242003621, "grad_norm": 5.992454528808594, "learning_rate": 1.9998665643860534e-05, "loss": 0.0716, "step": 55410 }, { "epoch": 66.85214242607121, "grad_norm": 6.798959255218506, "learning_rate": 1.999866540260574e-05, "loss": 0.0714, "step": 55420 }, { "epoch": 66.86421243210621, "grad_norm": 6.8714799880981445, "learning_rate": 1.9998665161350947e-05, "loss": 0.072, "step": 55430 }, { "epoch": 66.87628243814122, "grad_norm": 6.111284255981445, "learning_rate": 1.9998664920096153e-05, "loss": 0.0705, "step": 55440 }, { "epoch": 66.88835244417622, "grad_norm": 6.500970840454102, "learning_rate": 1.999866467884136e-05, "loss": 0.0743, "step": 55450 }, { "epoch": 66.90042245021122, "grad_norm": 6.886479377746582, "learning_rate": 1.9998664437586565e-05, "loss": 0.0735, "step": 55460 }, { "epoch": 66.91249245624623, "grad_norm": 7.066665172576904, "learning_rate": 1.9998664196331775e-05, "loss": 0.0735, "step": 55470 }, { "epoch": 66.92456246228123, "grad_norm": 5.615832328796387, "learning_rate": 1.999866395507698e-05, "loss": 0.0763, "step": 55480 }, { "epoch": 66.93663246831623, "grad_norm": 7.424221992492676, "learning_rate": 1.9998663713822187e-05, "loss": 0.0725, "step": 55490 }, { "epoch": 
66.94870247435124, "grad_norm": 6.500857830047607, "learning_rate": 1.9998663472567394e-05, "loss": 0.0762, "step": 55500 }, { "epoch": 66.94870247435124, "eval_loss": 12.264263153076172, "eval_runtime": 8.1644, "eval_samples_per_second": 85.371, "eval_steps_per_second": 10.779, "step": 55500 }, { "epoch": 66.96077248038624, "grad_norm": 6.350656509399414, "learning_rate": 1.99986632313126e-05, "loss": 0.0752, "step": 55510 }, { "epoch": 66.97284248642124, "grad_norm": 6.499892711639404, "learning_rate": 1.9998662990057806e-05, "loss": 0.0742, "step": 55520 }, { "epoch": 66.98491249245625, "grad_norm": 6.269669532775879, "learning_rate": 1.999866274880301e-05, "loss": 0.0774, "step": 55530 }, { "epoch": 66.99698249849125, "grad_norm": 6.325533866882324, "learning_rate": 1.9998662507548215e-05, "loss": 0.0751, "step": 55540 }, { "epoch": 67.0084490042245, "grad_norm": 4.988296031951904, "learning_rate": 1.999866226629342e-05, "loss": 0.0581, "step": 55550 }, { "epoch": 67.0205190102595, "grad_norm": 5.402826309204102, "learning_rate": 1.9998662025038628e-05, "loss": 0.0514, "step": 55560 }, { "epoch": 67.03258901629451, "grad_norm": 4.941632270812988, "learning_rate": 1.9998661783783834e-05, "loss": 0.0532, "step": 55570 }, { "epoch": 67.04465902232951, "grad_norm": 4.978194713592529, "learning_rate": 1.999866154252904e-05, "loss": 0.0577, "step": 55580 }, { "epoch": 67.05672902836451, "grad_norm": 5.183745861053467, "learning_rate": 1.9998661301274246e-05, "loss": 0.0567, "step": 55590 }, { "epoch": 67.06879903439952, "grad_norm": 5.583774089813232, "learning_rate": 1.9998661060019452e-05, "loss": 0.0548, "step": 55600 }, { "epoch": 67.08086904043452, "grad_norm": 5.4761881828308105, "learning_rate": 1.999866081876466e-05, "loss": 0.056, "step": 55610 }, { "epoch": 67.09293904646952, "grad_norm": 5.491135597229004, "learning_rate": 1.9998660577509865e-05, "loss": 0.0569, "step": 55620 }, { "epoch": 67.10500905250453, "grad_norm": 6.075554847717285, "learning_rate": 
1.999866033625507e-05, "loss": 0.0612, "step": 55630 }, { "epoch": 67.11707905853953, "grad_norm": 6.370484352111816, "learning_rate": 1.9998660095000277e-05, "loss": 0.063, "step": 55640 }, { "epoch": 67.12914906457453, "grad_norm": 5.257132530212402, "learning_rate": 1.9998659853745483e-05, "loss": 0.0636, "step": 55650 }, { "epoch": 67.14121907060954, "grad_norm": 5.723682880401611, "learning_rate": 1.999865961249069e-05, "loss": 0.0611, "step": 55660 }, { "epoch": 67.15328907664454, "grad_norm": 5.8559441566467285, "learning_rate": 1.9998659371235896e-05, "loss": 0.0635, "step": 55670 }, { "epoch": 67.16535908267954, "grad_norm": 6.131743907928467, "learning_rate": 1.9998659129981102e-05, "loss": 0.0629, "step": 55680 }, { "epoch": 67.17742908871455, "grad_norm": 5.573004245758057, "learning_rate": 1.999865888872631e-05, "loss": 0.0613, "step": 55690 }, { "epoch": 67.18949909474955, "grad_norm": 5.774176120758057, "learning_rate": 1.9998658647471515e-05, "loss": 0.0608, "step": 55700 }, { "epoch": 67.20156910078455, "grad_norm": 5.9344258308410645, "learning_rate": 1.999865840621672e-05, "loss": 0.0622, "step": 55710 }, { "epoch": 67.21363910681956, "grad_norm": 6.022648334503174, "learning_rate": 1.9998658164961927e-05, "loss": 0.0655, "step": 55720 }, { "epoch": 67.22570911285456, "grad_norm": 5.837624549865723, "learning_rate": 1.9998657923707133e-05, "loss": 0.0631, "step": 55730 }, { "epoch": 67.23777911888956, "grad_norm": 6.43239688873291, "learning_rate": 1.999865768245234e-05, "loss": 0.0614, "step": 55740 }, { "epoch": 67.24984912492457, "grad_norm": 5.743227005004883, "learning_rate": 1.9998657441197546e-05, "loss": 0.063, "step": 55750 }, { "epoch": 67.26191913095957, "grad_norm": 5.77793025970459, "learning_rate": 1.9998657199942752e-05, "loss": 0.0623, "step": 55760 }, { "epoch": 67.27398913699457, "grad_norm": 6.047115802764893, "learning_rate": 1.9998656958687958e-05, "loss": 0.0646, "step": 55770 }, { "epoch": 67.28605914302958, "grad_norm": 
6.149204730987549, "learning_rate": 1.999865671743316e-05, "loss": 0.0657, "step": 55780 }, { "epoch": 67.29812914906458, "grad_norm": 6.204369068145752, "learning_rate": 1.9998656476178367e-05, "loss": 0.0647, "step": 55790 }, { "epoch": 67.31019915509958, "grad_norm": 6.092814922332764, "learning_rate": 1.9998656234923573e-05, "loss": 0.0637, "step": 55800 }, { "epoch": 67.32226916113459, "grad_norm": 5.8397979736328125, "learning_rate": 1.999865599366878e-05, "loss": 0.0638, "step": 55810 }, { "epoch": 67.33433916716959, "grad_norm": 6.12357234954834, "learning_rate": 1.9998655752413986e-05, "loss": 0.063, "step": 55820 }, { "epoch": 67.34640917320459, "grad_norm": 6.216517925262451, "learning_rate": 1.9998655511159192e-05, "loss": 0.0646, "step": 55830 }, { "epoch": 67.3584791792396, "grad_norm": 6.0373992919921875, "learning_rate": 1.9998655269904398e-05, "loss": 0.0666, "step": 55840 }, { "epoch": 67.3705491852746, "grad_norm": 5.667769432067871, "learning_rate": 1.9998655028649604e-05, "loss": 0.0655, "step": 55850 }, { "epoch": 67.3826191913096, "grad_norm": 6.154231071472168, "learning_rate": 1.999865478739481e-05, "loss": 0.0649, "step": 55860 }, { "epoch": 67.3946891973446, "grad_norm": 5.736454963684082, "learning_rate": 1.9998654546140017e-05, "loss": 0.0668, "step": 55870 }, { "epoch": 67.40675920337961, "grad_norm": 6.345763206481934, "learning_rate": 1.9998654304885223e-05, "loss": 0.0668, "step": 55880 }, { "epoch": 67.41882920941461, "grad_norm": 5.104516506195068, "learning_rate": 1.999865406363043e-05, "loss": 0.0652, "step": 55890 }, { "epoch": 67.43089921544961, "grad_norm": 5.594074726104736, "learning_rate": 1.9998653822375635e-05, "loss": 0.0667, "step": 55900 }, { "epoch": 67.44296922148462, "grad_norm": 6.000238418579102, "learning_rate": 1.999865358112084e-05, "loss": 0.0646, "step": 55910 }, { "epoch": 67.45503922751962, "grad_norm": 6.098887920379639, "learning_rate": 1.9998653339866048e-05, "loss": 0.0671, "step": 55920 }, { "epoch": 
67.46710923355462, "grad_norm": 5.657845497131348, "learning_rate": 1.9998653098611254e-05, "loss": 0.065, "step": 55930 }, { "epoch": 67.47917923958963, "grad_norm": 5.830939769744873, "learning_rate": 1.999865285735646e-05, "loss": 0.0695, "step": 55940 }, { "epoch": 67.49124924562463, "grad_norm": 6.325922966003418, "learning_rate": 1.9998652616101667e-05, "loss": 0.0666, "step": 55950 }, { "epoch": 67.50331925165963, "grad_norm": 6.380095958709717, "learning_rate": 1.9998652374846873e-05, "loss": 0.0699, "step": 55960 }, { "epoch": 67.51538925769462, "grad_norm": 5.894421100616455, "learning_rate": 1.999865213359208e-05, "loss": 0.0719, "step": 55970 }, { "epoch": 67.52745926372963, "grad_norm": 6.698941707611084, "learning_rate": 1.9998651892337285e-05, "loss": 0.0712, "step": 55980 }, { "epoch": 67.53952926976463, "grad_norm": 6.437810897827148, "learning_rate": 1.999865165108249e-05, "loss": 0.0703, "step": 55990 }, { "epoch": 67.55159927579963, "grad_norm": 6.013824462890625, "learning_rate": 1.9998651409827698e-05, "loss": 0.068, "step": 56000 }, { "epoch": 67.55159927579963, "eval_loss": 12.264579772949219, "eval_runtime": 8.1427, "eval_samples_per_second": 85.598, "eval_steps_per_second": 10.807, "step": 56000 }, { "epoch": 67.56366928183463, "grad_norm": 6.16840124130249, "learning_rate": 1.9998651168572904e-05, "loss": 0.0669, "step": 56010 }, { "epoch": 67.57573928786964, "grad_norm": 6.2523579597473145, "learning_rate": 1.999865092731811e-05, "loss": 0.0702, "step": 56020 }, { "epoch": 67.58780929390464, "grad_norm": 6.467589855194092, "learning_rate": 1.9998650686063316e-05, "loss": 0.0713, "step": 56030 }, { "epoch": 67.59987929993964, "grad_norm": 6.260464191436768, "learning_rate": 1.9998650444808522e-05, "loss": 0.0683, "step": 56040 }, { "epoch": 67.61194930597465, "grad_norm": 6.405597686767578, "learning_rate": 1.999865020355373e-05, "loss": 0.0687, "step": 56050 }, { "epoch": 67.62401931200965, "grad_norm": 5.771214485168457, 
"learning_rate": 1.9998649962298935e-05, "loss": 0.0677, "step": 56060 }, { "epoch": 67.63608931804465, "grad_norm": 5.707738399505615, "learning_rate": 1.999864972104414e-05, "loss": 0.0672, "step": 56070 }, { "epoch": 67.64815932407966, "grad_norm": 5.914234161376953, "learning_rate": 1.9998649479789347e-05, "loss": 0.0705, "step": 56080 }, { "epoch": 67.66022933011466, "grad_norm": 6.290699481964111, "learning_rate": 1.9998649238534554e-05, "loss": 0.0709, "step": 56090 }, { "epoch": 67.67229933614966, "grad_norm": 5.708197116851807, "learning_rate": 1.999864899727976e-05, "loss": 0.0705, "step": 56100 }, { "epoch": 67.68436934218467, "grad_norm": 5.629133224487305, "learning_rate": 1.9998648756024966e-05, "loss": 0.0688, "step": 56110 }, { "epoch": 67.69643934821967, "grad_norm": 6.262328624725342, "learning_rate": 1.9998648514770172e-05, "loss": 0.0697, "step": 56120 }, { "epoch": 67.70850935425467, "grad_norm": 6.083400726318359, "learning_rate": 1.999864827351538e-05, "loss": 0.0699, "step": 56130 }, { "epoch": 67.72057936028968, "grad_norm": 6.141289234161377, "learning_rate": 1.9998648032260585e-05, "loss": 0.0701, "step": 56140 }, { "epoch": 67.73264936632468, "grad_norm": 5.800866603851318, "learning_rate": 1.999864779100579e-05, "loss": 0.0723, "step": 56150 }, { "epoch": 67.74471937235968, "grad_norm": 6.273632526397705, "learning_rate": 1.9998647549750997e-05, "loss": 0.0739, "step": 56160 }, { "epoch": 67.75678937839469, "grad_norm": 5.932977676391602, "learning_rate": 1.9998647308496203e-05, "loss": 0.0732, "step": 56170 }, { "epoch": 67.76885938442969, "grad_norm": 6.658799648284912, "learning_rate": 1.999864706724141e-05, "loss": 0.0717, "step": 56180 }, { "epoch": 67.78092939046469, "grad_norm": 7.517887592315674, "learning_rate": 1.9998646825986616e-05, "loss": 0.0731, "step": 56190 }, { "epoch": 67.7929993964997, "grad_norm": 6.361918926239014, "learning_rate": 1.999864658473182e-05, "loss": 0.0736, "step": 56200 }, { "epoch": 67.8050694025347, 
"grad_norm": 6.276568412780762, "learning_rate": 1.9998646343477025e-05, "loss": 0.0696, "step": 56210 }, { "epoch": 67.8171394085697, "grad_norm": 5.792972564697266, "learning_rate": 1.999864610222223e-05, "loss": 0.0704, "step": 56220 }, { "epoch": 67.8292094146047, "grad_norm": 6.212174415588379, "learning_rate": 1.9998645860967437e-05, "loss": 0.0723, "step": 56230 }, { "epoch": 67.84127942063971, "grad_norm": 6.158066749572754, "learning_rate": 1.9998645619712643e-05, "loss": 0.0709, "step": 56240 }, { "epoch": 67.85334942667471, "grad_norm": 5.499885559082031, "learning_rate": 1.999864537845785e-05, "loss": 0.0718, "step": 56250 }, { "epoch": 67.86541943270971, "grad_norm": 6.4851813316345215, "learning_rate": 1.9998645137203056e-05, "loss": 0.0711, "step": 56260 }, { "epoch": 67.87748943874472, "grad_norm": 5.760282039642334, "learning_rate": 1.9998644895948262e-05, "loss": 0.0713, "step": 56270 }, { "epoch": 67.88955944477972, "grad_norm": 6.297687530517578, "learning_rate": 1.9998644654693468e-05, "loss": 0.0706, "step": 56280 }, { "epoch": 67.90162945081472, "grad_norm": 6.5161004066467285, "learning_rate": 1.9998644413438674e-05, "loss": 0.0711, "step": 56290 }, { "epoch": 67.91369945684973, "grad_norm": 6.034632205963135, "learning_rate": 1.999864417218388e-05, "loss": 0.0741, "step": 56300 }, { "epoch": 67.92576946288473, "grad_norm": 6.243621826171875, "learning_rate": 1.9998643930929087e-05, "loss": 0.0736, "step": 56310 }, { "epoch": 67.93783946891973, "grad_norm": 6.371464729309082, "learning_rate": 1.9998643689674293e-05, "loss": 0.0732, "step": 56320 }, { "epoch": 67.94990947495474, "grad_norm": 5.7764201164245605, "learning_rate": 1.99986434484195e-05, "loss": 0.0734, "step": 56330 }, { "epoch": 67.96197948098974, "grad_norm": 6.264448165893555, "learning_rate": 1.9998643207164706e-05, "loss": 0.0736, "step": 56340 }, { "epoch": 67.97404948702474, "grad_norm": 6.7061333656311035, "learning_rate": 1.9998642965909912e-05, "loss": 0.0739, "step": 
56350 }, { "epoch": 67.98611949305975, "grad_norm": 6.342446804046631, "learning_rate": 1.9998642724655118e-05, "loss": 0.0738, "step": 56360 }, { "epoch": 67.99818949909475, "grad_norm": 6.571288108825684, "learning_rate": 1.9998642483400324e-05, "loss": 0.0726, "step": 56370 }, { "epoch": 68.009656004828, "grad_norm": 5.460779190063477, "learning_rate": 1.999864224214553e-05, "loss": 0.0557, "step": 56380 }, { "epoch": 68.021726010863, "grad_norm": 5.222171783447266, "learning_rate": 1.9998642000890737e-05, "loss": 0.0514, "step": 56390 }, { "epoch": 68.03379601689801, "grad_norm": 5.361963272094727, "learning_rate": 1.9998641759635943e-05, "loss": 0.055, "step": 56400 }, { "epoch": 68.04586602293301, "grad_norm": 5.497074604034424, "learning_rate": 1.999864151838115e-05, "loss": 0.0551, "step": 56410 }, { "epoch": 68.05793602896802, "grad_norm": 5.980125427246094, "learning_rate": 1.9998641277126355e-05, "loss": 0.0552, "step": 56420 }, { "epoch": 68.07000603500302, "grad_norm": 5.746893405914307, "learning_rate": 1.999864103587156e-05, "loss": 0.0571, "step": 56430 }, { "epoch": 68.08207604103802, "grad_norm": 5.961381435394287, "learning_rate": 1.9998640794616768e-05, "loss": 0.0603, "step": 56440 }, { "epoch": 68.09414604707302, "grad_norm": 4.955400466918945, "learning_rate": 1.999864055336197e-05, "loss": 0.0582, "step": 56450 }, { "epoch": 68.10621605310803, "grad_norm": 5.148478031158447, "learning_rate": 1.9998640312107177e-05, "loss": 0.0594, "step": 56460 }, { "epoch": 68.11828605914303, "grad_norm": 5.5606865882873535, "learning_rate": 1.9998640070852383e-05, "loss": 0.0583, "step": 56470 }, { "epoch": 68.13035606517803, "grad_norm": 5.190249919891357, "learning_rate": 1.999863982959759e-05, "loss": 0.0596, "step": 56480 }, { "epoch": 68.14242607121304, "grad_norm": 5.089982986450195, "learning_rate": 1.9998639588342795e-05, "loss": 0.0597, "step": 56490 }, { "epoch": 68.15449607724804, "grad_norm": 5.66483736038208, "learning_rate": 
1.9998639347088e-05, "loss": 0.0596, "step": 56500 }, { "epoch": 68.15449607724804, "eval_loss": 12.284185409545898, "eval_runtime": 8.1567, "eval_samples_per_second": 85.451, "eval_steps_per_second": 10.789, "step": 56500 }, { "epoch": 68.16656608328304, "grad_norm": 5.765948295593262, "learning_rate": 1.9998639105833208e-05, "loss": 0.0606, "step": 56510 }, { "epoch": 68.17863608931805, "grad_norm": 5.125070095062256, "learning_rate": 1.9998638864578414e-05, "loss": 0.0601, "step": 56520 }, { "epoch": 68.19070609535305, "grad_norm": 5.61266565322876, "learning_rate": 1.999863862332362e-05, "loss": 0.0607, "step": 56530 }, { "epoch": 68.20277610138805, "grad_norm": 5.457344055175781, "learning_rate": 1.9998638382068826e-05, "loss": 0.0609, "step": 56540 }, { "epoch": 68.21484610742306, "grad_norm": 5.872602462768555, "learning_rate": 1.9998638140814033e-05, "loss": 0.0605, "step": 56550 }, { "epoch": 68.22691611345806, "grad_norm": 5.455148696899414, "learning_rate": 1.9998637899559242e-05, "loss": 0.0625, "step": 56560 }, { "epoch": 68.23898611949306, "grad_norm": 5.779104232788086, "learning_rate": 1.999863765830445e-05, "loss": 0.0594, "step": 56570 }, { "epoch": 68.25105612552807, "grad_norm": 5.601633548736572, "learning_rate": 1.9998637417049655e-05, "loss": 0.0615, "step": 56580 }, { "epoch": 68.26312613156307, "grad_norm": 6.2273712158203125, "learning_rate": 1.999863717579486e-05, "loss": 0.0628, "step": 56590 }, { "epoch": 68.27519613759807, "grad_norm": 5.397634506225586, "learning_rate": 1.9998636934540067e-05, "loss": 0.0632, "step": 56600 }, { "epoch": 68.28726614363308, "grad_norm": 5.700357437133789, "learning_rate": 1.999863669328527e-05, "loss": 0.0624, "step": 56610 }, { "epoch": 68.29933614966808, "grad_norm": 5.08852481842041, "learning_rate": 1.9998636452030476e-05, "loss": 0.0645, "step": 56620 }, { "epoch": 68.31140615570308, "grad_norm": 6.608517169952393, "learning_rate": 1.9998636210775682e-05, "loss": 0.0651, "step": 56630 }, { "epoch": 
68.32347616173809, "grad_norm": 5.544213771820068, "learning_rate": 1.999863596952089e-05, "loss": 0.0651, "step": 56640 }, { "epoch": 68.33554616777309, "grad_norm": 6.183680534362793, "learning_rate": 1.9998635728266095e-05, "loss": 0.0635, "step": 56650 }, { "epoch": 68.34761617380809, "grad_norm": 5.879703044891357, "learning_rate": 1.99986354870113e-05, "loss": 0.0675, "step": 56660 }, { "epoch": 68.3596861798431, "grad_norm": 5.300556659698486, "learning_rate": 1.9998635245756507e-05, "loss": 0.0647, "step": 56670 }, { "epoch": 68.3717561858781, "grad_norm": 5.9653730392456055, "learning_rate": 1.9998635004501713e-05, "loss": 0.0642, "step": 56680 }, { "epoch": 68.3838261919131, "grad_norm": 6.3895039558410645, "learning_rate": 1.999863476324692e-05, "loss": 0.0611, "step": 56690 }, { "epoch": 68.3958961979481, "grad_norm": 5.566645622253418, "learning_rate": 1.9998634521992126e-05, "loss": 0.0629, "step": 56700 }, { "epoch": 68.40796620398311, "grad_norm": 6.056954383850098, "learning_rate": 1.9998634280737332e-05, "loss": 0.0624, "step": 56710 }, { "epoch": 68.42003621001811, "grad_norm": 5.394970417022705, "learning_rate": 1.999863403948254e-05, "loss": 0.0648, "step": 56720 }, { "epoch": 68.43210621605311, "grad_norm": 5.956596851348877, "learning_rate": 1.9998633798227745e-05, "loss": 0.0638, "step": 56730 }, { "epoch": 68.44417622208812, "grad_norm": 6.090643405914307, "learning_rate": 1.999863355697295e-05, "loss": 0.0661, "step": 56740 }, { "epoch": 68.45624622812312, "grad_norm": 5.94242000579834, "learning_rate": 1.9998633315718157e-05, "loss": 0.0663, "step": 56750 }, { "epoch": 68.46831623415812, "grad_norm": 6.177280426025391, "learning_rate": 1.9998633074463363e-05, "loss": 0.0655, "step": 56760 }, { "epoch": 68.48038624019313, "grad_norm": 5.943648338317871, "learning_rate": 1.999863283320857e-05, "loss": 0.0649, "step": 56770 }, { "epoch": 68.49245624622813, "grad_norm": 5.478034973144531, "learning_rate": 1.9998632591953776e-05, "loss": 
0.067, "step": 56780 }, { "epoch": 68.50452625226312, "grad_norm": 5.385597229003906, "learning_rate": 1.9998632350698982e-05, "loss": 0.0695, "step": 56790 }, { "epoch": 68.51659625829812, "grad_norm": 6.4263129234313965, "learning_rate": 1.9998632109444188e-05, "loss": 0.0674, "step": 56800 }, { "epoch": 68.52866626433313, "grad_norm": 6.139267921447754, "learning_rate": 1.9998631868189394e-05, "loss": 0.0693, "step": 56810 }, { "epoch": 68.54073627036813, "grad_norm": 5.830252647399902, "learning_rate": 1.99986316269346e-05, "loss": 0.0666, "step": 56820 }, { "epoch": 68.55280627640313, "grad_norm": 5.9640679359436035, "learning_rate": 1.9998631385679807e-05, "loss": 0.0656, "step": 56830 }, { "epoch": 68.56487628243814, "grad_norm": 6.027914524078369, "learning_rate": 1.9998631144425013e-05, "loss": 0.0707, "step": 56840 }, { "epoch": 68.57694628847314, "grad_norm": 5.654533386230469, "learning_rate": 1.999863090317022e-05, "loss": 0.0651, "step": 56850 }, { "epoch": 68.58901629450814, "grad_norm": 6.307214736938477, "learning_rate": 1.9998630661915422e-05, "loss": 0.0671, "step": 56860 }, { "epoch": 68.60108630054314, "grad_norm": 6.191965579986572, "learning_rate": 1.9998630420660628e-05, "loss": 0.0677, "step": 56870 }, { "epoch": 68.61315630657815, "grad_norm": 6.0200724601745605, "learning_rate": 1.9998630179405834e-05, "loss": 0.0693, "step": 56880 }, { "epoch": 68.62522631261315, "grad_norm": 6.105053901672363, "learning_rate": 1.999862993815104e-05, "loss": 0.0698, "step": 56890 }, { "epoch": 68.63729631864815, "grad_norm": 5.729704856872559, "learning_rate": 1.9998629696896247e-05, "loss": 0.0682, "step": 56900 }, { "epoch": 68.64936632468316, "grad_norm": 5.327478408813477, "learning_rate": 1.9998629455641453e-05, "loss": 0.0684, "step": 56910 }, { "epoch": 68.66143633071816, "grad_norm": 6.233434200286865, "learning_rate": 1.999862921438666e-05, "loss": 0.0691, "step": 56920 }, { "epoch": 68.67350633675316, "grad_norm": 5.815061092376709, 
"learning_rate": 1.9998628973131865e-05, "loss": 0.0675, "step": 56930 }, { "epoch": 68.68557634278817, "grad_norm": 5.965333461761475, "learning_rate": 1.999862873187707e-05, "loss": 0.0689, "step": 56940 }, { "epoch": 68.69764634882317, "grad_norm": 6.122251987457275, "learning_rate": 1.9998628490622278e-05, "loss": 0.067, "step": 56950 }, { "epoch": 68.70971635485817, "grad_norm": 5.614966869354248, "learning_rate": 1.9998628249367484e-05, "loss": 0.0708, "step": 56960 }, { "epoch": 68.72178636089318, "grad_norm": 5.720747470855713, "learning_rate": 1.999862800811269e-05, "loss": 0.0669, "step": 56970 }, { "epoch": 68.73385636692818, "grad_norm": 5.558542728424072, "learning_rate": 1.9998627766857897e-05, "loss": 0.067, "step": 56980 }, { "epoch": 68.74592637296318, "grad_norm": 5.955792427062988, "learning_rate": 1.9998627525603103e-05, "loss": 0.0707, "step": 56990 }, { "epoch": 68.75799637899819, "grad_norm": 5.844067096710205, "learning_rate": 1.999862728434831e-05, "loss": 0.0696, "step": 57000 }, { "epoch": 68.75799637899819, "eval_loss": 12.329307556152344, "eval_runtime": 8.3548, "eval_samples_per_second": 83.425, "eval_steps_per_second": 10.533, "step": 57000 }, { "epoch": 68.77006638503319, "grad_norm": 6.722574710845947, "learning_rate": 1.9998627043093515e-05, "loss": 0.0712, "step": 57010 }, { "epoch": 68.78213639106819, "grad_norm": 5.8015594482421875, "learning_rate": 1.999862680183872e-05, "loss": 0.073, "step": 57020 }, { "epoch": 68.7942063971032, "grad_norm": 6.62223482131958, "learning_rate": 1.9998626560583928e-05, "loss": 0.0701, "step": 57030 }, { "epoch": 68.8062764031382, "grad_norm": 5.944985866546631, "learning_rate": 1.9998626319329134e-05, "loss": 0.0733, "step": 57040 }, { "epoch": 68.8183464091732, "grad_norm": 5.8295698165893555, "learning_rate": 1.999862607807434e-05, "loss": 0.0729, "step": 57050 }, { "epoch": 68.8304164152082, "grad_norm": 6.7737040519714355, "learning_rate": 1.9998625836819546e-05, "loss": 0.0717, "step": 
57060 }, { "epoch": 68.84248642124321, "grad_norm": 6.347014904022217, "learning_rate": 1.9998625595564752e-05, "loss": 0.0723, "step": 57070 }, { "epoch": 68.85455642727821, "grad_norm": 6.072583198547363, "learning_rate": 1.999862535430996e-05, "loss": 0.0745, "step": 57080 }, { "epoch": 68.86662643331321, "grad_norm": 6.324760437011719, "learning_rate": 1.9998625113055165e-05, "loss": 0.0738, "step": 57090 }, { "epoch": 68.87869643934822, "grad_norm": 6.038755416870117, "learning_rate": 1.999862487180037e-05, "loss": 0.0728, "step": 57100 }, { "epoch": 68.89076644538322, "grad_norm": 5.724707126617432, "learning_rate": 1.9998624630545577e-05, "loss": 0.0748, "step": 57110 }, { "epoch": 68.90283645141822, "grad_norm": 5.79099178314209, "learning_rate": 1.9998624389290784e-05, "loss": 0.0676, "step": 57120 }, { "epoch": 68.91490645745323, "grad_norm": 6.085238456726074, "learning_rate": 1.999862414803599e-05, "loss": 0.0723, "step": 57130 }, { "epoch": 68.92697646348823, "grad_norm": 5.664501667022705, "learning_rate": 1.9998623906781196e-05, "loss": 0.074, "step": 57140 }, { "epoch": 68.93904646952323, "grad_norm": 5.907235622406006, "learning_rate": 1.9998623665526402e-05, "loss": 0.0695, "step": 57150 }, { "epoch": 68.95111647555824, "grad_norm": 6.262484073638916, "learning_rate": 1.999862342427161e-05, "loss": 0.0734, "step": 57160 }, { "epoch": 68.96318648159324, "grad_norm": 6.938223838806152, "learning_rate": 1.9998623183016815e-05, "loss": 0.0731, "step": 57170 }, { "epoch": 68.97525648762824, "grad_norm": 6.749214172363281, "learning_rate": 1.999862294176202e-05, "loss": 0.073, "step": 57180 }, { "epoch": 68.98732649366325, "grad_norm": 6.390651702880859, "learning_rate": 1.9998622700507227e-05, "loss": 0.0713, "step": 57190 }, { "epoch": 68.99939649969825, "grad_norm": 6.345817565917969, "learning_rate": 1.9998622459252433e-05, "loss": 0.0775, "step": 57200 }, { "epoch": 69.0108630054315, "grad_norm": 4.6445159912109375, "learning_rate": 
1.999862221799764e-05, "loss": 0.0518, "step": 57210 }, { "epoch": 69.0229330114665, "grad_norm": 5.344893455505371, "learning_rate": 1.9998621976742846e-05, "loss": 0.0524, "step": 57220 }, { "epoch": 69.03500301750151, "grad_norm": 5.306985855102539, "learning_rate": 1.9998621735488052e-05, "loss": 0.0513, "step": 57230 }, { "epoch": 69.04707302353651, "grad_norm": 5.7270612716674805, "learning_rate": 1.9998621494233258e-05, "loss": 0.0553, "step": 57240 }, { "epoch": 69.05914302957152, "grad_norm": 5.0068745613098145, "learning_rate": 1.9998621252978464e-05, "loss": 0.0556, "step": 57250 }, { "epoch": 69.07121303560652, "grad_norm": 5.548910617828369, "learning_rate": 1.999862101172367e-05, "loss": 0.0534, "step": 57260 }, { "epoch": 69.08328304164152, "grad_norm": 5.666830062866211, "learning_rate": 1.9998620770468873e-05, "loss": 0.0573, "step": 57270 }, { "epoch": 69.09535304767653, "grad_norm": 5.7116193771362305, "learning_rate": 1.999862052921408e-05, "loss": 0.0574, "step": 57280 }, { "epoch": 69.10742305371153, "grad_norm": 5.364109992980957, "learning_rate": 1.9998620287959286e-05, "loss": 0.0579, "step": 57290 }, { "epoch": 69.11949305974653, "grad_norm": 5.802278995513916, "learning_rate": 1.9998620046704492e-05, "loss": 0.0587, "step": 57300 }, { "epoch": 69.13156306578153, "grad_norm": 5.303823471069336, "learning_rate": 1.9998619805449698e-05, "loss": 0.0604, "step": 57310 }, { "epoch": 69.14363307181654, "grad_norm": 6.111446380615234, "learning_rate": 1.9998619564194904e-05, "loss": 0.0621, "step": 57320 }, { "epoch": 69.15570307785154, "grad_norm": 5.560831069946289, "learning_rate": 1.999861932294011e-05, "loss": 0.0581, "step": 57330 }, { "epoch": 69.16777308388654, "grad_norm": 5.102551460266113, "learning_rate": 1.9998619081685317e-05, "loss": 0.0578, "step": 57340 }, { "epoch": 69.17984308992155, "grad_norm": 5.419503688812256, "learning_rate": 1.9998618840430523e-05, "loss": 0.0594, "step": 57350 }, { "epoch": 69.19191309595655, 
"grad_norm": 5.503938674926758, "learning_rate": 1.999861859917573e-05, "loss": 0.06, "step": 57360 }, { "epoch": 69.20398310199155, "grad_norm": 5.592369556427002, "learning_rate": 1.9998618357920936e-05, "loss": 0.0596, "step": 57370 }, { "epoch": 69.21605310802656, "grad_norm": 5.78318977355957, "learning_rate": 1.9998618116666142e-05, "loss": 0.0624, "step": 57380 }, { "epoch": 69.22812311406156, "grad_norm": 5.6410746574401855, "learning_rate": 1.9998617875411348e-05, "loss": 0.0581, "step": 57390 }, { "epoch": 69.24019312009656, "grad_norm": 6.171215057373047, "learning_rate": 1.9998617634156554e-05, "loss": 0.0594, "step": 57400 }, { "epoch": 69.25226312613157, "grad_norm": 5.077654838562012, "learning_rate": 1.999861739290176e-05, "loss": 0.0615, "step": 57410 }, { "epoch": 69.26433313216657, "grad_norm": 5.416453838348389, "learning_rate": 1.9998617151646967e-05, "loss": 0.0626, "step": 57420 }, { "epoch": 69.27640313820157, "grad_norm": 5.631503582000732, "learning_rate": 1.9998616910392173e-05, "loss": 0.0617, "step": 57430 }, { "epoch": 69.28847314423658, "grad_norm": 5.879523277282715, "learning_rate": 1.999861666913738e-05, "loss": 0.0634, "step": 57440 }, { "epoch": 69.30054315027158, "grad_norm": 5.315565586090088, "learning_rate": 1.9998616427882585e-05, "loss": 0.0631, "step": 57450 }, { "epoch": 69.31261315630658, "grad_norm": 6.030155658721924, "learning_rate": 1.999861618662779e-05, "loss": 0.0636, "step": 57460 }, { "epoch": 69.32468316234159, "grad_norm": 6.248990535736084, "learning_rate": 1.9998615945372998e-05, "loss": 0.0606, "step": 57470 }, { "epoch": 69.33675316837659, "grad_norm": 5.226402282714844, "learning_rate": 1.9998615704118204e-05, "loss": 0.0641, "step": 57480 }, { "epoch": 69.34882317441159, "grad_norm": 5.824979782104492, "learning_rate": 1.999861546286341e-05, "loss": 0.062, "step": 57490 }, { "epoch": 69.3608931804466, "grad_norm": 6.144801139831543, "learning_rate": 1.9998615221608616e-05, "loss": 0.0632, "step": 57500 
}, { "epoch": 69.3608931804466, "eval_loss": 12.314597129821777, "eval_runtime": 8.1281, "eval_samples_per_second": 85.752, "eval_steps_per_second": 10.827, "step": 57500 }, { "epoch": 69.3729631864816, "grad_norm": 5.800685405731201, "learning_rate": 1.9998614980353823e-05, "loss": 0.0634, "step": 57510 }, { "epoch": 69.3850331925166, "grad_norm": 6.137479782104492, "learning_rate": 1.9998614739099025e-05, "loss": 0.0645, "step": 57520 }, { "epoch": 69.3971031985516, "grad_norm": 6.0523881912231445, "learning_rate": 1.999861449784423e-05, "loss": 0.0632, "step": 57530 }, { "epoch": 69.40917320458661, "grad_norm": 5.754865646362305, "learning_rate": 1.9998614256589438e-05, "loss": 0.0623, "step": 57540 }, { "epoch": 69.42124321062161, "grad_norm": 5.672056198120117, "learning_rate": 1.9998614015334644e-05, "loss": 0.0639, "step": 57550 }, { "epoch": 69.43331321665661, "grad_norm": 5.734525203704834, "learning_rate": 1.999861377407985e-05, "loss": 0.0636, "step": 57560 }, { "epoch": 69.44538322269162, "grad_norm": 5.587035179138184, "learning_rate": 1.9998613532825056e-05, "loss": 0.0667, "step": 57570 }, { "epoch": 69.45745322872662, "grad_norm": 5.916792869567871, "learning_rate": 1.9998613291570263e-05, "loss": 0.0662, "step": 57580 }, { "epoch": 69.46952323476162, "grad_norm": 5.54447603225708, "learning_rate": 1.999861305031547e-05, "loss": 0.0651, "step": 57590 }, { "epoch": 69.48159324079663, "grad_norm": 5.818124771118164, "learning_rate": 1.9998612809060675e-05, "loss": 0.0631, "step": 57600 }, { "epoch": 69.49366324683163, "grad_norm": 5.520540237426758, "learning_rate": 1.999861256780588e-05, "loss": 0.0641, "step": 57610 }, { "epoch": 69.50573325286662, "grad_norm": 6.006669521331787, "learning_rate": 1.9998612326551088e-05, "loss": 0.0658, "step": 57620 }, { "epoch": 69.51780325890162, "grad_norm": 6.022196292877197, "learning_rate": 1.9998612085296294e-05, "loss": 0.0676, "step": 57630 }, { "epoch": 69.52987326493663, "grad_norm": 6.231136322021484, 
"learning_rate": 1.9998611844041503e-05, "loss": 0.0632, "step": 57640 }, { "epoch": 69.54194327097163, "grad_norm": 5.666462421417236, "learning_rate": 1.999861160278671e-05, "loss": 0.0637, "step": 57650 }, { "epoch": 69.55401327700663, "grad_norm": 5.790175914764404, "learning_rate": 1.9998611361531916e-05, "loss": 0.0637, "step": 57660 }, { "epoch": 69.56608328304164, "grad_norm": 5.404178142547607, "learning_rate": 1.9998611120277122e-05, "loss": 0.063, "step": 57670 }, { "epoch": 69.57815328907664, "grad_norm": 6.633474826812744, "learning_rate": 1.9998610879022328e-05, "loss": 0.0694, "step": 57680 }, { "epoch": 69.59022329511164, "grad_norm": 5.652596473693848, "learning_rate": 1.999861063776753e-05, "loss": 0.07, "step": 57690 }, { "epoch": 69.60229330114664, "grad_norm": 5.6806416511535645, "learning_rate": 1.9998610396512737e-05, "loss": 0.0668, "step": 57700 }, { "epoch": 69.61436330718165, "grad_norm": 5.426058769226074, "learning_rate": 1.9998610155257943e-05, "loss": 0.0686, "step": 57710 }, { "epoch": 69.62643331321665, "grad_norm": 6.256883144378662, "learning_rate": 1.999860991400315e-05, "loss": 0.0665, "step": 57720 }, { "epoch": 69.63850331925165, "grad_norm": 5.818232536315918, "learning_rate": 1.9998609672748356e-05, "loss": 0.0685, "step": 57730 }, { "epoch": 69.65057332528666, "grad_norm": 5.888890266418457, "learning_rate": 1.9998609431493562e-05, "loss": 0.0671, "step": 57740 }, { "epoch": 69.66264333132166, "grad_norm": 6.069561004638672, "learning_rate": 1.999860919023877e-05, "loss": 0.0677, "step": 57750 }, { "epoch": 69.67471333735666, "grad_norm": 6.422438621520996, "learning_rate": 1.9998608948983975e-05, "loss": 0.0679, "step": 57760 }, { "epoch": 69.68678334339167, "grad_norm": 5.596698760986328, "learning_rate": 1.999860870772918e-05, "loss": 0.0682, "step": 57770 }, { "epoch": 69.69885334942667, "grad_norm": 5.959321022033691, "learning_rate": 1.9998608466474387e-05, "loss": 0.0693, "step": 57780 }, { "epoch": 
69.71092335546167, "grad_norm": 5.697901248931885, "learning_rate": 1.9998608225219593e-05, "loss": 0.069, "step": 57790 }, { "epoch": 69.72299336149668, "grad_norm": 6.542925834655762, "learning_rate": 1.99986079839648e-05, "loss": 0.0683, "step": 57800 }, { "epoch": 69.73506336753168, "grad_norm": 6.203988075256348, "learning_rate": 1.9998607742710006e-05, "loss": 0.0704, "step": 57810 }, { "epoch": 69.74713337356668, "grad_norm": 5.530033111572266, "learning_rate": 1.9998607501455212e-05, "loss": 0.0691, "step": 57820 }, { "epoch": 69.75920337960169, "grad_norm": 6.217360973358154, "learning_rate": 1.9998607260200418e-05, "loss": 0.067, "step": 57830 }, { "epoch": 69.77127338563669, "grad_norm": 6.695731163024902, "learning_rate": 1.9998607018945624e-05, "loss": 0.0714, "step": 57840 }, { "epoch": 69.78334339167169, "grad_norm": 6.03998327255249, "learning_rate": 1.999860677769083e-05, "loss": 0.0679, "step": 57850 }, { "epoch": 69.7954133977067, "grad_norm": 5.934584140777588, "learning_rate": 1.9998606536436037e-05, "loss": 0.0698, "step": 57860 }, { "epoch": 69.8074834037417, "grad_norm": 6.396552562713623, "learning_rate": 1.9998606295181243e-05, "loss": 0.0722, "step": 57870 }, { "epoch": 69.8195534097767, "grad_norm": 5.677385330200195, "learning_rate": 1.999860605392645e-05, "loss": 0.0695, "step": 57880 }, { "epoch": 69.8316234158117, "grad_norm": 6.366878509521484, "learning_rate": 1.9998605812671655e-05, "loss": 0.0702, "step": 57890 }, { "epoch": 69.84369342184671, "grad_norm": 6.2183308601379395, "learning_rate": 1.999860557141686e-05, "loss": 0.0679, "step": 57900 }, { "epoch": 69.85576342788171, "grad_norm": 6.234658718109131, "learning_rate": 1.9998605330162068e-05, "loss": 0.0686, "step": 57910 }, { "epoch": 69.86783343391672, "grad_norm": 5.590453147888184, "learning_rate": 1.9998605088907274e-05, "loss": 0.0666, "step": 57920 }, { "epoch": 69.87990343995172, "grad_norm": 5.721842288970947, "learning_rate": 1.999860484765248e-05, "loss": 0.0673, 
"step": 57930 }, { "epoch": 69.89197344598672, "grad_norm": 6.4774322509765625, "learning_rate": 1.9998604606397683e-05, "loss": 0.0687, "step": 57940 }, { "epoch": 69.90404345202172, "grad_norm": 6.323344707489014, "learning_rate": 1.999860436514289e-05, "loss": 0.072, "step": 57950 }, { "epoch": 69.91611345805673, "grad_norm": 7.977266788482666, "learning_rate": 1.9998604123888095e-05, "loss": 0.07, "step": 57960 }, { "epoch": 69.92818346409173, "grad_norm": 6.062766075134277, "learning_rate": 1.9998603882633302e-05, "loss": 0.071, "step": 57970 }, { "epoch": 69.94025347012673, "grad_norm": 6.011053085327148, "learning_rate": 1.9998603641378508e-05, "loss": 0.07, "step": 57980 }, { "epoch": 69.95232347616174, "grad_norm": 6.289527416229248, "learning_rate": 1.9998603400123714e-05, "loss": 0.0701, "step": 57990 }, { "epoch": 69.96439348219674, "grad_norm": 6.038223743438721, "learning_rate": 1.999860315886892e-05, "loss": 0.0691, "step": 58000 }, { "epoch": 69.96439348219674, "eval_loss": 12.345186233520508, "eval_runtime": 8.131, "eval_samples_per_second": 85.721, "eval_steps_per_second": 10.823, "step": 58000 }, { "epoch": 69.97646348823174, "grad_norm": 6.412364959716797, "learning_rate": 1.9998602917614127e-05, "loss": 0.0735, "step": 58010 }, { "epoch": 69.98853349426675, "grad_norm": 6.676702499389648, "learning_rate": 1.9998602676359333e-05, "loss": 0.0693, "step": 58020 }, { "epoch": 70.0, "grad_norm": 10.541123390197754, "learning_rate": 1.999860243510454e-05, "loss": 0.0724, "step": 58030 }, { "epoch": 70.012070006035, "grad_norm": 4.9569573402404785, "learning_rate": 1.9998602193849745e-05, "loss": 0.0479, "step": 58040 }, { "epoch": 70.02414001207, "grad_norm": 4.553520202636719, "learning_rate": 1.999860195259495e-05, "loss": 0.0553, "step": 58050 }, { "epoch": 70.03621001810501, "grad_norm": 4.980017185211182, "learning_rate": 1.9998601711340158e-05, "loss": 0.0502, "step": 58060 }, { "epoch": 70.04828002414001, "grad_norm": 4.862907409667969, 
"learning_rate": 1.9998601470085364e-05, "loss": 0.0538, "step": 58070 }, { "epoch": 70.06035003017502, "grad_norm": 4.847743988037109, "learning_rate": 1.999860122883057e-05, "loss": 0.0547, "step": 58080 }, { "epoch": 70.07242003621002, "grad_norm": 5.625429153442383, "learning_rate": 1.9998600987575776e-05, "loss": 0.0561, "step": 58090 }, { "epoch": 70.08449004224502, "grad_norm": 5.111926078796387, "learning_rate": 1.9998600746320983e-05, "loss": 0.0556, "step": 58100 }, { "epoch": 70.09656004828003, "grad_norm": 5.675597190856934, "learning_rate": 1.999860050506619e-05, "loss": 0.057, "step": 58110 }, { "epoch": 70.10863005431503, "grad_norm": 5.312305927276611, "learning_rate": 1.9998600263811395e-05, "loss": 0.0529, "step": 58120 }, { "epoch": 70.12070006035003, "grad_norm": 5.626957416534424, "learning_rate": 1.99986000225566e-05, "loss": 0.0577, "step": 58130 }, { "epoch": 70.13277006638504, "grad_norm": 5.576411724090576, "learning_rate": 1.9998599781301807e-05, "loss": 0.0535, "step": 58140 }, { "epoch": 70.14484007242004, "grad_norm": 5.860774517059326, "learning_rate": 1.9998599540047014e-05, "loss": 0.0593, "step": 58150 }, { "epoch": 70.15691007845504, "grad_norm": 5.9707441329956055, "learning_rate": 1.999859929879222e-05, "loss": 0.0565, "step": 58160 }, { "epoch": 70.16898008449004, "grad_norm": 5.64743185043335, "learning_rate": 1.9998599057537426e-05, "loss": 0.0572, "step": 58170 }, { "epoch": 70.18105009052505, "grad_norm": 5.233848571777344, "learning_rate": 1.9998598816282632e-05, "loss": 0.0582, "step": 58180 }, { "epoch": 70.19312009656005, "grad_norm": 5.4905805587768555, "learning_rate": 1.999859857502784e-05, "loss": 0.0583, "step": 58190 }, { "epoch": 70.20519010259505, "grad_norm": 5.31244421005249, "learning_rate": 1.9998598333773045e-05, "loss": 0.0589, "step": 58200 }, { "epoch": 70.21726010863006, "grad_norm": 5.210171222686768, "learning_rate": 1.999859809251825e-05, "loss": 0.058, "step": 58210 }, { "epoch": 70.22933011466506, 
"grad_norm": 4.745039939880371, "learning_rate": 1.9998597851263457e-05, "loss": 0.0601, "step": 58220 }, { "epoch": 70.24140012070006, "grad_norm": 5.4510884284973145, "learning_rate": 1.9998597610008663e-05, "loss": 0.06, "step": 58230 }, { "epoch": 70.25347012673507, "grad_norm": 5.2952046394348145, "learning_rate": 1.999859736875387e-05, "loss": 0.0608, "step": 58240 }, { "epoch": 70.26554013277007, "grad_norm": 5.311239719390869, "learning_rate": 1.9998597127499076e-05, "loss": 0.0598, "step": 58250 }, { "epoch": 70.27761013880507, "grad_norm": 5.8282670974731445, "learning_rate": 1.9998596886244282e-05, "loss": 0.0623, "step": 58260 }, { "epoch": 70.28968014484008, "grad_norm": 5.7520222663879395, "learning_rate": 1.9998596644989488e-05, "loss": 0.0598, "step": 58270 }, { "epoch": 70.30175015087508, "grad_norm": 6.001716136932373, "learning_rate": 1.9998596403734694e-05, "loss": 0.0617, "step": 58280 }, { "epoch": 70.31382015691008, "grad_norm": 5.835625648498535, "learning_rate": 1.99985961624799e-05, "loss": 0.0633, "step": 58290 }, { "epoch": 70.32589016294509, "grad_norm": 5.898458957672119, "learning_rate": 1.9998595921225107e-05, "loss": 0.0644, "step": 58300 }, { "epoch": 70.33796016898009, "grad_norm": 6.293110370635986, "learning_rate": 1.9998595679970313e-05, "loss": 0.0619, "step": 58310 }, { "epoch": 70.35003017501509, "grad_norm": 5.414556503295898, "learning_rate": 1.999859543871552e-05, "loss": 0.0613, "step": 58320 }, { "epoch": 70.3621001810501, "grad_norm": 5.43922233581543, "learning_rate": 1.9998595197460725e-05, "loss": 0.0633, "step": 58330 }, { "epoch": 70.3741701870851, "grad_norm": 6.078142166137695, "learning_rate": 1.999859495620593e-05, "loss": 0.0617, "step": 58340 }, { "epoch": 70.3862401931201, "grad_norm": 5.549102306365967, "learning_rate": 1.9998594714951134e-05, "loss": 0.0627, "step": 58350 }, { "epoch": 70.3983101991551, "grad_norm": 5.1342058181762695, "learning_rate": 1.999859447369634e-05, "loss": 0.0646, "step": 58360 
}, { "epoch": 70.41038020519011, "grad_norm": 5.579059600830078, "learning_rate": 1.9998594232441547e-05, "loss": 0.0666, "step": 58370 }, { "epoch": 70.42245021122511, "grad_norm": 6.02203369140625, "learning_rate": 1.9998593991186753e-05, "loss": 0.0625, "step": 58380 }, { "epoch": 70.43452021726011, "grad_norm": 5.907992839813232, "learning_rate": 1.999859374993196e-05, "loss": 0.0603, "step": 58390 }, { "epoch": 70.44659022329512, "grad_norm": 5.491796493530273, "learning_rate": 1.9998593508677166e-05, "loss": 0.0624, "step": 58400 }, { "epoch": 70.45866022933012, "grad_norm": 5.646358966827393, "learning_rate": 1.9998593267422372e-05, "loss": 0.0679, "step": 58410 }, { "epoch": 70.47073023536512, "grad_norm": 5.9585981369018555, "learning_rate": 1.9998593026167578e-05, "loss": 0.0636, "step": 58420 }, { "epoch": 70.48280024140013, "grad_norm": 5.591762065887451, "learning_rate": 1.9998592784912784e-05, "loss": 0.0632, "step": 58430 }, { "epoch": 70.49487024743513, "grad_norm": 5.18078088760376, "learning_rate": 1.999859254365799e-05, "loss": 0.0643, "step": 58440 }, { "epoch": 70.50694025347012, "grad_norm": 6.302553653717041, "learning_rate": 1.9998592302403197e-05, "loss": 0.0665, "step": 58450 }, { "epoch": 70.51901025950512, "grad_norm": 6.2101898193359375, "learning_rate": 1.9998592061148403e-05, "loss": 0.0642, "step": 58460 }, { "epoch": 70.53108026554013, "grad_norm": 5.8372321128845215, "learning_rate": 1.999859181989361e-05, "loss": 0.0663, "step": 58470 }, { "epoch": 70.54315027157513, "grad_norm": 5.769875526428223, "learning_rate": 1.9998591578638815e-05, "loss": 0.0649, "step": 58480 }, { "epoch": 70.55522027761013, "grad_norm": 6.1804890632629395, "learning_rate": 1.999859133738402e-05, "loss": 0.0656, "step": 58490 }, { "epoch": 70.56729028364514, "grad_norm": 5.34534215927124, "learning_rate": 1.9998591096129228e-05, "loss": 0.0659, "step": 58500 }, { "epoch": 70.56729028364514, "eval_loss": 12.348387718200684, "eval_runtime": 8.1327, 
"eval_samples_per_second": 85.703, "eval_steps_per_second": 10.821, "step": 58500 }, { "epoch": 70.57936028968014, "grad_norm": 5.953372478485107, "learning_rate": 1.9998590854874434e-05, "loss": 0.0649, "step": 58510 }, { "epoch": 70.59143029571514, "grad_norm": 6.025021076202393, "learning_rate": 1.999859061361964e-05, "loss": 0.0683, "step": 58520 }, { "epoch": 70.60350030175015, "grad_norm": 5.7935566902160645, "learning_rate": 1.9998590372364846e-05, "loss": 0.0658, "step": 58530 }, { "epoch": 70.61557030778515, "grad_norm": 5.496426105499268, "learning_rate": 1.9998590131110053e-05, "loss": 0.0653, "step": 58540 }, { "epoch": 70.62764031382015, "grad_norm": 6.363903045654297, "learning_rate": 1.999858988985526e-05, "loss": 0.0667, "step": 58550 }, { "epoch": 70.63971031985515, "grad_norm": 5.800562381744385, "learning_rate": 1.9998589648600465e-05, "loss": 0.0646, "step": 58560 }, { "epoch": 70.65178032589016, "grad_norm": 6.4005937576293945, "learning_rate": 1.999858940734567e-05, "loss": 0.0671, "step": 58570 }, { "epoch": 70.66385033192516, "grad_norm": 6.278224468231201, "learning_rate": 1.9998589166090877e-05, "loss": 0.0692, "step": 58580 }, { "epoch": 70.67592033796016, "grad_norm": 6.108465194702148, "learning_rate": 1.9998588924836084e-05, "loss": 0.0684, "step": 58590 }, { "epoch": 70.68799034399517, "grad_norm": 5.941433429718018, "learning_rate": 1.9998588683581286e-05, "loss": 0.069, "step": 58600 }, { "epoch": 70.70006035003017, "grad_norm": 5.442427635192871, "learning_rate": 1.9998588442326493e-05, "loss": 0.0688, "step": 58610 }, { "epoch": 70.71213035606517, "grad_norm": 6.088051795959473, "learning_rate": 1.99985882010717e-05, "loss": 0.0668, "step": 58620 }, { "epoch": 70.72420036210018, "grad_norm": 5.521499156951904, "learning_rate": 1.9998587959816905e-05, "loss": 0.0683, "step": 58630 }, { "epoch": 70.73627036813518, "grad_norm": 5.876876354217529, "learning_rate": 1.999858771856211e-05, "loss": 0.0671, "step": 58640 }, { "epoch": 
70.74834037417018, "grad_norm": 5.7945332527160645, "learning_rate": 1.9998587477307318e-05, "loss": 0.0648, "step": 58650 }, { "epoch": 70.76041038020519, "grad_norm": 5.852102279663086, "learning_rate": 1.9998587236052524e-05, "loss": 0.0694, "step": 58660 }, { "epoch": 70.77248038624019, "grad_norm": 6.099263668060303, "learning_rate": 1.999858699479773e-05, "loss": 0.0707, "step": 58670 }, { "epoch": 70.7845503922752, "grad_norm": 5.641778469085693, "learning_rate": 1.9998586753542936e-05, "loss": 0.0668, "step": 58680 }, { "epoch": 70.7966203983102, "grad_norm": 6.2507781982421875, "learning_rate": 1.9998586512288142e-05, "loss": 0.0695, "step": 58690 }, { "epoch": 70.8086904043452, "grad_norm": 6.351855754852295, "learning_rate": 1.999858627103335e-05, "loss": 0.0706, "step": 58700 }, { "epoch": 70.8207604103802, "grad_norm": 6.335616588592529, "learning_rate": 1.9998586029778555e-05, "loss": 0.0708, "step": 58710 }, { "epoch": 70.8328304164152, "grad_norm": 6.14625883102417, "learning_rate": 1.9998585788523764e-05, "loss": 0.0673, "step": 58720 }, { "epoch": 70.84490042245021, "grad_norm": 5.6405463218688965, "learning_rate": 1.999858554726897e-05, "loss": 0.0719, "step": 58730 }, { "epoch": 70.85697042848521, "grad_norm": 6.468915939331055, "learning_rate": 1.9998585306014177e-05, "loss": 0.0725, "step": 58740 }, { "epoch": 70.86904043452022, "grad_norm": 6.580385684967041, "learning_rate": 1.9998585064759383e-05, "loss": 0.0705, "step": 58750 }, { "epoch": 70.88111044055522, "grad_norm": 5.752800941467285, "learning_rate": 1.9998584823504586e-05, "loss": 0.0682, "step": 58760 }, { "epoch": 70.89318044659022, "grad_norm": 6.895336151123047, "learning_rate": 1.9998584582249792e-05, "loss": 0.069, "step": 58770 }, { "epoch": 70.90525045262522, "grad_norm": 6.189088344573975, "learning_rate": 1.9998584340995e-05, "loss": 0.0693, "step": 58780 }, { "epoch": 70.91732045866023, "grad_norm": 5.972328186035156, "learning_rate": 1.9998584099740205e-05, "loss": 
0.0707, "step": 58790 }, { "epoch": 70.92939046469523, "grad_norm": 6.072625160217285, "learning_rate": 1.999858385848541e-05, "loss": 0.0701, "step": 58800 }, { "epoch": 70.94146047073023, "grad_norm": 5.929001808166504, "learning_rate": 1.9998583617230617e-05, "loss": 0.0677, "step": 58810 }, { "epoch": 70.95353047676524, "grad_norm": 5.977123737335205, "learning_rate": 1.9998583375975823e-05, "loss": 0.0686, "step": 58820 }, { "epoch": 70.96560048280024, "grad_norm": 6.340959072113037, "learning_rate": 1.999858313472103e-05, "loss": 0.0691, "step": 58830 }, { "epoch": 70.97767048883524, "grad_norm": 6.070498466491699, "learning_rate": 1.9998582893466236e-05, "loss": 0.0702, "step": 58840 }, { "epoch": 70.98974049487025, "grad_norm": 6.123202323913574, "learning_rate": 1.9998582652211442e-05, "loss": 0.0692, "step": 58850 }, { "epoch": 71.0012070006035, "grad_norm": 5.350730895996094, "learning_rate": 1.9998582410956648e-05, "loss": 0.0688, "step": 58860 }, { "epoch": 71.0132770066385, "grad_norm": 5.0868072509765625, "learning_rate": 1.9998582169701854e-05, "loss": 0.0458, "step": 58870 }, { "epoch": 71.0253470126735, "grad_norm": 5.112293243408203, "learning_rate": 1.999858192844706e-05, "loss": 0.0499, "step": 58880 }, { "epoch": 71.03741701870851, "grad_norm": 4.85072660446167, "learning_rate": 1.9998581687192267e-05, "loss": 0.0505, "step": 58890 }, { "epoch": 71.04948702474351, "grad_norm": 4.610794544219971, "learning_rate": 1.9998581445937473e-05, "loss": 0.0498, "step": 58900 }, { "epoch": 71.06155703077852, "grad_norm": 5.210287570953369, "learning_rate": 1.999858120468268e-05, "loss": 0.0522, "step": 58910 }, { "epoch": 71.07362703681352, "grad_norm": 5.079074382781982, "learning_rate": 1.9998580963427885e-05, "loss": 0.054, "step": 58920 }, { "epoch": 71.08569704284852, "grad_norm": 4.9053778648376465, "learning_rate": 1.999858072217309e-05, "loss": 0.0571, "step": 58930 }, { "epoch": 71.09776704888353, "grad_norm": 5.524157524108887, "learning_rate": 
1.9998580480918298e-05, "loss": 0.0546, "step": 58940 }, { "epoch": 71.10983705491853, "grad_norm": 5.197609901428223, "learning_rate": 1.9998580239663504e-05, "loss": 0.0571, "step": 58950 }, { "epoch": 71.12190706095353, "grad_norm": 5.869367599487305, "learning_rate": 1.999857999840871e-05, "loss": 0.0579, "step": 58960 }, { "epoch": 71.13397706698854, "grad_norm": 6.23269510269165, "learning_rate": 1.9998579757153916e-05, "loss": 0.058, "step": 58970 }, { "epoch": 71.14604707302354, "grad_norm": 6.032592296600342, "learning_rate": 1.9998579515899123e-05, "loss": 0.0574, "step": 58980 }, { "epoch": 71.15811707905854, "grad_norm": 5.595477104187012, "learning_rate": 1.999857927464433e-05, "loss": 0.0597, "step": 58990 }, { "epoch": 71.17018708509354, "grad_norm": 4.922096252441406, "learning_rate": 1.9998579033389535e-05, "loss": 0.0585, "step": 59000 }, { "epoch": 71.17018708509354, "eval_loss": 12.334692001342773, "eval_runtime": 8.1326, "eval_samples_per_second": 85.705, "eval_steps_per_second": 10.821, "step": 59000 }, { "epoch": 71.18225709112855, "grad_norm": 5.867879867553711, "learning_rate": 1.9998578792134738e-05, "loss": 0.0575, "step": 59010 }, { "epoch": 71.19432709716355, "grad_norm": 5.697842597961426, "learning_rate": 1.9998578550879944e-05, "loss": 0.0582, "step": 59020 }, { "epoch": 71.20639710319855, "grad_norm": 5.95745849609375, "learning_rate": 1.999857830962515e-05, "loss": 0.0626, "step": 59030 }, { "epoch": 71.21846710923356, "grad_norm": 5.941538333892822, "learning_rate": 1.9998578068370357e-05, "loss": 0.0599, "step": 59040 }, { "epoch": 71.23053711526856, "grad_norm": 5.884032726287842, "learning_rate": 1.9998577827115563e-05, "loss": 0.0587, "step": 59050 }, { "epoch": 71.24260712130356, "grad_norm": 5.310934543609619, "learning_rate": 1.999857758586077e-05, "loss": 0.0612, "step": 59060 }, { "epoch": 71.25467712733857, "grad_norm": 5.442577362060547, "learning_rate": 1.9998577344605975e-05, "loss": 0.0597, "step": 59070 }, { 
"epoch": 71.26674713337357, "grad_norm": 6.174004554748535, "learning_rate": 1.999857710335118e-05, "loss": 0.0598, "step": 59080 }, { "epoch": 71.27881713940857, "grad_norm": 5.350177764892578, "learning_rate": 1.9998576862096388e-05, "loss": 0.0597, "step": 59090 }, { "epoch": 71.29088714544358, "grad_norm": 5.273113250732422, "learning_rate": 1.9998576620841594e-05, "loss": 0.0596, "step": 59100 }, { "epoch": 71.30295715147858, "grad_norm": 5.109650135040283, "learning_rate": 1.99985763795868e-05, "loss": 0.0596, "step": 59110 }, { "epoch": 71.31502715751358, "grad_norm": 4.921658992767334, "learning_rate": 1.9998576138332006e-05, "loss": 0.0609, "step": 59120 }, { "epoch": 71.32709716354859, "grad_norm": 5.641357421875, "learning_rate": 1.9998575897077213e-05, "loss": 0.0628, "step": 59130 }, { "epoch": 71.33916716958359, "grad_norm": 6.145009994506836, "learning_rate": 1.999857565582242e-05, "loss": 0.061, "step": 59140 }, { "epoch": 71.35123717561859, "grad_norm": 5.557809829711914, "learning_rate": 1.9998575414567625e-05, "loss": 0.0585, "step": 59150 }, { "epoch": 71.3633071816536, "grad_norm": 5.9493937492370605, "learning_rate": 1.999857517331283e-05, "loss": 0.0625, "step": 59160 }, { "epoch": 71.3753771876886, "grad_norm": 5.731709003448486, "learning_rate": 1.9998574932058037e-05, "loss": 0.0594, "step": 59170 }, { "epoch": 71.3874471937236, "grad_norm": 5.3913187980651855, "learning_rate": 1.9998574690803244e-05, "loss": 0.0623, "step": 59180 }, { "epoch": 71.3995171997586, "grad_norm": 5.866750717163086, "learning_rate": 1.999857444954845e-05, "loss": 0.0624, "step": 59190 }, { "epoch": 71.41158720579361, "grad_norm": 5.604518413543701, "learning_rate": 1.9998574208293656e-05, "loss": 0.0616, "step": 59200 }, { "epoch": 71.42365721182861, "grad_norm": 5.6189045906066895, "learning_rate": 1.9998573967038862e-05, "loss": 0.0634, "step": 59210 }, { "epoch": 71.43572721786362, "grad_norm": 5.609368324279785, "learning_rate": 1.999857372578407e-05, 
"loss": 0.0644, "step": 59220 }, { "epoch": 71.44779722389862, "grad_norm": 5.333143711090088, "learning_rate": 1.9998573484529275e-05, "loss": 0.0644, "step": 59230 }, { "epoch": 71.45986722993362, "grad_norm": 6.40625524520874, "learning_rate": 1.999857324327448e-05, "loss": 0.0648, "step": 59240 }, { "epoch": 71.47193723596862, "grad_norm": 6.2714691162109375, "learning_rate": 1.9998573002019687e-05, "loss": 0.0636, "step": 59250 }, { "epoch": 71.48400724200363, "grad_norm": 5.325184345245361, "learning_rate": 1.9998572760764893e-05, "loss": 0.0639, "step": 59260 }, { "epoch": 71.49607724803863, "grad_norm": 5.855652332305908, "learning_rate": 1.99985725195101e-05, "loss": 0.0674, "step": 59270 }, { "epoch": 71.50814725407362, "grad_norm": 5.975926399230957, "learning_rate": 1.9998572278255306e-05, "loss": 0.066, "step": 59280 }, { "epoch": 71.52021726010862, "grad_norm": 5.222170352935791, "learning_rate": 1.9998572037000512e-05, "loss": 0.0644, "step": 59290 }, { "epoch": 71.53228726614363, "grad_norm": 6.29029655456543, "learning_rate": 1.9998571795745718e-05, "loss": 0.0609, "step": 59300 }, { "epoch": 71.54435727217863, "grad_norm": 5.91177225112915, "learning_rate": 1.9998571554490924e-05, "loss": 0.0644, "step": 59310 }, { "epoch": 71.55642727821363, "grad_norm": 5.704710006713867, "learning_rate": 1.999857131323613e-05, "loss": 0.0669, "step": 59320 }, { "epoch": 71.56849728424864, "grad_norm": 5.73807430267334, "learning_rate": 1.9998571071981337e-05, "loss": 0.067, "step": 59330 }, { "epoch": 71.58056729028364, "grad_norm": 6.723142147064209, "learning_rate": 1.9998570830726543e-05, "loss": 0.0631, "step": 59340 }, { "epoch": 71.59263729631864, "grad_norm": 5.584421634674072, "learning_rate": 1.999857058947175e-05, "loss": 0.0649, "step": 59350 }, { "epoch": 71.60470730235365, "grad_norm": 5.945818901062012, "learning_rate": 1.9998570348216955e-05, "loss": 0.066, "step": 59360 }, { "epoch": 71.61677730838865, "grad_norm": 5.837306976318359, 
"learning_rate": 1.999857010696216e-05, "loss": 0.0669, "step": 59370 }, { "epoch": 71.62884731442365, "grad_norm": 5.688288688659668, "learning_rate": 1.9998569865707368e-05, "loss": 0.0634, "step": 59380 }, { "epoch": 71.64091732045866, "grad_norm": 6.53924036026001, "learning_rate": 1.9998569624452574e-05, "loss": 0.0656, "step": 59390 }, { "epoch": 71.65298732649366, "grad_norm": 5.993324279785156, "learning_rate": 1.999856938319778e-05, "loss": 0.0661, "step": 59400 }, { "epoch": 71.66505733252866, "grad_norm": 5.62163782119751, "learning_rate": 1.9998569141942987e-05, "loss": 0.0672, "step": 59410 }, { "epoch": 71.67712733856366, "grad_norm": 5.910152912139893, "learning_rate": 1.9998568900688193e-05, "loss": 0.067, "step": 59420 }, { "epoch": 71.68919734459867, "grad_norm": 6.088947296142578, "learning_rate": 1.9998568659433396e-05, "loss": 0.0671, "step": 59430 }, { "epoch": 71.70126735063367, "grad_norm": 5.833085060119629, "learning_rate": 1.9998568418178602e-05, "loss": 0.0661, "step": 59440 }, { "epoch": 71.71333735666867, "grad_norm": 5.466915607452393, "learning_rate": 1.9998568176923808e-05, "loss": 0.0664, "step": 59450 }, { "epoch": 71.72540736270368, "grad_norm": 5.94281530380249, "learning_rate": 1.9998567935669014e-05, "loss": 0.071, "step": 59460 }, { "epoch": 71.73747736873868, "grad_norm": 5.317188739776611, "learning_rate": 1.999856769441422e-05, "loss": 0.0669, "step": 59470 }, { "epoch": 71.74954737477368, "grad_norm": 5.962674140930176, "learning_rate": 1.9998567453159427e-05, "loss": 0.0649, "step": 59480 }, { "epoch": 71.76161738080869, "grad_norm": 5.949699401855469, "learning_rate": 1.9998567211904633e-05, "loss": 0.0651, "step": 59490 }, { "epoch": 71.77368738684369, "grad_norm": 6.0998640060424805, "learning_rate": 1.999856697064984e-05, "loss": 0.0663, "step": 59500 }, { "epoch": 71.77368738684369, "eval_loss": 12.370756149291992, "eval_runtime": 8.1411, "eval_samples_per_second": 85.615, "eval_steps_per_second": 10.809, "step": 
59500 }, { "epoch": 71.7857573928787, "grad_norm": 6.100261211395264, "learning_rate": 1.9998566729395045e-05, "loss": 0.0712, "step": 59510 }, { "epoch": 71.7978273989137, "grad_norm": 5.929318904876709, "learning_rate": 1.999856648814025e-05, "loss": 0.0667, "step": 59520 }, { "epoch": 71.8098974049487, "grad_norm": 6.0509562492370605, "learning_rate": 1.9998566246885458e-05, "loss": 0.0689, "step": 59530 }, { "epoch": 71.8219674109837, "grad_norm": 6.428370952606201, "learning_rate": 1.9998566005630664e-05, "loss": 0.0701, "step": 59540 }, { "epoch": 71.8340374170187, "grad_norm": 5.641948699951172, "learning_rate": 1.999856576437587e-05, "loss": 0.0706, "step": 59550 }, { "epoch": 71.84610742305371, "grad_norm": 6.564952373504639, "learning_rate": 1.9998565523121076e-05, "loss": 0.069, "step": 59560 }, { "epoch": 71.85817742908871, "grad_norm": 7.040923595428467, "learning_rate": 1.9998565281866283e-05, "loss": 0.0701, "step": 59570 }, { "epoch": 71.87024743512372, "grad_norm": 6.437949180603027, "learning_rate": 1.999856504061149e-05, "loss": 0.067, "step": 59580 }, { "epoch": 71.88231744115872, "grad_norm": 6.077201843261719, "learning_rate": 1.9998564799356695e-05, "loss": 0.0673, "step": 59590 }, { "epoch": 71.89438744719372, "grad_norm": 5.669251918792725, "learning_rate": 1.99985645581019e-05, "loss": 0.0704, "step": 59600 }, { "epoch": 71.90645745322873, "grad_norm": 6.116119384765625, "learning_rate": 1.9998564316847107e-05, "loss": 0.0691, "step": 59610 }, { "epoch": 71.91852745926373, "grad_norm": 6.108559608459473, "learning_rate": 1.9998564075592314e-05, "loss": 0.0702, "step": 59620 }, { "epoch": 71.93059746529873, "grad_norm": 5.691151142120361, "learning_rate": 1.999856383433752e-05, "loss": 0.0687, "step": 59630 }, { "epoch": 71.94266747133373, "grad_norm": 6.233589172363281, "learning_rate": 1.9998563593082726e-05, "loss": 0.067, "step": 59640 }, { "epoch": 71.95473747736874, "grad_norm": 6.069468975067139, "learning_rate": 
1.9998563351827932e-05, "loss": 0.066, "step": 59650 }, { "epoch": 71.96680748340374, "grad_norm": 6.205773830413818, "learning_rate": 1.999856311057314e-05, "loss": 0.0683, "step": 59660 }, { "epoch": 71.97887748943874, "grad_norm": 5.994094371795654, "learning_rate": 1.9998562869318345e-05, "loss": 0.0705, "step": 59670 }, { "epoch": 71.99094749547375, "grad_norm": 5.962924957275391, "learning_rate": 1.9998562628063548e-05, "loss": 0.0694, "step": 59680 }, { "epoch": 72.002414001207, "grad_norm": 5.118126392364502, "learning_rate": 1.9998562386808754e-05, "loss": 0.0632, "step": 59690 }, { "epoch": 72.014484007242, "grad_norm": 4.989163875579834, "learning_rate": 1.999856214555396e-05, "loss": 0.0457, "step": 59700 }, { "epoch": 72.026554013277, "grad_norm": 5.129473686218262, "learning_rate": 1.9998561904299166e-05, "loss": 0.0483, "step": 59710 }, { "epoch": 72.03862401931201, "grad_norm": 5.394372463226318, "learning_rate": 1.9998561663044372e-05, "loss": 0.052, "step": 59720 }, { "epoch": 72.05069402534701, "grad_norm": 5.10278844833374, "learning_rate": 1.999856142178958e-05, "loss": 0.051, "step": 59730 }, { "epoch": 72.06276403138202, "grad_norm": 5.475245475769043, "learning_rate": 1.9998561180534785e-05, "loss": 0.0559, "step": 59740 }, { "epoch": 72.07483403741702, "grad_norm": 5.439391136169434, "learning_rate": 1.999856093927999e-05, "loss": 0.053, "step": 59750 }, { "epoch": 72.08690404345202, "grad_norm": 5.402289867401123, "learning_rate": 1.9998560698025197e-05, "loss": 0.0539, "step": 59760 }, { "epoch": 72.09897404948703, "grad_norm": 4.782583713531494, "learning_rate": 1.9998560456770404e-05, "loss": 0.0554, "step": 59770 }, { "epoch": 72.11104405552203, "grad_norm": 5.135707378387451, "learning_rate": 1.999856021551561e-05, "loss": 0.0548, "step": 59780 }, { "epoch": 72.12311406155703, "grad_norm": 5.0493597984313965, "learning_rate": 1.9998559974260816e-05, "loss": 0.0555, "step": 59790 }, { "epoch": 72.13518406759204, "grad_norm": 
5.462370872497559, "learning_rate": 1.9998559733006026e-05, "loss": 0.0551, "step": 59800 }, { "epoch": 72.14725407362704, "grad_norm": 5.659482002258301, "learning_rate": 1.9998559491751232e-05, "loss": 0.0557, "step": 59810 }, { "epoch": 72.15932407966204, "grad_norm": 5.316836357116699, "learning_rate": 1.9998559250496438e-05, "loss": 0.0563, "step": 59820 }, { "epoch": 72.17139408569705, "grad_norm": 5.654707908630371, "learning_rate": 1.9998559009241644e-05, "loss": 0.0584, "step": 59830 }, { "epoch": 72.18346409173205, "grad_norm": 5.264445781707764, "learning_rate": 1.9998558767986847e-05, "loss": 0.0579, "step": 59840 }, { "epoch": 72.19553409776705, "grad_norm": 5.825174331665039, "learning_rate": 1.9998558526732053e-05, "loss": 0.056, "step": 59850 }, { "epoch": 72.20760410380205, "grad_norm": 5.2684807777404785, "learning_rate": 1.999855828547726e-05, "loss": 0.0579, "step": 59860 }, { "epoch": 72.21967410983706, "grad_norm": 5.171299934387207, "learning_rate": 1.9998558044222466e-05, "loss": 0.0574, "step": 59870 }, { "epoch": 72.23174411587206, "grad_norm": 5.501769542694092, "learning_rate": 1.9998557802967672e-05, "loss": 0.0595, "step": 59880 }, { "epoch": 72.24381412190706, "grad_norm": 5.32857608795166, "learning_rate": 1.9998557561712878e-05, "loss": 0.0588, "step": 59890 }, { "epoch": 72.25588412794207, "grad_norm": 5.818996906280518, "learning_rate": 1.9998557320458084e-05, "loss": 0.0599, "step": 59900 }, { "epoch": 72.26795413397707, "grad_norm": 6.104966163635254, "learning_rate": 1.999855707920329e-05, "loss": 0.0614, "step": 59910 }, { "epoch": 72.28002414001207, "grad_norm": 5.34361457824707, "learning_rate": 1.9998556837948497e-05, "loss": 0.0623, "step": 59920 }, { "epoch": 72.29209414604708, "grad_norm": 5.707705020904541, "learning_rate": 1.9998556596693703e-05, "loss": 0.063, "step": 59930 }, { "epoch": 72.30416415208208, "grad_norm": 5.51879358291626, "learning_rate": 1.999855635543891e-05, "loss": 0.0608, "step": 59940 }, { 
"epoch": 72.31623415811708, "grad_norm": 6.14491605758667, "learning_rate": 1.9998556114184115e-05, "loss": 0.0622, "step": 59950 }, { "epoch": 72.32830416415209, "grad_norm": 6.150513648986816, "learning_rate": 1.999855587292932e-05, "loss": 0.0604, "step": 59960 }, { "epoch": 72.34037417018709, "grad_norm": 5.462543964385986, "learning_rate": 1.9998555631674528e-05, "loss": 0.0609, "step": 59970 }, { "epoch": 72.3524441762221, "grad_norm": 5.547036647796631, "learning_rate": 1.9998555390419734e-05, "loss": 0.0593, "step": 59980 }, { "epoch": 72.3645141822571, "grad_norm": 5.657717704772949, "learning_rate": 1.999855514916494e-05, "loss": 0.0603, "step": 59990 }, { "epoch": 72.3765841882921, "grad_norm": 5.881928443908691, "learning_rate": 1.9998554907910146e-05, "loss": 0.0615, "step": 60000 }, { "epoch": 72.3765841882921, "eval_loss": 12.37149715423584, "eval_runtime": 8.1418, "eval_samples_per_second": 85.608, "eval_steps_per_second": 10.808, "step": 60000 }, { "epoch": 72.3886541943271, "grad_norm": 5.232429027557373, "learning_rate": 1.9998554666655353e-05, "loss": 0.0609, "step": 60010 }, { "epoch": 72.4007242003621, "grad_norm": 5.390051364898682, "learning_rate": 1.999855442540056e-05, "loss": 0.0607, "step": 60020 }, { "epoch": 72.41279420639711, "grad_norm": 6.055614471435547, "learning_rate": 1.9998554184145765e-05, "loss": 0.0613, "step": 60030 }, { "epoch": 72.42486421243211, "grad_norm": 5.903752326965332, "learning_rate": 1.999855394289097e-05, "loss": 0.0605, "step": 60040 }, { "epoch": 72.43693421846712, "grad_norm": 5.589896202087402, "learning_rate": 1.9998553701636178e-05, "loss": 0.0594, "step": 60050 }, { "epoch": 72.44900422450212, "grad_norm": 5.157318115234375, "learning_rate": 1.9998553460381384e-05, "loss": 0.0609, "step": 60060 }, { "epoch": 72.46107423053712, "grad_norm": 5.98091459274292, "learning_rate": 1.999855321912659e-05, "loss": 0.0617, "step": 60070 }, { "epoch": 72.47314423657213, "grad_norm": 5.316287517547607, 
"learning_rate": 1.9998552977871796e-05, "loss": 0.0614, "step": 60080 }, { "epoch": 72.48521424260713, "grad_norm": 5.875818252563477, "learning_rate": 1.9998552736617e-05, "loss": 0.0581, "step": 60090 }, { "epoch": 72.49728424864213, "grad_norm": 5.907900810241699, "learning_rate": 1.9998552495362205e-05, "loss": 0.0628, "step": 60100 }, { "epoch": 72.50935425467712, "grad_norm": 6.311191082000732, "learning_rate": 1.999855225410741e-05, "loss": 0.0631, "step": 60110 }, { "epoch": 72.52142426071212, "grad_norm": 6.208446025848389, "learning_rate": 1.9998552012852618e-05, "loss": 0.0667, "step": 60120 }, { "epoch": 72.53349426674713, "grad_norm": 5.619662761688232, "learning_rate": 1.9998551771597824e-05, "loss": 0.0635, "step": 60130 }, { "epoch": 72.54556427278213, "grad_norm": 5.5374064445495605, "learning_rate": 1.999855153034303e-05, "loss": 0.0648, "step": 60140 }, { "epoch": 72.55763427881713, "grad_norm": 6.080585956573486, "learning_rate": 1.9998551289088236e-05, "loss": 0.0637, "step": 60150 }, { "epoch": 72.56970428485214, "grad_norm": 5.5749640464782715, "learning_rate": 1.9998551047833443e-05, "loss": 0.0643, "step": 60160 }, { "epoch": 72.58177429088714, "grad_norm": 5.935259819030762, "learning_rate": 1.999855080657865e-05, "loss": 0.0644, "step": 60170 }, { "epoch": 72.59384429692214, "grad_norm": 5.8712687492370605, "learning_rate": 1.9998550565323855e-05, "loss": 0.0634, "step": 60180 }, { "epoch": 72.60591430295715, "grad_norm": 6.046607494354248, "learning_rate": 1.999855032406906e-05, "loss": 0.067, "step": 60190 }, { "epoch": 72.61798430899215, "grad_norm": 6.459715366363525, "learning_rate": 1.9998550082814267e-05, "loss": 0.0656, "step": 60200 }, { "epoch": 72.63005431502715, "grad_norm": 5.631467819213867, "learning_rate": 1.9998549841559474e-05, "loss": 0.0664, "step": 60210 }, { "epoch": 72.64212432106216, "grad_norm": 5.917319297790527, "learning_rate": 1.999854960030468e-05, "loss": 0.0659, "step": 60220 }, { "epoch": 
72.65419432709716, "grad_norm": 5.699784755706787, "learning_rate": 1.9998549359049886e-05, "loss": 0.0638, "step": 60230 }, { "epoch": 72.66626433313216, "grad_norm": 5.553251266479492, "learning_rate": 1.9998549117795092e-05, "loss": 0.0658, "step": 60240 }, { "epoch": 72.67833433916717, "grad_norm": 6.171426773071289, "learning_rate": 1.99985488765403e-05, "loss": 0.0687, "step": 60250 }, { "epoch": 72.69040434520217, "grad_norm": 5.466480255126953, "learning_rate": 1.9998548635285505e-05, "loss": 0.0654, "step": 60260 }, { "epoch": 72.70247435123717, "grad_norm": 5.501940727233887, "learning_rate": 1.999854839403071e-05, "loss": 0.0628, "step": 60270 }, { "epoch": 72.71454435727217, "grad_norm": 6.038638591766357, "learning_rate": 1.9998548152775917e-05, "loss": 0.0634, "step": 60280 }, { "epoch": 72.72661436330718, "grad_norm": 5.672584056854248, "learning_rate": 1.9998547911521123e-05, "loss": 0.0645, "step": 60290 }, { "epoch": 72.73868436934218, "grad_norm": 5.9994683265686035, "learning_rate": 1.999854767026633e-05, "loss": 0.0653, "step": 60300 }, { "epoch": 72.75075437537718, "grad_norm": 5.868988990783691, "learning_rate": 1.9998547429011536e-05, "loss": 0.0674, "step": 60310 }, { "epoch": 72.76282438141219, "grad_norm": 6.701897144317627, "learning_rate": 1.9998547187756742e-05, "loss": 0.0645, "step": 60320 }, { "epoch": 72.77489438744719, "grad_norm": 5.884408950805664, "learning_rate": 1.9998546946501948e-05, "loss": 0.069, "step": 60330 }, { "epoch": 72.7869643934822, "grad_norm": 5.906371116638184, "learning_rate": 1.9998546705247154e-05, "loss": 0.0655, "step": 60340 }, { "epoch": 72.7990343995172, "grad_norm": 5.838982582092285, "learning_rate": 1.999854646399236e-05, "loss": 0.0636, "step": 60350 }, { "epoch": 72.8111044055522, "grad_norm": 6.0268659591674805, "learning_rate": 1.9998546222737567e-05, "loss": 0.066, "step": 60360 }, { "epoch": 72.8231744115872, "grad_norm": 5.9342827796936035, "learning_rate": 1.9998545981482773e-05, "loss": 
0.0685, "step": 60370 }, { "epoch": 72.8352444176222, "grad_norm": 6.000491142272949, "learning_rate": 1.999854574022798e-05, "loss": 0.0665, "step": 60380 }, { "epoch": 72.84731442365721, "grad_norm": 5.943642616271973, "learning_rate": 1.9998545498973186e-05, "loss": 0.0679, "step": 60390 }, { "epoch": 72.85938442969221, "grad_norm": 5.267383098602295, "learning_rate": 1.9998545257718392e-05, "loss": 0.0695, "step": 60400 }, { "epoch": 72.87145443572722, "grad_norm": 5.8560357093811035, "learning_rate": 1.9998545016463598e-05, "loss": 0.0695, "step": 60410 }, { "epoch": 72.88352444176222, "grad_norm": 6.138646602630615, "learning_rate": 1.9998544775208804e-05, "loss": 0.0649, "step": 60420 }, { "epoch": 72.89559444779722, "grad_norm": 5.703669548034668, "learning_rate": 1.999854453395401e-05, "loss": 0.0625, "step": 60430 }, { "epoch": 72.90766445383223, "grad_norm": 6.046250343322754, "learning_rate": 1.9998544292699217e-05, "loss": 0.0684, "step": 60440 }, { "epoch": 72.91973445986723, "grad_norm": 6.296443462371826, "learning_rate": 1.9998544051444423e-05, "loss": 0.0674, "step": 60450 }, { "epoch": 72.93180446590223, "grad_norm": 5.720254898071289, "learning_rate": 1.999854381018963e-05, "loss": 0.0667, "step": 60460 }, { "epoch": 72.94387447193724, "grad_norm": 6.032215595245361, "learning_rate": 1.9998543568934835e-05, "loss": 0.0669, "step": 60470 }, { "epoch": 72.95594447797224, "grad_norm": 6.3487019538879395, "learning_rate": 1.999854332768004e-05, "loss": 0.0658, "step": 60480 }, { "epoch": 72.96801448400724, "grad_norm": 5.465012550354004, "learning_rate": 1.9998543086425248e-05, "loss": 0.07, "step": 60490 }, { "epoch": 72.98008449004224, "grad_norm": 5.079113006591797, "learning_rate": 1.9998542845170454e-05, "loss": 0.0658, "step": 60500 }, { "epoch": 72.98008449004224, "eval_loss": 12.382471084594727, "eval_runtime": 8.1401, "eval_samples_per_second": 85.625, "eval_steps_per_second": 10.811, "step": 60500 }, { "epoch": 72.99215449607725, 
"grad_norm": 5.488523006439209, "learning_rate": 1.9998542603915657e-05, "loss": 0.0637, "step": 60510 }, { "epoch": 73.0036210018105, "grad_norm": 4.981382846832275, "learning_rate": 1.9998542362660863e-05, "loss": 0.0629, "step": 60520 }, { "epoch": 73.0156910078455, "grad_norm": 4.7571258544921875, "learning_rate": 1.999854212140607e-05, "loss": 0.0465, "step": 60530 }, { "epoch": 73.02776101388051, "grad_norm": 4.617233753204346, "learning_rate": 1.9998541880151275e-05, "loss": 0.0454, "step": 60540 }, { "epoch": 73.03983101991551, "grad_norm": 5.255321979522705, "learning_rate": 1.999854163889648e-05, "loss": 0.0513, "step": 60550 }, { "epoch": 73.05190102595051, "grad_norm": 5.002791404724121, "learning_rate": 1.9998541397641688e-05, "loss": 0.0502, "step": 60560 }, { "epoch": 73.06397103198552, "grad_norm": 5.213599681854248, "learning_rate": 1.9998541156386894e-05, "loss": 0.053, "step": 60570 }, { "epoch": 73.07604103802052, "grad_norm": 5.32327127456665, "learning_rate": 1.99985409151321e-05, "loss": 0.0526, "step": 60580 }, { "epoch": 73.08811104405552, "grad_norm": 5.578052520751953, "learning_rate": 1.9998540673877306e-05, "loss": 0.0535, "step": 60590 }, { "epoch": 73.10018105009053, "grad_norm": 5.474132061004639, "learning_rate": 1.9998540432622513e-05, "loss": 0.0536, "step": 60600 }, { "epoch": 73.11225105612553, "grad_norm": 5.305715084075928, "learning_rate": 1.999854019136772e-05, "loss": 0.0525, "step": 60610 }, { "epoch": 73.12432106216053, "grad_norm": 4.875646591186523, "learning_rate": 1.9998539950112925e-05, "loss": 0.056, "step": 60620 }, { "epoch": 73.13639106819554, "grad_norm": 5.064355373382568, "learning_rate": 1.999853970885813e-05, "loss": 0.0553, "step": 60630 }, { "epoch": 73.14846107423054, "grad_norm": 5.592251777648926, "learning_rate": 1.9998539467603337e-05, "loss": 0.0561, "step": 60640 }, { "epoch": 73.16053108026554, "grad_norm": 5.952481269836426, "learning_rate": 1.9998539226348544e-05, "loss": 0.055, "step": 60650 }, 
{ "epoch": 73.17260108630055, "grad_norm": 5.1400556564331055, "learning_rate": 1.999853898509375e-05, "loss": 0.055, "step": 60660 }, { "epoch": 73.18467109233555, "grad_norm": 5.916867256164551, "learning_rate": 1.9998538743838956e-05, "loss": 0.0562, "step": 60670 }, { "epoch": 73.19674109837055, "grad_norm": 5.64821720123291, "learning_rate": 1.9998538502584162e-05, "loss": 0.0588, "step": 60680 }, { "epoch": 73.20881110440556, "grad_norm": 5.369014739990234, "learning_rate": 1.999853826132937e-05, "loss": 0.0562, "step": 60690 }, { "epoch": 73.22088111044056, "grad_norm": 5.537065505981445, "learning_rate": 1.9998538020074575e-05, "loss": 0.0573, "step": 60700 }, { "epoch": 73.23295111647556, "grad_norm": 6.029740810394287, "learning_rate": 1.999853777881978e-05, "loss": 0.0603, "step": 60710 }, { "epoch": 73.24502112251056, "grad_norm": 5.747778415679932, "learning_rate": 1.9998537537564987e-05, "loss": 0.06, "step": 60720 }, { "epoch": 73.25709112854557, "grad_norm": 5.119932651519775, "learning_rate": 1.9998537296310193e-05, "loss": 0.0563, "step": 60730 }, { "epoch": 73.26916113458057, "grad_norm": 5.693003177642822, "learning_rate": 1.99985370550554e-05, "loss": 0.0596, "step": 60740 }, { "epoch": 73.28123114061557, "grad_norm": 5.359810829162598, "learning_rate": 1.9998536813800606e-05, "loss": 0.0591, "step": 60750 }, { "epoch": 73.29330114665058, "grad_norm": 5.51974630355835, "learning_rate": 1.999853657254581e-05, "loss": 0.0567, "step": 60760 }, { "epoch": 73.30537115268558, "grad_norm": 5.946218967437744, "learning_rate": 1.9998536331291015e-05, "loss": 0.0609, "step": 60770 }, { "epoch": 73.31744115872058, "grad_norm": 5.931737422943115, "learning_rate": 1.999853609003622e-05, "loss": 0.0562, "step": 60780 }, { "epoch": 73.32951116475559, "grad_norm": 5.017658710479736, "learning_rate": 1.9998535848781427e-05, "loss": 0.0614, "step": 60790 }, { "epoch": 73.34158117079059, "grad_norm": 5.727543354034424, "learning_rate": 1.9998535607526634e-05, 
"loss": 0.0607, "step": 60800 }, { "epoch": 73.3536511768256, "grad_norm": 5.136538505554199, "learning_rate": 1.999853536627184e-05, "loss": 0.0579, "step": 60810 }, { "epoch": 73.3657211828606, "grad_norm": 5.305262565612793, "learning_rate": 1.9998535125017046e-05, "loss": 0.0576, "step": 60820 }, { "epoch": 73.3777911888956, "grad_norm": 6.341740608215332, "learning_rate": 1.9998534883762252e-05, "loss": 0.0624, "step": 60830 }, { "epoch": 73.3898611949306, "grad_norm": 5.864682674407959, "learning_rate": 1.999853464250746e-05, "loss": 0.0606, "step": 60840 }, { "epoch": 73.4019312009656, "grad_norm": 6.0062432289123535, "learning_rate": 1.9998534401252665e-05, "loss": 0.0636, "step": 60850 }, { "epoch": 73.41400120700061, "grad_norm": 5.329391002655029, "learning_rate": 1.999853415999787e-05, "loss": 0.0617, "step": 60860 }, { "epoch": 73.42607121303561, "grad_norm": 6.072780609130859, "learning_rate": 1.9998533918743077e-05, "loss": 0.0606, "step": 60870 }, { "epoch": 73.43814121907062, "grad_norm": 5.7192888259887695, "learning_rate": 1.9998533677488287e-05, "loss": 0.0615, "step": 60880 }, { "epoch": 73.45021122510562, "grad_norm": 5.517786979675293, "learning_rate": 1.9998533436233493e-05, "loss": 0.0616, "step": 60890 }, { "epoch": 73.46228123114062, "grad_norm": 5.5367045402526855, "learning_rate": 1.99985331949787e-05, "loss": 0.0609, "step": 60900 }, { "epoch": 73.47435123717563, "grad_norm": 5.595832824707031, "learning_rate": 1.9998532953723905e-05, "loss": 0.0632, "step": 60910 }, { "epoch": 73.48642124321063, "grad_norm": 5.670534610748291, "learning_rate": 1.9998532712469108e-05, "loss": 0.0628, "step": 60920 }, { "epoch": 73.49849124924563, "grad_norm": 5.706480979919434, "learning_rate": 1.9998532471214314e-05, "loss": 0.0655, "step": 60930 }, { "epoch": 73.51056125528062, "grad_norm": 5.850586414337158, "learning_rate": 1.999853222995952e-05, "loss": 0.0619, "step": 60940 }, { "epoch": 73.52263126131562, "grad_norm": 5.705413341522217, 
"learning_rate": 1.9998531988704727e-05, "loss": 0.0635, "step": 60950 }, { "epoch": 73.53470126735063, "grad_norm": 5.443399429321289, "learning_rate": 1.9998531747449933e-05, "loss": 0.0623, "step": 60960 }, { "epoch": 73.54677127338563, "grad_norm": 6.033102512359619, "learning_rate": 1.999853150619514e-05, "loss": 0.0638, "step": 60970 }, { "epoch": 73.55884127942063, "grad_norm": 5.999614715576172, "learning_rate": 1.9998531264940345e-05, "loss": 0.0652, "step": 60980 }, { "epoch": 73.57091128545564, "grad_norm": 5.298684597015381, "learning_rate": 1.999853102368555e-05, "loss": 0.0607, "step": 60990 }, { "epoch": 73.58298129149064, "grad_norm": 5.835437297821045, "learning_rate": 1.9998530782430758e-05, "loss": 0.061, "step": 61000 }, { "epoch": 73.58298129149064, "eval_loss": 12.39999008178711, "eval_runtime": 8.1429, "eval_samples_per_second": 85.597, "eval_steps_per_second": 10.807, "step": 61000 }, { "epoch": 73.59505129752564, "grad_norm": 6.283607006072998, "learning_rate": 1.9998530541175964e-05, "loss": 0.0641, "step": 61010 }, { "epoch": 73.60712130356065, "grad_norm": 5.8974690437316895, "learning_rate": 1.999853029992117e-05, "loss": 0.0637, "step": 61020 }, { "epoch": 73.61919130959565, "grad_norm": 6.1489362716674805, "learning_rate": 1.9998530058666377e-05, "loss": 0.0633, "step": 61030 }, { "epoch": 73.63126131563065, "grad_norm": 5.382430076599121, "learning_rate": 1.9998529817411583e-05, "loss": 0.0623, "step": 61040 }, { "epoch": 73.64333132166566, "grad_norm": 5.975822925567627, "learning_rate": 1.999852957615679e-05, "loss": 0.0631, "step": 61050 }, { "epoch": 73.65540132770066, "grad_norm": 5.5209059715271, "learning_rate": 1.9998529334901995e-05, "loss": 0.064, "step": 61060 }, { "epoch": 73.66747133373566, "grad_norm": 5.479202747344971, "learning_rate": 1.99985290936472e-05, "loss": 0.0637, "step": 61070 }, { "epoch": 73.67954133977067, "grad_norm": 5.775239944458008, "learning_rate": 1.9998528852392408e-05, "loss": 0.0642, "step": 
61080 }, { "epoch": 73.69161134580567, "grad_norm": 5.927867412567139, "learning_rate": 1.9998528611137614e-05, "loss": 0.0653, "step": 61090 }, { "epoch": 73.70368135184067, "grad_norm": 5.93231725692749, "learning_rate": 1.999852836988282e-05, "loss": 0.0655, "step": 61100 }, { "epoch": 73.71575135787567, "grad_norm": 5.846649646759033, "learning_rate": 1.9998528128628026e-05, "loss": 0.0636, "step": 61110 }, { "epoch": 73.72782136391068, "grad_norm": 5.894708156585693, "learning_rate": 1.9998527887373232e-05, "loss": 0.0633, "step": 61120 }, { "epoch": 73.73989136994568, "grad_norm": 5.888668537139893, "learning_rate": 1.999852764611844e-05, "loss": 0.0654, "step": 61130 }, { "epoch": 73.75196137598068, "grad_norm": 6.584202289581299, "learning_rate": 1.9998527404863645e-05, "loss": 0.0657, "step": 61140 }, { "epoch": 73.76403138201569, "grad_norm": 5.618241310119629, "learning_rate": 1.999852716360885e-05, "loss": 0.0673, "step": 61150 }, { "epoch": 73.77610138805069, "grad_norm": 5.791197776794434, "learning_rate": 1.9998526922354057e-05, "loss": 0.0652, "step": 61160 }, { "epoch": 73.7881713940857, "grad_norm": 5.997535705566406, "learning_rate": 1.999852668109926e-05, "loss": 0.0647, "step": 61170 }, { "epoch": 73.8002414001207, "grad_norm": 5.379873752593994, "learning_rate": 1.9998526439844466e-05, "loss": 0.0636, "step": 61180 }, { "epoch": 73.8123114061557, "grad_norm": 5.44938325881958, "learning_rate": 1.9998526198589673e-05, "loss": 0.065, "step": 61190 }, { "epoch": 73.8243814121907, "grad_norm": 5.9330220222473145, "learning_rate": 1.999852595733488e-05, "loss": 0.0647, "step": 61200 }, { "epoch": 73.8364514182257, "grad_norm": 6.166661262512207, "learning_rate": 1.9998525716080085e-05, "loss": 0.0677, "step": 61210 }, { "epoch": 73.84852142426071, "grad_norm": 5.858083248138428, "learning_rate": 1.999852547482529e-05, "loss": 0.0663, "step": 61220 }, { "epoch": 73.86059143029571, "grad_norm": 6.01976203918457, "learning_rate": 
1.9998525233570497e-05, "loss": 0.0673, "step": 61230 }, { "epoch": 73.87266143633072, "grad_norm": 5.734703063964844, "learning_rate": 1.9998524992315704e-05, "loss": 0.0688, "step": 61240 }, { "epoch": 73.88473144236572, "grad_norm": 6.102864742279053, "learning_rate": 1.999852475106091e-05, "loss": 0.0693, "step": 61250 }, { "epoch": 73.89680144840072, "grad_norm": 5.782594680786133, "learning_rate": 1.9998524509806116e-05, "loss": 0.0675, "step": 61260 }, { "epoch": 73.90887145443573, "grad_norm": 6.206390380859375, "learning_rate": 1.9998524268551322e-05, "loss": 0.067, "step": 61270 }, { "epoch": 73.92094146047073, "grad_norm": 6.559313774108887, "learning_rate": 1.999852402729653e-05, "loss": 0.0699, "step": 61280 }, { "epoch": 73.93301146650573, "grad_norm": 6.021063804626465, "learning_rate": 1.9998523786041735e-05, "loss": 0.0661, "step": 61290 }, { "epoch": 73.94508147254074, "grad_norm": 6.339284896850586, "learning_rate": 1.999852354478694e-05, "loss": 0.0681, "step": 61300 }, { "epoch": 73.95715147857574, "grad_norm": 5.978257179260254, "learning_rate": 1.9998523303532147e-05, "loss": 0.0684, "step": 61310 }, { "epoch": 73.96922148461074, "grad_norm": 5.763051509857178, "learning_rate": 1.9998523062277353e-05, "loss": 0.0652, "step": 61320 }, { "epoch": 73.98129149064575, "grad_norm": 6.3755784034729, "learning_rate": 1.999852282102256e-05, "loss": 0.0684, "step": 61330 }, { "epoch": 73.99336149668075, "grad_norm": 6.341646194458008, "learning_rate": 1.9998522579767766e-05, "loss": 0.0686, "step": 61340 }, { "epoch": 74.004828002414, "grad_norm": 4.390363693237305, "learning_rate": 1.9998522338512972e-05, "loss": 0.059, "step": 61350 }, { "epoch": 74.016898008449, "grad_norm": 5.0647807121276855, "learning_rate": 1.9998522097258178e-05, "loss": 0.049, "step": 61360 }, { "epoch": 74.02896801448401, "grad_norm": 5.060196876525879, "learning_rate": 1.9998521856003384e-05, "loss": 0.0457, "step": 61370 }, { "epoch": 74.04103802051901, "grad_norm": 
4.944368839263916, "learning_rate": 1.999852161474859e-05, "loss": 0.0494, "step": 61380 }, { "epoch": 74.05310802655401, "grad_norm": 5.638315200805664, "learning_rate": 1.9998521373493797e-05, "loss": 0.0519, "step": 61390 }, { "epoch": 74.06517803258902, "grad_norm": 5.5710368156433105, "learning_rate": 1.9998521132239003e-05, "loss": 0.0517, "step": 61400 }, { "epoch": 74.07724803862402, "grad_norm": 4.506527423858643, "learning_rate": 1.999852089098421e-05, "loss": 0.0536, "step": 61410 }, { "epoch": 74.08931804465902, "grad_norm": 5.958456516265869, "learning_rate": 1.9998520649729416e-05, "loss": 0.0518, "step": 61420 }, { "epoch": 74.10138805069403, "grad_norm": 5.5072736740112305, "learning_rate": 1.9998520408474622e-05, "loss": 0.0519, "step": 61430 }, { "epoch": 74.11345805672903, "grad_norm": 5.762560844421387, "learning_rate": 1.9998520167219828e-05, "loss": 0.0531, "step": 61440 }, { "epoch": 74.12552806276403, "grad_norm": 5.5344929695129395, "learning_rate": 1.9998519925965034e-05, "loss": 0.055, "step": 61450 }, { "epoch": 74.13759806879904, "grad_norm": 5.127845764160156, "learning_rate": 1.999851968471024e-05, "loss": 0.0544, "step": 61460 }, { "epoch": 74.14966807483404, "grad_norm": 5.7445831298828125, "learning_rate": 1.9998519443455447e-05, "loss": 0.0586, "step": 61470 }, { "epoch": 74.16173808086904, "grad_norm": 6.034804344177246, "learning_rate": 1.9998519202200653e-05, "loss": 0.0566, "step": 61480 }, { "epoch": 74.17380808690405, "grad_norm": 5.669587135314941, "learning_rate": 1.999851896094586e-05, "loss": 0.0571, "step": 61490 }, { "epoch": 74.18587809293905, "grad_norm": 5.592026233673096, "learning_rate": 1.9998518719691065e-05, "loss": 0.0575, "step": 61500 }, { "epoch": 74.18587809293905, "eval_loss": 12.411149978637695, "eval_runtime": 8.1405, "eval_samples_per_second": 85.621, "eval_steps_per_second": 10.81, "step": 61500 }, { "epoch": 74.19794809897405, "grad_norm": 4.747236728668213, "learning_rate": 1.999851847843627e-05, 
"loss": 0.0577, "step": 61510 }, { "epoch": 74.21001810500906, "grad_norm": 5.214375972747803, "learning_rate": 1.9998518237181478e-05, "loss": 0.0555, "step": 61520 }, { "epoch": 74.22208811104406, "grad_norm": 5.983163356781006, "learning_rate": 1.9998517995926684e-05, "loss": 0.0595, "step": 61530 }, { "epoch": 74.23415811707906, "grad_norm": 4.804268836975098, "learning_rate": 1.999851775467189e-05, "loss": 0.0568, "step": 61540 }, { "epoch": 74.24622812311407, "grad_norm": 5.64279842376709, "learning_rate": 1.9998517513417096e-05, "loss": 0.0555, "step": 61550 }, { "epoch": 74.25829812914907, "grad_norm": 5.131346702575684, "learning_rate": 1.9998517272162303e-05, "loss": 0.0541, "step": 61560 }, { "epoch": 74.27036813518407, "grad_norm": 5.776472091674805, "learning_rate": 1.999851703090751e-05, "loss": 0.0589, "step": 61570 }, { "epoch": 74.28243814121907, "grad_norm": 5.610470294952393, "learning_rate": 1.999851678965271e-05, "loss": 0.0595, "step": 61580 }, { "epoch": 74.29450814725408, "grad_norm": 5.327671527862549, "learning_rate": 1.9998516548397918e-05, "loss": 0.0587, "step": 61590 }, { "epoch": 74.30657815328908, "grad_norm": 5.814975738525391, "learning_rate": 1.9998516307143124e-05, "loss": 0.0614, "step": 61600 }, { "epoch": 74.31864815932408, "grad_norm": 5.282426834106445, "learning_rate": 1.999851606588833e-05, "loss": 0.0585, "step": 61610 }, { "epoch": 74.33071816535909, "grad_norm": 5.157220363616943, "learning_rate": 1.9998515824633536e-05, "loss": 0.0589, "step": 61620 }, { "epoch": 74.34278817139409, "grad_norm": 5.424541473388672, "learning_rate": 1.9998515583378743e-05, "loss": 0.058, "step": 61630 }, { "epoch": 74.3548581774291, "grad_norm": 5.826629161834717, "learning_rate": 1.999851534212395e-05, "loss": 0.06, "step": 61640 }, { "epoch": 74.3669281834641, "grad_norm": 5.648942470550537, "learning_rate": 1.9998515100869155e-05, "loss": 0.0592, "step": 61650 }, { "epoch": 74.3789981894991, "grad_norm": 5.171614646911621, 
"learning_rate": 1.999851485961436e-05, "loss": 0.0612, "step": 61660 }, { "epoch": 74.3910681955341, "grad_norm": 5.553709983825684, "learning_rate": 1.9998514618359568e-05, "loss": 0.0595, "step": 61670 }, { "epoch": 74.4031382015691, "grad_norm": 5.554616451263428, "learning_rate": 1.9998514377104774e-05, "loss": 0.0597, "step": 61680 }, { "epoch": 74.41520820760411, "grad_norm": 5.955418586730957, "learning_rate": 1.999851413584998e-05, "loss": 0.0615, "step": 61690 }, { "epoch": 74.42727821363911, "grad_norm": 5.593278408050537, "learning_rate": 1.9998513894595186e-05, "loss": 0.0617, "step": 61700 }, { "epoch": 74.43934821967412, "grad_norm": 5.1773152351379395, "learning_rate": 1.9998513653340392e-05, "loss": 0.0611, "step": 61710 }, { "epoch": 74.45141822570912, "grad_norm": 6.265524864196777, "learning_rate": 1.99985134120856e-05, "loss": 0.0628, "step": 61720 }, { "epoch": 74.46348823174412, "grad_norm": 6.211633205413818, "learning_rate": 1.9998513170830805e-05, "loss": 0.0612, "step": 61730 }, { "epoch": 74.47555823777913, "grad_norm": 5.713349342346191, "learning_rate": 1.999851292957601e-05, "loss": 0.062, "step": 61740 }, { "epoch": 74.48762824381413, "grad_norm": 5.324471950531006, "learning_rate": 1.9998512688321217e-05, "loss": 0.0616, "step": 61750 }, { "epoch": 74.49969824984913, "grad_norm": 5.718802452087402, "learning_rate": 1.9998512447066423e-05, "loss": 0.0615, "step": 61760 }, { "epoch": 74.51176825588412, "grad_norm": 5.797171592712402, "learning_rate": 1.999851220581163e-05, "loss": 0.0612, "step": 61770 }, { "epoch": 74.52383826191912, "grad_norm": 5.416046142578125, "learning_rate": 1.9998511964556836e-05, "loss": 0.0587, "step": 61780 }, { "epoch": 74.53590826795413, "grad_norm": 5.621153354644775, "learning_rate": 1.9998511723302042e-05, "loss": 0.0625, "step": 61790 }, { "epoch": 74.54797827398913, "grad_norm": 6.100666046142578, "learning_rate": 1.999851148204725e-05, "loss": 0.0621, "step": 61800 }, { "epoch": 74.56004828002413, 
"grad_norm": 5.049992561340332, "learning_rate": 1.9998511240792455e-05, "loss": 0.061, "step": 61810 }, { "epoch": 74.57211828605914, "grad_norm": 5.939859867095947, "learning_rate": 1.999851099953766e-05, "loss": 0.0632, "step": 61820 }, { "epoch": 74.58418829209414, "grad_norm": 5.672728538513184, "learning_rate": 1.9998510758282864e-05, "loss": 0.0654, "step": 61830 }, { "epoch": 74.59625829812914, "grad_norm": 5.757481098175049, "learning_rate": 1.999851051702807e-05, "loss": 0.0625, "step": 61840 }, { "epoch": 74.60832830416415, "grad_norm": 5.459110736846924, "learning_rate": 1.9998510275773276e-05, "loss": 0.06, "step": 61850 }, { "epoch": 74.62039831019915, "grad_norm": 5.907842636108398, "learning_rate": 1.9998510034518482e-05, "loss": 0.0624, "step": 61860 }, { "epoch": 74.63246831623415, "grad_norm": 5.437806606292725, "learning_rate": 1.999850979326369e-05, "loss": 0.0619, "step": 61870 }, { "epoch": 74.64453832226916, "grad_norm": 5.651839256286621, "learning_rate": 1.9998509552008895e-05, "loss": 0.065, "step": 61880 }, { "epoch": 74.65660832830416, "grad_norm": 5.3466949462890625, "learning_rate": 1.99985093107541e-05, "loss": 0.0636, "step": 61890 }, { "epoch": 74.66867833433916, "grad_norm": 6.124446392059326, "learning_rate": 1.9998509069499307e-05, "loss": 0.0617, "step": 61900 }, { "epoch": 74.68074834037417, "grad_norm": 5.113577842712402, "learning_rate": 1.9998508828244513e-05, "loss": 0.0629, "step": 61910 }, { "epoch": 74.69281834640917, "grad_norm": 4.845737934112549, "learning_rate": 1.999850858698972e-05, "loss": 0.0631, "step": 61920 }, { "epoch": 74.70488835244417, "grad_norm": 6.007425308227539, "learning_rate": 1.9998508345734926e-05, "loss": 0.0633, "step": 61930 }, { "epoch": 74.71695835847918, "grad_norm": 5.439863204956055, "learning_rate": 1.9998508104480132e-05, "loss": 0.0627, "step": 61940 }, { "epoch": 74.72902836451418, "grad_norm": 6.112880229949951, "learning_rate": 1.9998507863225338e-05, "loss": 0.0641, "step": 61950 
}, { "epoch": 74.74109837054918, "grad_norm": 6.262854099273682, "learning_rate": 1.9998507621970548e-05, "loss": 0.065, "step": 61960 }, { "epoch": 74.75316837658418, "grad_norm": 5.848293781280518, "learning_rate": 1.9998507380715754e-05, "loss": 0.0634, "step": 61970 }, { "epoch": 74.76523838261919, "grad_norm": 5.698366641998291, "learning_rate": 1.999850713946096e-05, "loss": 0.0674, "step": 61980 }, { "epoch": 74.77730838865419, "grad_norm": 5.879347801208496, "learning_rate": 1.9998506898206166e-05, "loss": 0.0644, "step": 61990 }, { "epoch": 74.7893783946892, "grad_norm": 5.407710075378418, "learning_rate": 1.999850665695137e-05, "loss": 0.062, "step": 62000 }, { "epoch": 74.7893783946892, "eval_loss": 12.413745880126953, "eval_runtime": 8.14, "eval_samples_per_second": 85.627, "eval_steps_per_second": 10.811, "step": 62000 }, { "epoch": 74.8014484007242, "grad_norm": 6.15757942199707, "learning_rate": 1.9998506415696575e-05, "loss": 0.0637, "step": 62010 }, { "epoch": 74.8135184067592, "grad_norm": 5.764975070953369, "learning_rate": 1.999850617444178e-05, "loss": 0.0648, "step": 62020 }, { "epoch": 74.8255884127942, "grad_norm": 5.485536575317383, "learning_rate": 1.9998505933186988e-05, "loss": 0.0663, "step": 62030 }, { "epoch": 74.83765841882921, "grad_norm": 5.759946823120117, "learning_rate": 1.9998505691932194e-05, "loss": 0.0643, "step": 62040 }, { "epoch": 74.84972842486421, "grad_norm": 5.575753211975098, "learning_rate": 1.99985054506774e-05, "loss": 0.0667, "step": 62050 }, { "epoch": 74.86179843089921, "grad_norm": 5.381206512451172, "learning_rate": 1.9998505209422607e-05, "loss": 0.0691, "step": 62060 }, { "epoch": 74.87386843693422, "grad_norm": 5.837087631225586, "learning_rate": 1.9998504968167813e-05, "loss": 0.0648, "step": 62070 }, { "epoch": 74.88593844296922, "grad_norm": 5.84214973449707, "learning_rate": 1.999850472691302e-05, "loss": 0.0672, "step": 62080 }, { "epoch": 74.89800844900422, "grad_norm": 5.728078842163086, 
"learning_rate": 1.9998504485658225e-05, "loss": 0.065, "step": 62090 }, { "epoch": 74.91007845503923, "grad_norm": 6.220569133758545, "learning_rate": 1.999850424440343e-05, "loss": 0.0677, "step": 62100 }, { "epoch": 74.92214846107423, "grad_norm": 5.504183292388916, "learning_rate": 1.9998504003148638e-05, "loss": 0.0655, "step": 62110 }, { "epoch": 74.93421846710923, "grad_norm": 6.160990238189697, "learning_rate": 1.9998503761893844e-05, "loss": 0.0659, "step": 62120 }, { "epoch": 74.94628847314424, "grad_norm": 6.104223728179932, "learning_rate": 1.999850352063905e-05, "loss": 0.0655, "step": 62130 }, { "epoch": 74.95835847917924, "grad_norm": 5.7260308265686035, "learning_rate": 1.9998503279384256e-05, "loss": 0.0677, "step": 62140 }, { "epoch": 74.97042848521424, "grad_norm": 5.94151496887207, "learning_rate": 1.9998503038129462e-05, "loss": 0.0669, "step": 62150 }, { "epoch": 74.98249849124925, "grad_norm": 5.4488325119018555, "learning_rate": 1.999850279687467e-05, "loss": 0.0649, "step": 62160 }, { "epoch": 74.99456849728425, "grad_norm": 6.4454498291015625, "learning_rate": 1.9998502555619875e-05, "loss": 0.0677, "step": 62170 }, { "epoch": 75.0060350030175, "grad_norm": 5.041813850402832, "learning_rate": 1.999850231436508e-05, "loss": 0.0559, "step": 62180 }, { "epoch": 75.0181050090525, "grad_norm": 4.732917308807373, "learning_rate": 1.9998502073110287e-05, "loss": 0.0457, "step": 62190 }, { "epoch": 75.03017501508751, "grad_norm": 5.156983852386475, "learning_rate": 1.9998501831855494e-05, "loss": 0.0481, "step": 62200 }, { "epoch": 75.04224502112251, "grad_norm": 4.859971523284912, "learning_rate": 1.99985015906007e-05, "loss": 0.048, "step": 62210 }, { "epoch": 75.05431502715751, "grad_norm": 5.480277061462402, "learning_rate": 1.9998501349345906e-05, "loss": 0.0479, "step": 62220 }, { "epoch": 75.06638503319252, "grad_norm": 5.093904495239258, "learning_rate": 1.9998501108091112e-05, "loss": 0.0512, "step": 62230 }, { "epoch": 75.07845503922752, 
"grad_norm": 4.614807605743408, "learning_rate": 1.999850086683632e-05, "loss": 0.0494, "step": 62240 }, { "epoch": 75.09052504526252, "grad_norm": 4.9979023933410645, "learning_rate": 1.999850062558152e-05, "loss": 0.0519, "step": 62250 }, { "epoch": 75.10259505129753, "grad_norm": 5.853631496429443, "learning_rate": 1.9998500384326727e-05, "loss": 0.0554, "step": 62260 }, { "epoch": 75.11466505733253, "grad_norm": 4.858168125152588, "learning_rate": 1.9998500143071934e-05, "loss": 0.0518, "step": 62270 }, { "epoch": 75.12673506336753, "grad_norm": 5.051150798797607, "learning_rate": 1.999849990181714e-05, "loss": 0.0532, "step": 62280 }, { "epoch": 75.13880506940254, "grad_norm": 5.825297832489014, "learning_rate": 1.9998499660562346e-05, "loss": 0.0515, "step": 62290 }, { "epoch": 75.15087507543754, "grad_norm": 4.916078090667725, "learning_rate": 1.9998499419307552e-05, "loss": 0.0526, "step": 62300 }, { "epoch": 75.16294508147254, "grad_norm": 5.213300704956055, "learning_rate": 1.999849917805276e-05, "loss": 0.0551, "step": 62310 }, { "epoch": 75.17501508750755, "grad_norm": 4.7943925857543945, "learning_rate": 1.9998498936797965e-05, "loss": 0.0545, "step": 62320 }, { "epoch": 75.18708509354255, "grad_norm": 5.656097412109375, "learning_rate": 1.999849869554317e-05, "loss": 0.0545, "step": 62330 }, { "epoch": 75.19915509957755, "grad_norm": 6.054347515106201, "learning_rate": 1.9998498454288377e-05, "loss": 0.0541, "step": 62340 }, { "epoch": 75.21122510561256, "grad_norm": 4.839255332946777, "learning_rate": 1.9998498213033583e-05, "loss": 0.0555, "step": 62350 }, { "epoch": 75.22329511164756, "grad_norm": 5.0553364753723145, "learning_rate": 1.999849797177879e-05, "loss": 0.0574, "step": 62360 }, { "epoch": 75.23536511768256, "grad_norm": 5.823068141937256, "learning_rate": 1.9998497730523996e-05, "loss": 0.0575, "step": 62370 }, { "epoch": 75.24743512371757, "grad_norm": 5.5859761238098145, "learning_rate": 1.9998497489269202e-05, "loss": 0.0561, "step": 
62380 }, { "epoch": 75.25950512975257, "grad_norm": 5.308485984802246, "learning_rate": 1.9998497248014408e-05, "loss": 0.0569, "step": 62390 }, { "epoch": 75.27157513578757, "grad_norm": 5.720944404602051, "learning_rate": 1.9998497006759614e-05, "loss": 0.0595, "step": 62400 }, { "epoch": 75.28364514182257, "grad_norm": 4.9244256019592285, "learning_rate": 1.999849676550482e-05, "loss": 0.0548, "step": 62410 }, { "epoch": 75.29571514785758, "grad_norm": 5.372369289398193, "learning_rate": 1.9998496524250027e-05, "loss": 0.0584, "step": 62420 }, { "epoch": 75.30778515389258, "grad_norm": 5.854755878448486, "learning_rate": 1.9998496282995233e-05, "loss": 0.0566, "step": 62430 }, { "epoch": 75.31985515992758, "grad_norm": 5.59868049621582, "learning_rate": 1.999849604174044e-05, "loss": 0.0586, "step": 62440 }, { "epoch": 75.33192516596259, "grad_norm": 5.5240068435668945, "learning_rate": 1.9998495800485646e-05, "loss": 0.0591, "step": 62450 }, { "epoch": 75.34399517199759, "grad_norm": 5.0304646492004395, "learning_rate": 1.9998495559230852e-05, "loss": 0.0574, "step": 62460 }, { "epoch": 75.3560651780326, "grad_norm": 4.9630126953125, "learning_rate": 1.9998495317976058e-05, "loss": 0.0578, "step": 62470 }, { "epoch": 75.3681351840676, "grad_norm": 5.261146068572998, "learning_rate": 1.9998495076721264e-05, "loss": 0.0585, "step": 62480 }, { "epoch": 75.3802051901026, "grad_norm": 5.84384298324585, "learning_rate": 1.999849483546647e-05, "loss": 0.0587, "step": 62490 }, { "epoch": 75.3922751961376, "grad_norm": 5.14823579788208, "learning_rate": 1.9998494594211677e-05, "loss": 0.0562, "step": 62500 }, { "epoch": 75.3922751961376, "eval_loss": 12.445108413696289, "eval_runtime": 8.1539, "eval_samples_per_second": 85.481, "eval_steps_per_second": 10.792, "step": 62500 }, { "epoch": 75.4043452021726, "grad_norm": 5.646452903747559, "learning_rate": 1.9998494352956883e-05, "loss": 0.0576, "step": 62510 }, { "epoch": 75.41641520820761, "grad_norm": 5.356278419494629, 
"learning_rate": 1.999849411170209e-05, "loss": 0.0592, "step": 62520 }, { "epoch": 75.42848521424261, "grad_norm": 5.715282440185547, "learning_rate": 1.9998493870447295e-05, "loss": 0.0592, "step": 62530 }, { "epoch": 75.44055522027762, "grad_norm": 5.579573154449463, "learning_rate": 1.99984936291925e-05, "loss": 0.0588, "step": 62540 }, { "epoch": 75.45262522631262, "grad_norm": 5.752250671386719, "learning_rate": 1.9998493387937708e-05, "loss": 0.0596, "step": 62550 }, { "epoch": 75.46469523234762, "grad_norm": 5.1672186851501465, "learning_rate": 1.9998493146682914e-05, "loss": 0.0606, "step": 62560 }, { "epoch": 75.47676523838263, "grad_norm": 5.569907188415527, "learning_rate": 1.999849290542812e-05, "loss": 0.059, "step": 62570 }, { "epoch": 75.48883524441763, "grad_norm": 5.501315116882324, "learning_rate": 1.9998492664173326e-05, "loss": 0.0593, "step": 62580 }, { "epoch": 75.50090525045263, "grad_norm": 5.678081512451172, "learning_rate": 1.9998492422918533e-05, "loss": 0.0585, "step": 62590 }, { "epoch": 75.51297525648762, "grad_norm": 5.361264228820801, "learning_rate": 1.999849218166374e-05, "loss": 0.0615, "step": 62600 }, { "epoch": 75.52504526252262, "grad_norm": 5.84285306930542, "learning_rate": 1.9998491940408945e-05, "loss": 0.0625, "step": 62610 }, { "epoch": 75.53711526855763, "grad_norm": 5.657688617706299, "learning_rate": 1.999849169915415e-05, "loss": 0.0626, "step": 62620 }, { "epoch": 75.54918527459263, "grad_norm": 5.562789440155029, "learning_rate": 1.9998491457899357e-05, "loss": 0.0615, "step": 62630 }, { "epoch": 75.56125528062763, "grad_norm": 5.281566143035889, "learning_rate": 1.9998491216644564e-05, "loss": 0.0617, "step": 62640 }, { "epoch": 75.57332528666264, "grad_norm": 5.337192535400391, "learning_rate": 1.999849097538977e-05, "loss": 0.0602, "step": 62650 }, { "epoch": 75.58539529269764, "grad_norm": 5.372178077697754, "learning_rate": 1.9998490734134973e-05, "loss": 0.064, "step": 62660 }, { "epoch": 75.59746529873264, 
"grad_norm": 6.509609222412109, "learning_rate": 1.999849049288018e-05, "loss": 0.0636, "step": 62670 }, { "epoch": 75.60953530476765, "grad_norm": 6.142884731292725, "learning_rate": 1.9998490251625385e-05, "loss": 0.0641, "step": 62680 }, { "epoch": 75.62160531080265, "grad_norm": 5.832804203033447, "learning_rate": 1.999849001037059e-05, "loss": 0.0626, "step": 62690 }, { "epoch": 75.63367531683765, "grad_norm": 6.097710609436035, "learning_rate": 1.9998489769115798e-05, "loss": 0.0659, "step": 62700 }, { "epoch": 75.64574532287266, "grad_norm": 5.594598293304443, "learning_rate": 1.9998489527861004e-05, "loss": 0.0627, "step": 62710 }, { "epoch": 75.65781532890766, "grad_norm": 6.117173194885254, "learning_rate": 1.999848928660621e-05, "loss": 0.0646, "step": 62720 }, { "epoch": 75.66988533494266, "grad_norm": 5.917664051055908, "learning_rate": 1.9998489045351416e-05, "loss": 0.0615, "step": 62730 }, { "epoch": 75.68195534097767, "grad_norm": 5.692289352416992, "learning_rate": 1.9998488804096622e-05, "loss": 0.0634, "step": 62740 }, { "epoch": 75.69402534701267, "grad_norm": 5.721167087554932, "learning_rate": 1.999848856284183e-05, "loss": 0.0642, "step": 62750 }, { "epoch": 75.70609535304767, "grad_norm": 5.850229740142822, "learning_rate": 1.9998488321587035e-05, "loss": 0.0629, "step": 62760 }, { "epoch": 75.71816535908268, "grad_norm": 5.855273246765137, "learning_rate": 1.999848808033224e-05, "loss": 0.0651, "step": 62770 }, { "epoch": 75.73023536511768, "grad_norm": 5.215636730194092, "learning_rate": 1.9998487839077447e-05, "loss": 0.0626, "step": 62780 }, { "epoch": 75.74230537115268, "grad_norm": 5.508934497833252, "learning_rate": 1.9998487597822653e-05, "loss": 0.0612, "step": 62790 }, { "epoch": 75.75437537718769, "grad_norm": 5.60233736038208, "learning_rate": 1.999848735656786e-05, "loss": 0.0622, "step": 62800 }, { "epoch": 75.76644538322269, "grad_norm": 5.89404821395874, "learning_rate": 1.9998487115313066e-05, "loss": 0.0639, "step": 62810 
}, { "epoch": 75.77851538925769, "grad_norm": 5.799746036529541, "learning_rate": 1.9998486874058272e-05, "loss": 0.0617, "step": 62820 }, { "epoch": 75.7905853952927, "grad_norm": 5.566964626312256, "learning_rate": 1.999848663280348e-05, "loss": 0.0627, "step": 62830 }, { "epoch": 75.8026554013277, "grad_norm": 5.987963676452637, "learning_rate": 1.9998486391548685e-05, "loss": 0.0636, "step": 62840 }, { "epoch": 75.8147254073627, "grad_norm": 5.469754219055176, "learning_rate": 1.999848615029389e-05, "loss": 0.0629, "step": 62850 }, { "epoch": 75.8267954133977, "grad_norm": 5.958178520202637, "learning_rate": 1.9998485909039097e-05, "loss": 0.0647, "step": 62860 }, { "epoch": 75.83886541943271, "grad_norm": 5.92836856842041, "learning_rate": 1.9998485667784303e-05, "loss": 0.0621, "step": 62870 }, { "epoch": 75.85093542546771, "grad_norm": 5.83736515045166, "learning_rate": 1.999848542652951e-05, "loss": 0.0653, "step": 62880 }, { "epoch": 75.86300543150271, "grad_norm": 5.723050117492676, "learning_rate": 1.9998485185274716e-05, "loss": 0.0642, "step": 62890 }, { "epoch": 75.87507543753772, "grad_norm": 5.47880220413208, "learning_rate": 1.9998484944019922e-05, "loss": 0.0668, "step": 62900 }, { "epoch": 75.88714544357272, "grad_norm": 5.39068603515625, "learning_rate": 1.9998484702765125e-05, "loss": 0.0643, "step": 62910 }, { "epoch": 75.89921544960772, "grad_norm": 6.018630027770996, "learning_rate": 1.999848446151033e-05, "loss": 0.0655, "step": 62920 }, { "epoch": 75.91128545564273, "grad_norm": 6.093274116516113, "learning_rate": 1.9998484220255537e-05, "loss": 0.0663, "step": 62930 }, { "epoch": 75.92335546167773, "grad_norm": 5.590610027313232, "learning_rate": 1.9998483979000743e-05, "loss": 0.063, "step": 62940 }, { "epoch": 75.93542546771273, "grad_norm": 5.538001537322998, "learning_rate": 1.999848373774595e-05, "loss": 0.0637, "step": 62950 }, { "epoch": 75.94749547374774, "grad_norm": 6.256042003631592, "learning_rate": 1.9998483496491156e-05, 
"loss": 0.067, "step": 62960 }, { "epoch": 75.95956547978274, "grad_norm": 5.868277072906494, "learning_rate": 1.9998483255236362e-05, "loss": 0.0682, "step": 62970 }, { "epoch": 75.97163548581774, "grad_norm": 5.71268367767334, "learning_rate": 1.9998483013981568e-05, "loss": 0.0704, "step": 62980 }, { "epoch": 75.98370549185275, "grad_norm": 5.668898105621338, "learning_rate": 1.9998482772726774e-05, "loss": 0.0667, "step": 62990 }, { "epoch": 75.99577549788775, "grad_norm": 5.4401469230651855, "learning_rate": 1.999848253147198e-05, "loss": 0.0645, "step": 63000 }, { "epoch": 75.99577549788775, "eval_loss": 12.43143081665039, "eval_runtime": 8.1366, "eval_samples_per_second": 85.662, "eval_steps_per_second": 10.815, "step": 63000 }, { "epoch": 76.007242003621, "grad_norm": 4.850845813751221, "learning_rate": 1.9998482290217187e-05, "loss": 0.0539, "step": 63010 }, { "epoch": 76.019312009656, "grad_norm": 5.0023579597473145, "learning_rate": 1.9998482048962393e-05, "loss": 0.0463, "step": 63020 }, { "epoch": 76.03138201569101, "grad_norm": 4.803595542907715, "learning_rate": 1.99984818077076e-05, "loss": 0.0458, "step": 63030 }, { "epoch": 76.04345202172601, "grad_norm": 4.865544319152832, "learning_rate": 1.999848156645281e-05, "loss": 0.0504, "step": 63040 }, { "epoch": 76.05552202776101, "grad_norm": 5.2640061378479, "learning_rate": 1.9998481325198015e-05, "loss": 0.0503, "step": 63050 }, { "epoch": 76.06759203379602, "grad_norm": 5.329946517944336, "learning_rate": 1.999848108394322e-05, "loss": 0.052, "step": 63060 }, { "epoch": 76.07966203983102, "grad_norm": 5.364921569824219, "learning_rate": 1.9998480842688428e-05, "loss": 0.0502, "step": 63070 }, { "epoch": 76.09173204586602, "grad_norm": 5.204007625579834, "learning_rate": 1.999848060143363e-05, "loss": 0.0517, "step": 63080 }, { "epoch": 76.10380205190103, "grad_norm": 5.360864639282227, "learning_rate": 1.9998480360178837e-05, "loss": 0.0514, "step": 63090 }, { "epoch": 76.11587205793603, 
"grad_norm": 4.730851650238037, "learning_rate": 1.9998480118924043e-05, "loss": 0.0541, "step": 63100 }, { "epoch": 76.12794206397103, "grad_norm": 5.38609504699707, "learning_rate": 1.999847987766925e-05, "loss": 0.0528, "step": 63110 }, { "epoch": 76.14001207000604, "grad_norm": 4.926975727081299, "learning_rate": 1.9998479636414455e-05, "loss": 0.0523, "step": 63120 }, { "epoch": 76.15208207604104, "grad_norm": 4.657830715179443, "learning_rate": 1.999847939515966e-05, "loss": 0.0514, "step": 63130 }, { "epoch": 76.16415208207604, "grad_norm": 5.35233211517334, "learning_rate": 1.9998479153904868e-05, "loss": 0.0534, "step": 63140 }, { "epoch": 76.17622208811105, "grad_norm": 5.017115592956543, "learning_rate": 1.9998478912650074e-05, "loss": 0.056, "step": 63150 }, { "epoch": 76.18829209414605, "grad_norm": 4.9447150230407715, "learning_rate": 1.999847867139528e-05, "loss": 0.0555, "step": 63160 }, { "epoch": 76.20036210018105, "grad_norm": 5.847463607788086, "learning_rate": 1.9998478430140486e-05, "loss": 0.0554, "step": 63170 }, { "epoch": 76.21243210621606, "grad_norm": 5.102133274078369, "learning_rate": 1.9998478188885692e-05, "loss": 0.0569, "step": 63180 }, { "epoch": 76.22450211225106, "grad_norm": 5.07867431640625, "learning_rate": 1.99984779476309e-05, "loss": 0.0554, "step": 63190 }, { "epoch": 76.23657211828606, "grad_norm": 5.6714959144592285, "learning_rate": 1.9998477706376105e-05, "loss": 0.0554, "step": 63200 }, { "epoch": 76.24864212432107, "grad_norm": 5.565311431884766, "learning_rate": 1.999847746512131e-05, "loss": 0.0562, "step": 63210 }, { "epoch": 76.26071213035607, "grad_norm": 5.1875901222229, "learning_rate": 1.9998477223866517e-05, "loss": 0.0568, "step": 63220 }, { "epoch": 76.27278213639107, "grad_norm": 5.502852439880371, "learning_rate": 1.9998476982611724e-05, "loss": 0.0543, "step": 63230 }, { "epoch": 76.28485214242608, "grad_norm": 5.360246658325195, "learning_rate": 1.999847674135693e-05, "loss": 0.0554, "step": 63240 }, 
{ "epoch": 76.29692214846108, "grad_norm": 5.181596755981445, "learning_rate": 1.9998476500102136e-05, "loss": 0.0574, "step": 63250 }, { "epoch": 76.30899215449608, "grad_norm": 5.4819464683532715, "learning_rate": 1.9998476258847342e-05, "loss": 0.0595, "step": 63260 }, { "epoch": 76.32106216053108, "grad_norm": 5.524516582489014, "learning_rate": 1.999847601759255e-05, "loss": 0.0596, "step": 63270 }, { "epoch": 76.33313216656609, "grad_norm": 5.653387546539307, "learning_rate": 1.9998475776337755e-05, "loss": 0.0598, "step": 63280 }, { "epoch": 76.34520217260109, "grad_norm": 5.477128505706787, "learning_rate": 1.999847553508296e-05, "loss": 0.0588, "step": 63290 }, { "epoch": 76.3572721786361, "grad_norm": 5.934033393859863, "learning_rate": 1.9998475293828167e-05, "loss": 0.0593, "step": 63300 }, { "epoch": 76.3693421846711, "grad_norm": 6.276216506958008, "learning_rate": 1.9998475052573373e-05, "loss": 0.0599, "step": 63310 }, { "epoch": 76.3814121907061, "grad_norm": 5.47567081451416, "learning_rate": 1.999847481131858e-05, "loss": 0.0596, "step": 63320 }, { "epoch": 76.3934821967411, "grad_norm": 5.063637733459473, "learning_rate": 1.9998474570063782e-05, "loss": 0.06, "step": 63330 }, { "epoch": 76.40555220277611, "grad_norm": 5.47075891494751, "learning_rate": 1.999847432880899e-05, "loss": 0.0598, "step": 63340 }, { "epoch": 76.41762220881111, "grad_norm": 4.6800713539123535, "learning_rate": 1.9998474087554195e-05, "loss": 0.057, "step": 63350 }, { "epoch": 76.42969221484611, "grad_norm": 5.280684471130371, "learning_rate": 1.99984738462994e-05, "loss": 0.0554, "step": 63360 }, { "epoch": 76.44176222088112, "grad_norm": 5.313966274261475, "learning_rate": 1.9998473605044607e-05, "loss": 0.0572, "step": 63370 }, { "epoch": 76.45383222691612, "grad_norm": 5.854083061218262, "learning_rate": 1.9998473363789813e-05, "loss": 0.0606, "step": 63380 }, { "epoch": 76.46590223295112, "grad_norm": 5.18618106842041, "learning_rate": 1.999847312253502e-05, "loss": 
0.0585, "step": 63390 }, { "epoch": 76.47797223898613, "grad_norm": 5.319967746734619, "learning_rate": 1.9998472881280226e-05, "loss": 0.0616, "step": 63400 }, { "epoch": 76.49004224502113, "grad_norm": 5.325498580932617, "learning_rate": 1.9998472640025432e-05, "loss": 0.0581, "step": 63410 }, { "epoch": 76.50211225105613, "grad_norm": 5.652667045593262, "learning_rate": 1.9998472398770638e-05, "loss": 0.0573, "step": 63420 }, { "epoch": 76.51418225709112, "grad_norm": 5.585402488708496, "learning_rate": 1.9998472157515844e-05, "loss": 0.0585, "step": 63430 }, { "epoch": 76.52625226312612, "grad_norm": 6.4108123779296875, "learning_rate": 1.999847191626105e-05, "loss": 0.0617, "step": 63440 }, { "epoch": 76.53832226916113, "grad_norm": 6.052316188812256, "learning_rate": 1.9998471675006257e-05, "loss": 0.061, "step": 63450 }, { "epoch": 76.55039227519613, "grad_norm": 5.615561008453369, "learning_rate": 1.9998471433751463e-05, "loss": 0.0616, "step": 63460 }, { "epoch": 76.56246228123113, "grad_norm": 6.033749103546143, "learning_rate": 1.999847119249667e-05, "loss": 0.0623, "step": 63470 }, { "epoch": 76.57453228726614, "grad_norm": 5.4463582038879395, "learning_rate": 1.9998470951241876e-05, "loss": 0.0624, "step": 63480 }, { "epoch": 76.58660229330114, "grad_norm": 6.058542251586914, "learning_rate": 1.9998470709987082e-05, "loss": 0.0609, "step": 63490 }, { "epoch": 76.59867229933614, "grad_norm": 5.732529163360596, "learning_rate": 1.9998470468732288e-05, "loss": 0.0608, "step": 63500 }, { "epoch": 76.59867229933614, "eval_loss": 12.461484909057617, "eval_runtime": 8.13, "eval_samples_per_second": 85.732, "eval_steps_per_second": 10.824, "step": 63500 }, { "epoch": 76.61074230537115, "grad_norm": 6.259228229522705, "learning_rate": 1.9998470227477494e-05, "loss": 0.0597, "step": 63510 }, { "epoch": 76.62281231140615, "grad_norm": 5.232296943664551, "learning_rate": 1.99984699862227e-05, "loss": 0.0612, "step": 63520 }, { "epoch": 76.63488231744115, 
"grad_norm": 5.5050249099731445, "learning_rate": 1.9998469744967907e-05, "loss": 0.0631, "step": 63530 }, { "epoch": 76.64695232347616, "grad_norm": 5.656070232391357, "learning_rate": 1.9998469503713113e-05, "loss": 0.062, "step": 63540 }, { "epoch": 76.65902232951116, "grad_norm": 5.857895851135254, "learning_rate": 1.999846926245832e-05, "loss": 0.0619, "step": 63550 }, { "epoch": 76.67109233554616, "grad_norm": 5.389161109924316, "learning_rate": 1.9998469021203525e-05, "loss": 0.0604, "step": 63560 }, { "epoch": 76.68316234158117, "grad_norm": 6.11627197265625, "learning_rate": 1.999846877994873e-05, "loss": 0.0618, "step": 63570 }, { "epoch": 76.69523234761617, "grad_norm": 5.834686756134033, "learning_rate": 1.9998468538693938e-05, "loss": 0.0633, "step": 63580 }, { "epoch": 76.70730235365117, "grad_norm": 6.3322882652282715, "learning_rate": 1.9998468297439144e-05, "loss": 0.062, "step": 63590 }, { "epoch": 76.71937235968618, "grad_norm": 5.742048740386963, "learning_rate": 1.999846805618435e-05, "loss": 0.0603, "step": 63600 }, { "epoch": 76.73144236572118, "grad_norm": 5.650335788726807, "learning_rate": 1.9998467814929556e-05, "loss": 0.0637, "step": 63610 }, { "epoch": 76.74351237175618, "grad_norm": 5.493095874786377, "learning_rate": 1.9998467573674763e-05, "loss": 0.0629, "step": 63620 }, { "epoch": 76.75558237779119, "grad_norm": 5.7816691398620605, "learning_rate": 1.999846733241997e-05, "loss": 0.0622, "step": 63630 }, { "epoch": 76.76765238382619, "grad_norm": 5.969866752624512, "learning_rate": 1.9998467091165175e-05, "loss": 0.0608, "step": 63640 }, { "epoch": 76.77972238986119, "grad_norm": 5.685057640075684, "learning_rate": 1.999846684991038e-05, "loss": 0.0622, "step": 63650 }, { "epoch": 76.7917923958962, "grad_norm": 5.873179912567139, "learning_rate": 1.9998466608655587e-05, "loss": 0.0634, "step": 63660 }, { "epoch": 76.8038624019312, "grad_norm": 5.754227161407471, "learning_rate": 1.9998466367400794e-05, "loss": 0.0664, "step": 63670 
}, { "epoch": 76.8159324079662, "grad_norm": 5.715836048126221, "learning_rate": 1.9998466126146e-05, "loss": 0.0648, "step": 63680 }, { "epoch": 76.8280024140012, "grad_norm": 5.878872394561768, "learning_rate": 1.9998465884891206e-05, "loss": 0.0643, "step": 63690 }, { "epoch": 76.84007242003621, "grad_norm": 5.840483665466309, "learning_rate": 1.9998465643636412e-05, "loss": 0.0634, "step": 63700 }, { "epoch": 76.85214242607121, "grad_norm": 5.259753704071045, "learning_rate": 1.999846540238162e-05, "loss": 0.0626, "step": 63710 }, { "epoch": 76.86421243210621, "grad_norm": 6.214850902557373, "learning_rate": 1.9998465161126825e-05, "loss": 0.0613, "step": 63720 }, { "epoch": 76.87628243814122, "grad_norm": 5.846579551696777, "learning_rate": 1.999846491987203e-05, "loss": 0.0637, "step": 63730 }, { "epoch": 76.88835244417622, "grad_norm": 6.07318639755249, "learning_rate": 1.9998464678617234e-05, "loss": 0.0636, "step": 63740 }, { "epoch": 76.90042245021122, "grad_norm": 6.395415782928467, "learning_rate": 1.999846443736244e-05, "loss": 0.0643, "step": 63750 }, { "epoch": 76.91249245624623, "grad_norm": 5.1373291015625, "learning_rate": 1.9998464196107646e-05, "loss": 0.064, "step": 63760 }, { "epoch": 76.92456246228123, "grad_norm": 5.868201732635498, "learning_rate": 1.9998463954852852e-05, "loss": 0.0645, "step": 63770 }, { "epoch": 76.93663246831623, "grad_norm": 5.972146034240723, "learning_rate": 1.999846371359806e-05, "loss": 0.066, "step": 63780 }, { "epoch": 76.94870247435124, "grad_norm": 5.442625999450684, "learning_rate": 1.9998463472343265e-05, "loss": 0.0648, "step": 63790 }, { "epoch": 76.96077248038624, "grad_norm": 5.578658103942871, "learning_rate": 1.999846323108847e-05, "loss": 0.0648, "step": 63800 }, { "epoch": 76.97284248642124, "grad_norm": 5.17810583114624, "learning_rate": 1.9998462989833677e-05, "loss": 0.0633, "step": 63810 }, { "epoch": 76.98491249245625, "grad_norm": 6.109158992767334, "learning_rate": 1.9998462748578884e-05, 
"loss": 0.0672, "step": 63820 }, { "epoch": 76.99698249849125, "grad_norm": 5.800846576690674, "learning_rate": 1.999846250732409e-05, "loss": 0.0636, "step": 63830 }, { "epoch": 77.0084490042245, "grad_norm": 4.931407451629639, "learning_rate": 1.9998462266069296e-05, "loss": 0.0522, "step": 63840 }, { "epoch": 77.0205190102595, "grad_norm": 4.413958549499512, "learning_rate": 1.9998462024814502e-05, "loss": 0.0446, "step": 63850 }, { "epoch": 77.03258901629451, "grad_norm": 4.51100492477417, "learning_rate": 1.999846178355971e-05, "loss": 0.0471, "step": 63860 }, { "epoch": 77.04465902232951, "grad_norm": 4.9182209968566895, "learning_rate": 1.9998461542304915e-05, "loss": 0.0477, "step": 63870 }, { "epoch": 77.05672902836451, "grad_norm": 4.789633750915527, "learning_rate": 1.999846130105012e-05, "loss": 0.0496, "step": 63880 }, { "epoch": 77.06879903439952, "grad_norm": 5.080144882202148, "learning_rate": 1.9998461059795327e-05, "loss": 0.0479, "step": 63890 }, { "epoch": 77.08086904043452, "grad_norm": 5.121945858001709, "learning_rate": 1.9998460818540533e-05, "loss": 0.0509, "step": 63900 }, { "epoch": 77.09293904646952, "grad_norm": 5.309513092041016, "learning_rate": 1.999846057728574e-05, "loss": 0.0512, "step": 63910 }, { "epoch": 77.10500905250453, "grad_norm": 5.357646942138672, "learning_rate": 1.9998460336030946e-05, "loss": 0.0528, "step": 63920 }, { "epoch": 77.11707905853953, "grad_norm": 5.360584259033203, "learning_rate": 1.9998460094776152e-05, "loss": 0.0541, "step": 63930 }, { "epoch": 77.12914906457453, "grad_norm": 4.565386772155762, "learning_rate": 1.9998459853521358e-05, "loss": 0.0536, "step": 63940 }, { "epoch": 77.14121907060954, "grad_norm": 4.733980178833008, "learning_rate": 1.9998459612266564e-05, "loss": 0.0509, "step": 63950 }, { "epoch": 77.15328907664454, "grad_norm": 5.246973514556885, "learning_rate": 1.999845937101177e-05, "loss": 0.051, "step": 63960 }, { "epoch": 77.16535908267954, "grad_norm": 5.716894149780273, 
"learning_rate": 1.9998459129756977e-05, "loss": 0.0538, "step": 63970 }, { "epoch": 77.17742908871455, "grad_norm": 5.474188327789307, "learning_rate": 1.9998458888502183e-05, "loss": 0.0539, "step": 63980 }, { "epoch": 77.18949909474955, "grad_norm": 5.08695125579834, "learning_rate": 1.9998458647247386e-05, "loss": 0.0539, "step": 63990 }, { "epoch": 77.20156910078455, "grad_norm": 4.764158725738525, "learning_rate": 1.9998458405992592e-05, "loss": 0.0533, "step": 64000 }, { "epoch": 77.20156910078455, "eval_loss": 12.48377799987793, "eval_runtime": 8.1347, "eval_samples_per_second": 85.683, "eval_steps_per_second": 10.818, "step": 64000 }, { "epoch": 77.21363910681956, "grad_norm": 4.866467475891113, "learning_rate": 1.9998458164737798e-05, "loss": 0.052, "step": 64010 }, { "epoch": 77.22570911285456, "grad_norm": 4.946343898773193, "learning_rate": 1.9998457923483004e-05, "loss": 0.0528, "step": 64020 }, { "epoch": 77.23777911888956, "grad_norm": 5.098020076751709, "learning_rate": 1.999845768222821e-05, "loss": 0.0531, "step": 64030 }, { "epoch": 77.24984912492457, "grad_norm": 5.613745212554932, "learning_rate": 1.9998457440973417e-05, "loss": 0.0546, "step": 64040 }, { "epoch": 77.26191913095957, "grad_norm": 5.200978755950928, "learning_rate": 1.9998457199718623e-05, "loss": 0.0547, "step": 64050 }, { "epoch": 77.27398913699457, "grad_norm": 5.585946083068848, "learning_rate": 1.999845695846383e-05, "loss": 0.0562, "step": 64060 }, { "epoch": 77.28605914302958, "grad_norm": 5.541748046875, "learning_rate": 1.9998456717209036e-05, "loss": 0.0554, "step": 64070 }, { "epoch": 77.29812914906458, "grad_norm": 5.639867305755615, "learning_rate": 1.9998456475954242e-05, "loss": 0.0566, "step": 64080 }, { "epoch": 77.31019915509958, "grad_norm": 5.346409320831299, "learning_rate": 1.9998456234699448e-05, "loss": 0.0572, "step": 64090 }, { "epoch": 77.32226916113459, "grad_norm": 5.3336920738220215, "learning_rate": 1.9998455993444654e-05, "loss": 0.0557, "step": 
64100 }, { "epoch": 77.33433916716959, "grad_norm": 5.602293491363525, "learning_rate": 1.999845575218986e-05, "loss": 0.056, "step": 64110 }, { "epoch": 77.34640917320459, "grad_norm": 5.837201118469238, "learning_rate": 1.999845551093507e-05, "loss": 0.0595, "step": 64120 }, { "epoch": 77.3584791792396, "grad_norm": 5.221038818359375, "learning_rate": 1.9998455269680276e-05, "loss": 0.0559, "step": 64130 }, { "epoch": 77.3705491852746, "grad_norm": 5.152856349945068, "learning_rate": 1.9998455028425482e-05, "loss": 0.0573, "step": 64140 }, { "epoch": 77.3826191913096, "grad_norm": 5.491325855255127, "learning_rate": 1.9998454787170685e-05, "loss": 0.0579, "step": 64150 }, { "epoch": 77.3946891973446, "grad_norm": 5.602877140045166, "learning_rate": 1.999845454591589e-05, "loss": 0.0579, "step": 64160 }, { "epoch": 77.40675920337961, "grad_norm": 5.360177993774414, "learning_rate": 1.9998454304661098e-05, "loss": 0.0573, "step": 64170 }, { "epoch": 77.41882920941461, "grad_norm": 6.303408145904541, "learning_rate": 1.9998454063406304e-05, "loss": 0.0579, "step": 64180 }, { "epoch": 77.43089921544961, "grad_norm": 5.864508152008057, "learning_rate": 1.999845382215151e-05, "loss": 0.0591, "step": 64190 }, { "epoch": 77.44296922148462, "grad_norm": 5.4301557540893555, "learning_rate": 1.9998453580896716e-05, "loss": 0.0591, "step": 64200 }, { "epoch": 77.45503922751962, "grad_norm": 4.856897830963135, "learning_rate": 1.9998453339641923e-05, "loss": 0.0611, "step": 64210 }, { "epoch": 77.46710923355462, "grad_norm": 5.3994832038879395, "learning_rate": 1.999845309838713e-05, "loss": 0.0557, "step": 64220 }, { "epoch": 77.47917923958963, "grad_norm": 5.378882884979248, "learning_rate": 1.9998452857132335e-05, "loss": 0.0563, "step": 64230 }, { "epoch": 77.49124924562463, "grad_norm": 5.455469131469727, "learning_rate": 1.999845261587754e-05, "loss": 0.0598, "step": 64240 }, { "epoch": 77.50331925165963, "grad_norm": 5.419473171234131, "learning_rate": 
1.9998452374622747e-05, "loss": 0.0605, "step": 64250 }, { "epoch": 77.51538925769462, "grad_norm": 5.60235071182251, "learning_rate": 1.9998452133367954e-05, "loss": 0.0573, "step": 64260 }, { "epoch": 77.52745926372963, "grad_norm": 5.370262145996094, "learning_rate": 1.999845189211316e-05, "loss": 0.06, "step": 64270 }, { "epoch": 77.53952926976463, "grad_norm": 5.429708957672119, "learning_rate": 1.9998451650858366e-05, "loss": 0.0626, "step": 64280 }, { "epoch": 77.55159927579963, "grad_norm": 5.666009426116943, "learning_rate": 1.9998451409603572e-05, "loss": 0.0595, "step": 64290 }, { "epoch": 77.56366928183463, "grad_norm": 5.6368818283081055, "learning_rate": 1.999845116834878e-05, "loss": 0.0619, "step": 64300 }, { "epoch": 77.57573928786964, "grad_norm": 5.319553375244141, "learning_rate": 1.9998450927093985e-05, "loss": 0.0598, "step": 64310 }, { "epoch": 77.58780929390464, "grad_norm": 5.387925148010254, "learning_rate": 1.999845068583919e-05, "loss": 0.0602, "step": 64320 }, { "epoch": 77.59987929993964, "grad_norm": 4.8351287841796875, "learning_rate": 1.9998450444584397e-05, "loss": 0.0585, "step": 64330 }, { "epoch": 77.61194930597465, "grad_norm": 5.466771602630615, "learning_rate": 1.9998450203329603e-05, "loss": 0.0585, "step": 64340 }, { "epoch": 77.62401931200965, "grad_norm": 5.1189961433410645, "learning_rate": 1.999844996207481e-05, "loss": 0.061, "step": 64350 }, { "epoch": 77.63608931804465, "grad_norm": 6.095648765563965, "learning_rate": 1.9998449720820016e-05, "loss": 0.0613, "step": 64360 }, { "epoch": 77.64815932407966, "grad_norm": 5.221724510192871, "learning_rate": 1.9998449479565222e-05, "loss": 0.0604, "step": 64370 }, { "epoch": 77.66022933011466, "grad_norm": 5.823817253112793, "learning_rate": 1.9998449238310428e-05, "loss": 0.0637, "step": 64380 }, { "epoch": 77.67229933614966, "grad_norm": 5.2991251945495605, "learning_rate": 1.9998448997055634e-05, "loss": 0.0615, "step": 64390 }, { "epoch": 77.68436934218467, "grad_norm": 
5.282534599304199, "learning_rate": 1.9998448755800837e-05, "loss": 0.0637, "step": 64400 }, { "epoch": 77.69643934821967, "grad_norm": 5.7083024978637695, "learning_rate": 1.9998448514546043e-05, "loss": 0.0602, "step": 64410 }, { "epoch": 77.70850935425467, "grad_norm": 5.811385631561279, "learning_rate": 1.999844827329125e-05, "loss": 0.0624, "step": 64420 }, { "epoch": 77.72057936028968, "grad_norm": 5.440217971801758, "learning_rate": 1.9998448032036456e-05, "loss": 0.0605, "step": 64430 }, { "epoch": 77.73264936632468, "grad_norm": 5.61631965637207, "learning_rate": 1.9998447790781662e-05, "loss": 0.0577, "step": 64440 }, { "epoch": 77.74471937235968, "grad_norm": 5.5717620849609375, "learning_rate": 1.9998447549526868e-05, "loss": 0.0627, "step": 64450 }, { "epoch": 77.75678937839469, "grad_norm": 5.8534417152404785, "learning_rate": 1.9998447308272075e-05, "loss": 0.0615, "step": 64460 }, { "epoch": 77.76885938442969, "grad_norm": 5.646073818206787, "learning_rate": 1.999844706701728e-05, "loss": 0.0663, "step": 64470 }, { "epoch": 77.78092939046469, "grad_norm": 5.907729625701904, "learning_rate": 1.9998446825762487e-05, "loss": 0.067, "step": 64480 }, { "epoch": 77.7929993964997, "grad_norm": 5.363497257232666, "learning_rate": 1.9998446584507693e-05, "loss": 0.0615, "step": 64490 }, { "epoch": 77.8050694025347, "grad_norm": 5.576620578765869, "learning_rate": 1.99984463432529e-05, "loss": 0.0602, "step": 64500 }, { "epoch": 77.8050694025347, "eval_loss": 12.490825653076172, "eval_runtime": 8.13, "eval_samples_per_second": 85.732, "eval_steps_per_second": 10.824, "step": 64500 }, { "epoch": 77.8171394085697, "grad_norm": 5.588395595550537, "learning_rate": 1.9998446101998106e-05, "loss": 0.0647, "step": 64510 }, { "epoch": 77.8292094146047, "grad_norm": 5.590855598449707, "learning_rate": 1.9998445860743312e-05, "loss": 0.0618, "step": 64520 }, { "epoch": 77.84127942063971, "grad_norm": 5.602756977081299, "learning_rate": 1.9998445619488518e-05, "loss": 
0.0645, "step": 64530 }, { "epoch": 77.85334942667471, "grad_norm": 6.022148132324219, "learning_rate": 1.9998445378233724e-05, "loss": 0.0651, "step": 64540 }, { "epoch": 77.86541943270971, "grad_norm": 5.536129951477051, "learning_rate": 1.999844513697893e-05, "loss": 0.0655, "step": 64550 }, { "epoch": 77.87748943874472, "grad_norm": 5.767073154449463, "learning_rate": 1.9998444895724137e-05, "loss": 0.0622, "step": 64560 }, { "epoch": 77.88955944477972, "grad_norm": 5.718991279602051, "learning_rate": 1.9998444654469343e-05, "loss": 0.0629, "step": 64570 }, { "epoch": 77.90162945081472, "grad_norm": 5.782289028167725, "learning_rate": 1.999844441321455e-05, "loss": 0.065, "step": 64580 }, { "epoch": 77.91369945684973, "grad_norm": 5.876413822174072, "learning_rate": 1.9998444171959755e-05, "loss": 0.0633, "step": 64590 }, { "epoch": 77.92576946288473, "grad_norm": 5.439946174621582, "learning_rate": 1.999844393070496e-05, "loss": 0.0627, "step": 64600 }, { "epoch": 77.93783946891973, "grad_norm": 5.682674407958984, "learning_rate": 1.9998443689450168e-05, "loss": 0.065, "step": 64610 }, { "epoch": 77.94990947495474, "grad_norm": 6.145557403564453, "learning_rate": 1.9998443448195374e-05, "loss": 0.0666, "step": 64620 }, { "epoch": 77.96197948098974, "grad_norm": 5.383054733276367, "learning_rate": 1.999844320694058e-05, "loss": 0.0622, "step": 64630 }, { "epoch": 77.97404948702474, "grad_norm": 5.940196990966797, "learning_rate": 1.9998442965685786e-05, "loss": 0.064, "step": 64640 }, { "epoch": 77.98611949305975, "grad_norm": 5.4963555335998535, "learning_rate": 1.999844272443099e-05, "loss": 0.0625, "step": 64650 }, { "epoch": 77.99818949909475, "grad_norm": 4.854587554931641, "learning_rate": 1.99984424831762e-05, "loss": 0.0639, "step": 64660 }, { "epoch": 78.009656004828, "grad_norm": 4.526431560516357, "learning_rate": 1.9998442241921405e-05, "loss": 0.0446, "step": 64670 }, { "epoch": 78.021726010863, "grad_norm": 5.136353492736816, "learning_rate": 
1.999844200066661e-05, "loss": 0.0457, "step": 64680 }, { "epoch": 78.03379601689801, "grad_norm": 4.78432559967041, "learning_rate": 1.9998441759411817e-05, "loss": 0.0464, "step": 64690 }, { "epoch": 78.04586602293301, "grad_norm": 4.679934024810791, "learning_rate": 1.9998441518157024e-05, "loss": 0.0471, "step": 64700 }, { "epoch": 78.05793602896802, "grad_norm": 5.156429290771484, "learning_rate": 1.999844127690223e-05, "loss": 0.0492, "step": 64710 }, { "epoch": 78.07000603500302, "grad_norm": 5.106393337249756, "learning_rate": 1.9998441035647436e-05, "loss": 0.0501, "step": 64720 }, { "epoch": 78.08207604103802, "grad_norm": 4.95729398727417, "learning_rate": 1.9998440794392642e-05, "loss": 0.0479, "step": 64730 }, { "epoch": 78.09414604707302, "grad_norm": 5.445418834686279, "learning_rate": 1.999844055313785e-05, "loss": 0.0522, "step": 64740 }, { "epoch": 78.10621605310803, "grad_norm": 4.881408214569092, "learning_rate": 1.9998440311883055e-05, "loss": 0.0533, "step": 64750 }, { "epoch": 78.11828605914303, "grad_norm": 5.203004837036133, "learning_rate": 1.999844007062826e-05, "loss": 0.0515, "step": 64760 }, { "epoch": 78.13035606517803, "grad_norm": 4.858396053314209, "learning_rate": 1.9998439829373467e-05, "loss": 0.0486, "step": 64770 }, { "epoch": 78.14242607121304, "grad_norm": 4.596604347229004, "learning_rate": 1.9998439588118673e-05, "loss": 0.0502, "step": 64780 }, { "epoch": 78.15449607724804, "grad_norm": 5.13625955581665, "learning_rate": 1.999843934686388e-05, "loss": 0.0516, "step": 64790 }, { "epoch": 78.16656608328304, "grad_norm": 5.610825061798096, "learning_rate": 1.9998439105609086e-05, "loss": 0.0526, "step": 64800 }, { "epoch": 78.17863608931805, "grad_norm": 5.326855659484863, "learning_rate": 1.9998438864354292e-05, "loss": 0.0529, "step": 64810 }, { "epoch": 78.19070609535305, "grad_norm": 5.608736515045166, "learning_rate": 1.9998438623099495e-05, "loss": 0.0544, "step": 64820 }, { "epoch": 78.20277610138805, "grad_norm": 
5.150036811828613, "learning_rate": 1.99984383818447e-05, "loss": 0.0543, "step": 64830 }, { "epoch": 78.21484610742306, "grad_norm": 5.08691930770874, "learning_rate": 1.9998438140589907e-05, "loss": 0.0527, "step": 64840 }, { "epoch": 78.22691611345806, "grad_norm": 5.3423871994018555, "learning_rate": 1.9998437899335114e-05, "loss": 0.0554, "step": 64850 }, { "epoch": 78.23898611949306, "grad_norm": 4.812296390533447, "learning_rate": 1.999843765808032e-05, "loss": 0.055, "step": 64860 }, { "epoch": 78.25105612552807, "grad_norm": 5.496179580688477, "learning_rate": 1.9998437416825526e-05, "loss": 0.0569, "step": 64870 }, { "epoch": 78.26312613156307, "grad_norm": 4.8187255859375, "learning_rate": 1.9998437175570732e-05, "loss": 0.0581, "step": 64880 }, { "epoch": 78.27519613759807, "grad_norm": 5.213245391845703, "learning_rate": 1.999843693431594e-05, "loss": 0.0554, "step": 64890 }, { "epoch": 78.28726614363308, "grad_norm": 5.751908779144287, "learning_rate": 1.9998436693061145e-05, "loss": 0.0583, "step": 64900 }, { "epoch": 78.29933614966808, "grad_norm": 5.719311714172363, "learning_rate": 1.999843645180635e-05, "loss": 0.055, "step": 64910 }, { "epoch": 78.31140615570308, "grad_norm": 5.290453910827637, "learning_rate": 1.9998436210551557e-05, "loss": 0.0562, "step": 64920 }, { "epoch": 78.32347616173809, "grad_norm": 5.349386692047119, "learning_rate": 1.9998435969296763e-05, "loss": 0.0558, "step": 64930 }, { "epoch": 78.33554616777309, "grad_norm": 5.241926670074463, "learning_rate": 1.999843572804197e-05, "loss": 0.0543, "step": 64940 }, { "epoch": 78.34761617380809, "grad_norm": 4.66292142868042, "learning_rate": 1.9998435486787176e-05, "loss": 0.0559, "step": 64950 }, { "epoch": 78.3596861798431, "grad_norm": 4.961451530456543, "learning_rate": 1.9998435245532382e-05, "loss": 0.0562, "step": 64960 }, { "epoch": 78.3717561858781, "grad_norm": 5.413451194763184, "learning_rate": 1.9998435004277588e-05, "loss": 0.0578, "step": 64970 }, { "epoch": 
78.3838261919131, "grad_norm": 5.567990303039551, "learning_rate": 1.9998434763022794e-05, "loss": 0.0548, "step": 64980 }, { "epoch": 78.3958961979481, "grad_norm": 5.493157386779785, "learning_rate": 1.9998434521768e-05, "loss": 0.0543, "step": 64990 }, { "epoch": 78.40796620398311, "grad_norm": 5.27540397644043, "learning_rate": 1.9998434280513207e-05, "loss": 0.0585, "step": 65000 }, { "epoch": 78.40796620398311, "eval_loss": 12.484951972961426, "eval_runtime": 8.1324, "eval_samples_per_second": 85.707, "eval_steps_per_second": 10.821, "step": 65000 }, { "epoch": 78.42003621001811, "grad_norm": 5.188620567321777, "learning_rate": 1.9998434039258413e-05, "loss": 0.0579, "step": 65010 }, { "epoch": 78.43210621605311, "grad_norm": 5.603634357452393, "learning_rate": 1.999843379800362e-05, "loss": 0.0581, "step": 65020 }, { "epoch": 78.44417622208812, "grad_norm": 5.277677536010742, "learning_rate": 1.9998433556748825e-05, "loss": 0.0588, "step": 65030 }, { "epoch": 78.45624622812312, "grad_norm": 4.789653778076172, "learning_rate": 1.999843331549403e-05, "loss": 0.0552, "step": 65040 }, { "epoch": 78.46831623415812, "grad_norm": 5.452110290527344, "learning_rate": 1.9998433074239238e-05, "loss": 0.0594, "step": 65050 }, { "epoch": 78.48038624019313, "grad_norm": 5.1607890129089355, "learning_rate": 1.9998432832984444e-05, "loss": 0.0576, "step": 65060 }, { "epoch": 78.49245624622813, "grad_norm": 5.966728210449219, "learning_rate": 1.9998432591729647e-05, "loss": 0.0589, "step": 65070 }, { "epoch": 78.50452625226312, "grad_norm": 5.183290004730225, "learning_rate": 1.9998432350474853e-05, "loss": 0.0588, "step": 65080 }, { "epoch": 78.51659625829812, "grad_norm": 5.426821708679199, "learning_rate": 1.999843210922006e-05, "loss": 0.0577, "step": 65090 }, { "epoch": 78.52866626433313, "grad_norm": 5.6684041023254395, "learning_rate": 1.9998431867965266e-05, "loss": 0.0589, "step": 65100 }, { "epoch": 78.54073627036813, "grad_norm": 5.310176372528076, 
"learning_rate": 1.9998431626710472e-05, "loss": 0.0598, "step": 65110 }, { "epoch": 78.55280627640313, "grad_norm": 5.644728660583496, "learning_rate": 1.9998431385455678e-05, "loss": 0.058, "step": 65120 }, { "epoch": 78.56487628243814, "grad_norm": 4.8958234786987305, "learning_rate": 1.9998431144200884e-05, "loss": 0.0587, "step": 65130 }, { "epoch": 78.57694628847314, "grad_norm": 5.623547554016113, "learning_rate": 1.999843090294609e-05, "loss": 0.0606, "step": 65140 }, { "epoch": 78.58901629450814, "grad_norm": 5.40303373336792, "learning_rate": 1.9998430661691297e-05, "loss": 0.0599, "step": 65150 }, { "epoch": 78.60108630054314, "grad_norm": 5.6580047607421875, "learning_rate": 1.9998430420436503e-05, "loss": 0.0604, "step": 65160 }, { "epoch": 78.61315630657815, "grad_norm": 5.382870674133301, "learning_rate": 1.999843017918171e-05, "loss": 0.0597, "step": 65170 }, { "epoch": 78.62522631261315, "grad_norm": 5.625543117523193, "learning_rate": 1.9998429937926915e-05, "loss": 0.0594, "step": 65180 }, { "epoch": 78.63729631864815, "grad_norm": 5.874768257141113, "learning_rate": 1.999842969667212e-05, "loss": 0.0602, "step": 65190 }, { "epoch": 78.64936632468316, "grad_norm": 6.061230182647705, "learning_rate": 1.999842945541733e-05, "loss": 0.0598, "step": 65200 }, { "epoch": 78.66143633071816, "grad_norm": 5.284142971038818, "learning_rate": 1.9998429214162537e-05, "loss": 0.0609, "step": 65210 }, { "epoch": 78.67350633675316, "grad_norm": 5.076766490936279, "learning_rate": 1.9998428972907743e-05, "loss": 0.0596, "step": 65220 }, { "epoch": 78.68557634278817, "grad_norm": 5.678287506103516, "learning_rate": 1.9998428731652946e-05, "loss": 0.058, "step": 65230 }, { "epoch": 78.69764634882317, "grad_norm": 5.763050079345703, "learning_rate": 1.9998428490398153e-05, "loss": 0.0606, "step": 65240 }, { "epoch": 78.70971635485817, "grad_norm": 5.484189987182617, "learning_rate": 1.999842824914336e-05, "loss": 0.06, "step": 65250 }, { "epoch": 78.72178636089318, 
"grad_norm": 5.797886371612549, "learning_rate": 1.9998428007888565e-05, "loss": 0.0623, "step": 65260 }, { "epoch": 78.73385636692818, "grad_norm": 5.341977119445801, "learning_rate": 1.999842776663377e-05, "loss": 0.0593, "step": 65270 }, { "epoch": 78.74592637296318, "grad_norm": 5.8400397300720215, "learning_rate": 1.9998427525378977e-05, "loss": 0.0592, "step": 65280 }, { "epoch": 78.75799637899819, "grad_norm": 5.290843963623047, "learning_rate": 1.9998427284124184e-05, "loss": 0.0609, "step": 65290 }, { "epoch": 78.77006638503319, "grad_norm": 5.784196376800537, "learning_rate": 1.999842704286939e-05, "loss": 0.0647, "step": 65300 }, { "epoch": 78.78213639106819, "grad_norm": 5.591909885406494, "learning_rate": 1.9998426801614596e-05, "loss": 0.0626, "step": 65310 }, { "epoch": 78.7942063971032, "grad_norm": 6.0426344871521, "learning_rate": 1.9998426560359802e-05, "loss": 0.0634, "step": 65320 }, { "epoch": 78.8062764031382, "grad_norm": 5.6163787841796875, "learning_rate": 1.999842631910501e-05, "loss": 0.0637, "step": 65330 }, { "epoch": 78.8183464091732, "grad_norm": 6.016465187072754, "learning_rate": 1.9998426077850215e-05, "loss": 0.0647, "step": 65340 }, { "epoch": 78.8304164152082, "grad_norm": 5.8284592628479, "learning_rate": 1.999842583659542e-05, "loss": 0.062, "step": 65350 }, { "epoch": 78.84248642124321, "grad_norm": 5.129042148590088, "learning_rate": 1.9998425595340627e-05, "loss": 0.0621, "step": 65360 }, { "epoch": 78.85455642727821, "grad_norm": 5.951170444488525, "learning_rate": 1.9998425354085833e-05, "loss": 0.0652, "step": 65370 }, { "epoch": 78.86662643331321, "grad_norm": 5.908188819885254, "learning_rate": 1.999842511283104e-05, "loss": 0.0597, "step": 65380 }, { "epoch": 78.87869643934822, "grad_norm": 5.55095911026001, "learning_rate": 1.9998424871576246e-05, "loss": 0.0607, "step": 65390 }, { "epoch": 78.89076644538322, "grad_norm": 6.316265106201172, "learning_rate": 1.9998424630321452e-05, "loss": 0.0621, "step": 65400 }, { 
"epoch": 78.90283645141822, "grad_norm": 5.486141681671143, "learning_rate": 1.9998424389066658e-05, "loss": 0.0631, "step": 65410 }, { "epoch": 78.91490645745323, "grad_norm": 6.000148296356201, "learning_rate": 1.9998424147811864e-05, "loss": 0.0627, "step": 65420 }, { "epoch": 78.92697646348823, "grad_norm": 5.541837215423584, "learning_rate": 1.999842390655707e-05, "loss": 0.0625, "step": 65430 }, { "epoch": 78.93904646952323, "grad_norm": 5.508521556854248, "learning_rate": 1.9998423665302277e-05, "loss": 0.0624, "step": 65440 }, { "epoch": 78.95111647555824, "grad_norm": 5.156405448913574, "learning_rate": 1.9998423424047483e-05, "loss": 0.0619, "step": 65450 }, { "epoch": 78.96318648159324, "grad_norm": 5.697445392608643, "learning_rate": 1.999842318279269e-05, "loss": 0.0634, "step": 65460 }, { "epoch": 78.97525648762824, "grad_norm": 5.517644882202148, "learning_rate": 1.9998422941537895e-05, "loss": 0.0641, "step": 65470 }, { "epoch": 78.98732649366325, "grad_norm": 5.176995754241943, "learning_rate": 1.99984227002831e-05, "loss": 0.066, "step": 65480 }, { "epoch": 78.99939649969825, "grad_norm": 6.351353168487549, "learning_rate": 1.9998422459028305e-05, "loss": 0.0657, "step": 65490 }, { "epoch": 79.0108630054315, "grad_norm": 5.048390865325928, "learning_rate": 1.999842221777351e-05, "loss": 0.0446, "step": 65500 }, { "epoch": 79.0108630054315, "eval_loss": 12.48126220703125, "eval_runtime": 8.1453, "eval_samples_per_second": 85.57, "eval_steps_per_second": 10.804, "step": 65500 }, { "epoch": 79.0229330114665, "grad_norm": 4.782122611999512, "learning_rate": 1.9998421976518717e-05, "loss": 0.0479, "step": 65510 }, { "epoch": 79.03500301750151, "grad_norm": 4.660264492034912, "learning_rate": 1.9998421735263923e-05, "loss": 0.0478, "step": 65520 }, { "epoch": 79.04707302353651, "grad_norm": 5.007423400878906, "learning_rate": 1.999842149400913e-05, "loss": 0.0475, "step": 65530 }, { "epoch": 79.05914302957152, "grad_norm": 4.776615619659424, 
"learning_rate": 1.9998421252754336e-05, "loss": 0.0482, "step": 65540 }, { "epoch": 79.07121303560652, "grad_norm": 4.869558334350586, "learning_rate": 1.9998421011499542e-05, "loss": 0.0499, "step": 65550 }, { "epoch": 79.08328304164152, "grad_norm": 4.867159843444824, "learning_rate": 1.9998420770244748e-05, "loss": 0.0471, "step": 65560 }, { "epoch": 79.09535304767653, "grad_norm": 4.904251575469971, "learning_rate": 1.9998420528989954e-05, "loss": 0.0504, "step": 65570 }, { "epoch": 79.10742305371153, "grad_norm": 4.942698955535889, "learning_rate": 1.999842028773516e-05, "loss": 0.0491, "step": 65580 }, { "epoch": 79.11949305974653, "grad_norm": 5.5014824867248535, "learning_rate": 1.9998420046480367e-05, "loss": 0.0484, "step": 65590 }, { "epoch": 79.13156306578153, "grad_norm": 4.908763885498047, "learning_rate": 1.9998419805225573e-05, "loss": 0.0529, "step": 65600 }, { "epoch": 79.14363307181654, "grad_norm": 4.618313312530518, "learning_rate": 1.999841956397078e-05, "loss": 0.0523, "step": 65610 }, { "epoch": 79.15570307785154, "grad_norm": 5.1724348068237305, "learning_rate": 1.9998419322715985e-05, "loss": 0.0496, "step": 65620 }, { "epoch": 79.16777308388654, "grad_norm": 5.08097505569458, "learning_rate": 1.999841908146119e-05, "loss": 0.0505, "step": 65630 }, { "epoch": 79.17984308992155, "grad_norm": 5.071349620819092, "learning_rate": 1.9998418840206398e-05, "loss": 0.0526, "step": 65640 }, { "epoch": 79.19191309595655, "grad_norm": 5.012113571166992, "learning_rate": 1.9998418598951604e-05, "loss": 0.0512, "step": 65650 }, { "epoch": 79.20398310199155, "grad_norm": 5.760970592498779, "learning_rate": 1.999841835769681e-05, "loss": 0.0547, "step": 65660 }, { "epoch": 79.21605310802656, "grad_norm": 4.712629795074463, "learning_rate": 1.9998418116442016e-05, "loss": 0.052, "step": 65670 }, { "epoch": 79.22812311406156, "grad_norm": 4.6568217277526855, "learning_rate": 1.9998417875187223e-05, "loss": 0.0537, "step": 65680 }, { "epoch": 
79.24019312009656, "grad_norm": 5.171385288238525, "learning_rate": 1.999841763393243e-05, "loss": 0.0522, "step": 65690 }, { "epoch": 79.25226312613157, "grad_norm": 5.110752105712891, "learning_rate": 1.9998417392677635e-05, "loss": 0.0552, "step": 65700 }, { "epoch": 79.26433313216657, "grad_norm": 5.151800632476807, "learning_rate": 1.999841715142284e-05, "loss": 0.0523, "step": 65710 }, { "epoch": 79.27640313820157, "grad_norm": 5.336409568786621, "learning_rate": 1.9998416910168047e-05, "loss": 0.0574, "step": 65720 }, { "epoch": 79.28847314423658, "grad_norm": 4.829287528991699, "learning_rate": 1.999841666891325e-05, "loss": 0.0547, "step": 65730 }, { "epoch": 79.30054315027158, "grad_norm": 4.994051456451416, "learning_rate": 1.999841642765846e-05, "loss": 0.054, "step": 65740 }, { "epoch": 79.31261315630658, "grad_norm": 5.609334945678711, "learning_rate": 1.9998416186403666e-05, "loss": 0.0544, "step": 65750 }, { "epoch": 79.32468316234159, "grad_norm": 5.188549518585205, "learning_rate": 1.9998415945148872e-05, "loss": 0.0559, "step": 65760 }, { "epoch": 79.33675316837659, "grad_norm": 5.095184803009033, "learning_rate": 1.999841570389408e-05, "loss": 0.0567, "step": 65770 }, { "epoch": 79.34882317441159, "grad_norm": 4.88671350479126, "learning_rate": 1.9998415462639285e-05, "loss": 0.0536, "step": 65780 }, { "epoch": 79.3608931804466, "grad_norm": 5.289173603057861, "learning_rate": 1.999841522138449e-05, "loss": 0.0558, "step": 65790 }, { "epoch": 79.3729631864816, "grad_norm": 5.165438175201416, "learning_rate": 1.9998414980129697e-05, "loss": 0.0561, "step": 65800 }, { "epoch": 79.3850331925166, "grad_norm": 5.646057605743408, "learning_rate": 1.9998414738874903e-05, "loss": 0.0564, "step": 65810 }, { "epoch": 79.3971031985516, "grad_norm": 5.42555046081543, "learning_rate": 1.999841449762011e-05, "loss": 0.056, "step": 65820 }, { "epoch": 79.40917320458661, "grad_norm": 5.577963352203369, "learning_rate": 1.9998414256365316e-05, "loss": 0.0574, 
"step": 65830 }, { "epoch": 79.42124321062161, "grad_norm": 4.9671101570129395, "learning_rate": 1.9998414015110522e-05, "loss": 0.0563, "step": 65840 }, { "epoch": 79.43331321665661, "grad_norm": 5.558563232421875, "learning_rate": 1.9998413773855728e-05, "loss": 0.0554, "step": 65850 }, { "epoch": 79.44538322269162, "grad_norm": 5.246769428253174, "learning_rate": 1.9998413532600935e-05, "loss": 0.0543, "step": 65860 }, { "epoch": 79.45745322872662, "grad_norm": 5.2807698249816895, "learning_rate": 1.999841329134614e-05, "loss": 0.059, "step": 65870 }, { "epoch": 79.46952323476162, "grad_norm": 5.344820499420166, "learning_rate": 1.9998413050091347e-05, "loss": 0.0578, "step": 65880 }, { "epoch": 79.48159324079663, "grad_norm": 5.91482400894165, "learning_rate": 1.999841280883655e-05, "loss": 0.0582, "step": 65890 }, { "epoch": 79.49366324683163, "grad_norm": 5.498391628265381, "learning_rate": 1.9998412567581756e-05, "loss": 0.0553, "step": 65900 }, { "epoch": 79.50573325286662, "grad_norm": 5.939740180969238, "learning_rate": 1.9998412326326962e-05, "loss": 0.0568, "step": 65910 }, { "epoch": 79.51780325890162, "grad_norm": 4.93056058883667, "learning_rate": 1.999841208507217e-05, "loss": 0.0606, "step": 65920 }, { "epoch": 79.52987326493663, "grad_norm": 5.548977851867676, "learning_rate": 1.9998411843817375e-05, "loss": 0.0567, "step": 65930 }, { "epoch": 79.54194327097163, "grad_norm": 6.129636764526367, "learning_rate": 1.999841160256258e-05, "loss": 0.0619, "step": 65940 }, { "epoch": 79.55401327700663, "grad_norm": 5.664607524871826, "learning_rate": 1.9998411361307787e-05, "loss": 0.0591, "step": 65950 }, { "epoch": 79.56608328304164, "grad_norm": 5.385622978210449, "learning_rate": 1.9998411120052993e-05, "loss": 0.0608, "step": 65960 }, { "epoch": 79.57815328907664, "grad_norm": 5.173802852630615, "learning_rate": 1.99984108787982e-05, "loss": 0.0585, "step": 65970 }, { "epoch": 79.59022329511164, "grad_norm": 5.57874870300293, "learning_rate": 
1.9998410637543406e-05, "loss": 0.0596, "step": 65980 }, { "epoch": 79.60229330114664, "grad_norm": 5.8324689865112305, "learning_rate": 1.9998410396288612e-05, "loss": 0.0601, "step": 65990 }, { "epoch": 79.61436330718165, "grad_norm": 5.916537761688232, "learning_rate": 1.9998410155033818e-05, "loss": 0.0612, "step": 66000 }, { "epoch": 79.61436330718165, "eval_loss": 12.505671501159668, "eval_runtime": 8.1262, "eval_samples_per_second": 85.772, "eval_steps_per_second": 10.829, "step": 66000 }, { "epoch": 79.62643331321665, "grad_norm": 6.099052906036377, "learning_rate": 1.9998409913779024e-05, "loss": 0.0636, "step": 66010 }, { "epoch": 79.63850331925165, "grad_norm": 5.249346733093262, "learning_rate": 1.999840967252423e-05, "loss": 0.0577, "step": 66020 }, { "epoch": 79.65057332528666, "grad_norm": 5.169116497039795, "learning_rate": 1.9998409431269437e-05, "loss": 0.0596, "step": 66030 }, { "epoch": 79.66264333132166, "grad_norm": 4.995204448699951, "learning_rate": 1.9998409190014643e-05, "loss": 0.0612, "step": 66040 }, { "epoch": 79.67471333735666, "grad_norm": 5.920412540435791, "learning_rate": 1.999840894875985e-05, "loss": 0.0591, "step": 66050 }, { "epoch": 79.68678334339167, "grad_norm": 4.836869239807129, "learning_rate": 1.9998408707505055e-05, "loss": 0.0606, "step": 66060 }, { "epoch": 79.69885334942667, "grad_norm": 5.226631164550781, "learning_rate": 1.999840846625026e-05, "loss": 0.058, "step": 66070 }, { "epoch": 79.71092335546167, "grad_norm": 5.755155086517334, "learning_rate": 1.9998408224995468e-05, "loss": 0.0584, "step": 66080 }, { "epoch": 79.72299336149668, "grad_norm": 4.674586296081543, "learning_rate": 1.9998407983740674e-05, "loss": 0.0593, "step": 66090 }, { "epoch": 79.73506336753168, "grad_norm": 5.275477409362793, "learning_rate": 1.999840774248588e-05, "loss": 0.0584, "step": 66100 }, { "epoch": 79.74713337356668, "grad_norm": 5.122812747955322, "learning_rate": 1.9998407501231087e-05, "loss": 0.0598, "step": 66110 }, { 
"epoch": 79.75920337960169, "grad_norm": 5.395511150360107, "learning_rate": 1.9998407259976293e-05, "loss": 0.0604, "step": 66120 }, { "epoch": 79.77127338563669, "grad_norm": 5.165327548980713, "learning_rate": 1.99984070187215e-05, "loss": 0.0627, "step": 66130 }, { "epoch": 79.78334339167169, "grad_norm": 5.325479507446289, "learning_rate": 1.9998406777466702e-05, "loss": 0.0592, "step": 66140 }, { "epoch": 79.7954133977067, "grad_norm": 5.676788806915283, "learning_rate": 1.9998406536211908e-05, "loss": 0.0571, "step": 66150 }, { "epoch": 79.8074834037417, "grad_norm": 6.005427360534668, "learning_rate": 1.9998406294957114e-05, "loss": 0.0611, "step": 66160 }, { "epoch": 79.8195534097767, "grad_norm": 5.522212028503418, "learning_rate": 1.999840605370232e-05, "loss": 0.0637, "step": 66170 }, { "epoch": 79.8316234158117, "grad_norm": 5.080296039581299, "learning_rate": 1.9998405812447527e-05, "loss": 0.0621, "step": 66180 }, { "epoch": 79.84369342184671, "grad_norm": 5.491708755493164, "learning_rate": 1.9998405571192733e-05, "loss": 0.061, "step": 66190 }, { "epoch": 79.85576342788171, "grad_norm": 5.731345176696777, "learning_rate": 1.999840532993794e-05, "loss": 0.0624, "step": 66200 }, { "epoch": 79.86783343391672, "grad_norm": 5.227781295776367, "learning_rate": 1.9998405088683145e-05, "loss": 0.0609, "step": 66210 }, { "epoch": 79.87990343995172, "grad_norm": 5.896675109863281, "learning_rate": 1.999840484742835e-05, "loss": 0.0616, "step": 66220 }, { "epoch": 79.89197344598672, "grad_norm": 5.717888832092285, "learning_rate": 1.9998404606173558e-05, "loss": 0.0594, "step": 66230 }, { "epoch": 79.90404345202172, "grad_norm": 5.68435001373291, "learning_rate": 1.9998404364918764e-05, "loss": 0.0621, "step": 66240 }, { "epoch": 79.91611345805673, "grad_norm": 5.7991437911987305, "learning_rate": 1.999840412366397e-05, "loss": 0.0623, "step": 66250 }, { "epoch": 79.92818346409173, "grad_norm": 5.546557903289795, "learning_rate": 1.9998403882409176e-05, 
"loss": 0.0626, "step": 66260 }, { "epoch": 79.94025347012673, "grad_norm": 5.307889461517334, "learning_rate": 1.9998403641154383e-05, "loss": 0.0609, "step": 66270 }, { "epoch": 79.95232347616174, "grad_norm": 5.205746173858643, "learning_rate": 1.9998403399899592e-05, "loss": 0.0634, "step": 66280 }, { "epoch": 79.96439348219674, "grad_norm": 5.465376853942871, "learning_rate": 1.99984031586448e-05, "loss": 0.0639, "step": 66290 }, { "epoch": 79.97646348823174, "grad_norm": 5.10026216506958, "learning_rate": 1.9998402917390005e-05, "loss": 0.0612, "step": 66300 }, { "epoch": 79.98853349426675, "grad_norm": 6.1134748458862305, "learning_rate": 1.9998402676135207e-05, "loss": 0.0626, "step": 66310 }, { "epoch": 80.0, "grad_norm": 9.969947814941406, "learning_rate": 1.9998402434880414e-05, "loss": 0.0664, "step": 66320 }, { "epoch": 80.012070006035, "grad_norm": 5.029106616973877, "learning_rate": 1.999840219362562e-05, "loss": 0.0427, "step": 66330 }, { "epoch": 80.02414001207, "grad_norm": 4.293231010437012, "learning_rate": 1.9998401952370826e-05, "loss": 0.0435, "step": 66340 }, { "epoch": 80.03621001810501, "grad_norm": 4.680843353271484, "learning_rate": 1.9998401711116032e-05, "loss": 0.0462, "step": 66350 }, { "epoch": 80.04828002414001, "grad_norm": 4.993288516998291, "learning_rate": 1.999840146986124e-05, "loss": 0.0458, "step": 66360 }, { "epoch": 80.06035003017502, "grad_norm": 4.718488693237305, "learning_rate": 1.9998401228606445e-05, "loss": 0.0479, "step": 66370 }, { "epoch": 80.07242003621002, "grad_norm": 4.68001651763916, "learning_rate": 1.999840098735165e-05, "loss": 0.0454, "step": 66380 }, { "epoch": 80.08449004224502, "grad_norm": 4.755067825317383, "learning_rate": 1.9998400746096857e-05, "loss": 0.047, "step": 66390 }, { "epoch": 80.09656004828003, "grad_norm": 4.5624213218688965, "learning_rate": 1.9998400504842063e-05, "loss": 0.048, "step": 66400 }, { "epoch": 80.10863005431503, "grad_norm": 4.625026702880859, "learning_rate": 
1.999840026358727e-05, "loss": 0.0503, "step": 66410 }, { "epoch": 80.12070006035003, "grad_norm": 4.634173393249512, "learning_rate": 1.9998400022332476e-05, "loss": 0.0518, "step": 66420 }, { "epoch": 80.13277006638504, "grad_norm": 5.02766752243042, "learning_rate": 1.9998399781077682e-05, "loss": 0.0516, "step": 66430 }, { "epoch": 80.14484007242004, "grad_norm": 4.5353593826293945, "learning_rate": 1.9998399539822888e-05, "loss": 0.0492, "step": 66440 }, { "epoch": 80.15691007845504, "grad_norm": 4.643618583679199, "learning_rate": 1.9998399298568094e-05, "loss": 0.0515, "step": 66450 }, { "epoch": 80.16898008449004, "grad_norm": 5.146731376647949, "learning_rate": 1.99983990573133e-05, "loss": 0.0509, "step": 66460 }, { "epoch": 80.18105009052505, "grad_norm": 5.306248664855957, "learning_rate": 1.9998398816058507e-05, "loss": 0.0526, "step": 66470 }, { "epoch": 80.19312009656005, "grad_norm": 5.087640762329102, "learning_rate": 1.9998398574803713e-05, "loss": 0.0497, "step": 66480 }, { "epoch": 80.20519010259505, "grad_norm": 5.259634017944336, "learning_rate": 1.999839833354892e-05, "loss": 0.053, "step": 66490 }, { "epoch": 80.21726010863006, "grad_norm": 5.163248062133789, "learning_rate": 1.9998398092294126e-05, "loss": 0.0531, "step": 66500 }, { "epoch": 80.21726010863006, "eval_loss": 12.506991386413574, "eval_runtime": 8.1414, "eval_samples_per_second": 85.612, "eval_steps_per_second": 10.809, "step": 66500 }, { "epoch": 80.22933011466506, "grad_norm": 5.313006401062012, "learning_rate": 1.9998397851039332e-05, "loss": 0.0532, "step": 66510 }, { "epoch": 80.24140012070006, "grad_norm": 5.341460704803467, "learning_rate": 1.9998397609784538e-05, "loss": 0.0518, "step": 66520 }, { "epoch": 80.25347012673507, "grad_norm": 5.382905006408691, "learning_rate": 1.9998397368529744e-05, "loss": 0.0521, "step": 66530 }, { "epoch": 80.26554013277007, "grad_norm": 5.403218746185303, "learning_rate": 1.999839712727495e-05, "loss": 0.0547, "step": 66540 }, { 
"epoch": 80.27761013880507, "grad_norm": 5.274344444274902, "learning_rate": 1.9998396886020157e-05, "loss": 0.054, "step": 66550 }, { "epoch": 80.28968014484008, "grad_norm": 5.389572620391846, "learning_rate": 1.999839664476536e-05, "loss": 0.0552, "step": 66560 }, { "epoch": 80.30175015087508, "grad_norm": 5.366660118103027, "learning_rate": 1.9998396403510566e-05, "loss": 0.0558, "step": 66570 }, { "epoch": 80.31382015691008, "grad_norm": 5.524882793426514, "learning_rate": 1.9998396162255772e-05, "loss": 0.0557, "step": 66580 }, { "epoch": 80.32589016294509, "grad_norm": 5.405610084533691, "learning_rate": 1.9998395921000978e-05, "loss": 0.056, "step": 66590 }, { "epoch": 80.33796016898009, "grad_norm": 5.413893699645996, "learning_rate": 1.9998395679746184e-05, "loss": 0.0551, "step": 66600 }, { "epoch": 80.35003017501509, "grad_norm": 5.188392639160156, "learning_rate": 1.999839543849139e-05, "loss": 0.0554, "step": 66610 }, { "epoch": 80.3621001810501, "grad_norm": 4.779399394989014, "learning_rate": 1.9998395197236597e-05, "loss": 0.0548, "step": 66620 }, { "epoch": 80.3741701870851, "grad_norm": 5.1528096199035645, "learning_rate": 1.9998394955981803e-05, "loss": 0.0538, "step": 66630 }, { "epoch": 80.3862401931201, "grad_norm": 5.835824966430664, "learning_rate": 1.999839471472701e-05, "loss": 0.0546, "step": 66640 }, { "epoch": 80.3983101991551, "grad_norm": 5.210485935211182, "learning_rate": 1.9998394473472215e-05, "loss": 0.0561, "step": 66650 }, { "epoch": 80.41038020519011, "grad_norm": 5.06102180480957, "learning_rate": 1.999839423221742e-05, "loss": 0.0555, "step": 66660 }, { "epoch": 80.42245021122511, "grad_norm": 5.585063934326172, "learning_rate": 1.9998393990962628e-05, "loss": 0.0579, "step": 66670 }, { "epoch": 80.43452021726011, "grad_norm": 5.473961353302002, "learning_rate": 1.9998393749707834e-05, "loss": 0.0538, "step": 66680 }, { "epoch": 80.44659022329512, "grad_norm": 5.253476619720459, "learning_rate": 1.999839350845304e-05, 
"loss": 0.0574, "step": 66690 }, { "epoch": 80.45866022933012, "grad_norm": 5.403327941894531, "learning_rate": 1.9998393267198246e-05, "loss": 0.055, "step": 66700 }, { "epoch": 80.47073023536512, "grad_norm": 5.303191184997559, "learning_rate": 1.9998393025943453e-05, "loss": 0.0581, "step": 66710 }, { "epoch": 80.48280024140013, "grad_norm": 5.553586006164551, "learning_rate": 1.999839278468866e-05, "loss": 0.0555, "step": 66720 }, { "epoch": 80.49487024743513, "grad_norm": 5.094935894012451, "learning_rate": 1.9998392543433865e-05, "loss": 0.0563, "step": 66730 }, { "epoch": 80.50694025347012, "grad_norm": 5.089351177215576, "learning_rate": 1.999839230217907e-05, "loss": 0.0558, "step": 66740 }, { "epoch": 80.51901025950512, "grad_norm": 4.933394432067871, "learning_rate": 1.9998392060924278e-05, "loss": 0.058, "step": 66750 }, { "epoch": 80.53108026554013, "grad_norm": 5.382403373718262, "learning_rate": 1.9998391819669484e-05, "loss": 0.0587, "step": 66760 }, { "epoch": 80.54315027157513, "grad_norm": 6.0528130531311035, "learning_rate": 1.999839157841469e-05, "loss": 0.0572, "step": 66770 }, { "epoch": 80.55522027761013, "grad_norm": 5.364448070526123, "learning_rate": 1.9998391337159896e-05, "loss": 0.0573, "step": 66780 }, { "epoch": 80.56729028364514, "grad_norm": 5.767241477966309, "learning_rate": 1.9998391095905102e-05, "loss": 0.0579, "step": 66790 }, { "epoch": 80.57936028968014, "grad_norm": 5.241068363189697, "learning_rate": 1.999839085465031e-05, "loss": 0.0595, "step": 66800 }, { "epoch": 80.59143029571514, "grad_norm": 5.595676898956299, "learning_rate": 1.999839061339551e-05, "loss": 0.0573, "step": 66810 }, { "epoch": 80.60350030175015, "grad_norm": 5.670836448669434, "learning_rate": 1.999839037214072e-05, "loss": 0.0556, "step": 66820 }, { "epoch": 80.61557030778515, "grad_norm": 5.528497695922852, "learning_rate": 1.9998390130885927e-05, "loss": 0.0582, "step": 66830 }, { "epoch": 80.62764031382015, "grad_norm": 5.509511947631836, 
"learning_rate": 1.9998389889631133e-05, "loss": 0.0596, "step": 66840 }, { "epoch": 80.63971031985515, "grad_norm": 5.6322126388549805, "learning_rate": 1.999838964837634e-05, "loss": 0.0612, "step": 66850 }, { "epoch": 80.65178032589016, "grad_norm": 4.880138397216797, "learning_rate": 1.9998389407121546e-05, "loss": 0.057, "step": 66860 }, { "epoch": 80.66385033192516, "grad_norm": 6.259881496429443, "learning_rate": 1.9998389165866752e-05, "loss": 0.0586, "step": 66870 }, { "epoch": 80.67592033796016, "grad_norm": 5.527336120605469, "learning_rate": 1.999838892461196e-05, "loss": 0.0615, "step": 66880 }, { "epoch": 80.68799034399517, "grad_norm": 5.300381660461426, "learning_rate": 1.9998388683357165e-05, "loss": 0.0592, "step": 66890 }, { "epoch": 80.70006035003017, "grad_norm": 5.481378078460693, "learning_rate": 1.999838844210237e-05, "loss": 0.0612, "step": 66900 }, { "epoch": 80.71213035606517, "grad_norm": 5.600503444671631, "learning_rate": 1.9998388200847577e-05, "loss": 0.0611, "step": 66910 }, { "epoch": 80.72420036210018, "grad_norm": 5.464258193969727, "learning_rate": 1.9998387959592783e-05, "loss": 0.0607, "step": 66920 }, { "epoch": 80.73627036813518, "grad_norm": 5.892408847808838, "learning_rate": 1.999838771833799e-05, "loss": 0.06, "step": 66930 }, { "epoch": 80.74834037417018, "grad_norm": 5.5773234367370605, "learning_rate": 1.9998387477083196e-05, "loss": 0.0621, "step": 66940 }, { "epoch": 80.76041038020519, "grad_norm": 5.23211145401001, "learning_rate": 1.9998387235828402e-05, "loss": 0.0602, "step": 66950 }, { "epoch": 80.77248038624019, "grad_norm": 5.138144016265869, "learning_rate": 1.9998386994573608e-05, "loss": 0.0614, "step": 66960 }, { "epoch": 80.7845503922752, "grad_norm": 5.504759788513184, "learning_rate": 1.999838675331881e-05, "loss": 0.0611, "step": 66970 }, { "epoch": 80.7966203983102, "grad_norm": 5.336289882659912, "learning_rate": 1.9998386512064017e-05, "loss": 0.0591, "step": 66980 }, { "epoch": 80.8086904043452, 
"grad_norm": 5.874889373779297, "learning_rate": 1.9998386270809223e-05, "loss": 0.0609, "step": 66990 }, { "epoch": 80.8207604103802, "grad_norm": 5.6830854415893555, "learning_rate": 1.999838602955443e-05, "loss": 0.0613, "step": 67000 }, { "epoch": 80.8207604103802, "eval_loss": 12.527606010437012, "eval_runtime": 8.1351, "eval_samples_per_second": 85.678, "eval_steps_per_second": 10.817, "step": 67000 }, { "epoch": 80.8328304164152, "grad_norm": 5.931861400604248, "learning_rate": 1.9998385788299636e-05, "loss": 0.0622, "step": 67010 }, { "epoch": 80.84490042245021, "grad_norm": 4.969570636749268, "learning_rate": 1.9998385547044842e-05, "loss": 0.0611, "step": 67020 }, { "epoch": 80.85697042848521, "grad_norm": 5.129650115966797, "learning_rate": 1.9998385305790048e-05, "loss": 0.0625, "step": 67030 }, { "epoch": 80.86904043452022, "grad_norm": 5.877858638763428, "learning_rate": 1.9998385064535254e-05, "loss": 0.0595, "step": 67040 }, { "epoch": 80.88111044055522, "grad_norm": 4.961245536804199, "learning_rate": 1.999838482328046e-05, "loss": 0.0601, "step": 67050 }, { "epoch": 80.89318044659022, "grad_norm": 5.704771995544434, "learning_rate": 1.9998384582025667e-05, "loss": 0.0595, "step": 67060 }, { "epoch": 80.90525045262522, "grad_norm": 5.263516426086426, "learning_rate": 1.9998384340770873e-05, "loss": 0.0636, "step": 67070 }, { "epoch": 80.91732045866023, "grad_norm": 5.671525478363037, "learning_rate": 1.999838409951608e-05, "loss": 0.0622, "step": 67080 }, { "epoch": 80.92939046469523, "grad_norm": 5.806789398193359, "learning_rate": 1.9998383858261285e-05, "loss": 0.0638, "step": 67090 }, { "epoch": 80.94146047073023, "grad_norm": 5.769906044006348, "learning_rate": 1.999838361700649e-05, "loss": 0.0612, "step": 67100 }, { "epoch": 80.95353047676524, "grad_norm": 5.181122303009033, "learning_rate": 1.9998383375751698e-05, "loss": 0.063, "step": 67110 }, { "epoch": 80.96560048280024, "grad_norm": 5.210780620574951, "learning_rate": 
1.9998383134496904e-05, "loss": 0.0636, "step": 67120 }, { "epoch": 80.97767048883524, "grad_norm": 5.521365165710449, "learning_rate": 1.999838289324211e-05, "loss": 0.0582, "step": 67130 }, { "epoch": 80.98974049487025, "grad_norm": 5.465786933898926, "learning_rate": 1.9998382651987317e-05, "loss": 0.064, "step": 67140 }, { "epoch": 81.0012070006035, "grad_norm": 4.453993797302246, "learning_rate": 1.9998382410732523e-05, "loss": 0.06, "step": 67150 }, { "epoch": 81.0132770066385, "grad_norm": 4.674171447753906, "learning_rate": 1.999838216947773e-05, "loss": 0.0405, "step": 67160 }, { "epoch": 81.0253470126735, "grad_norm": 5.193133354187012, "learning_rate": 1.9998381928222935e-05, "loss": 0.0471, "step": 67170 }, { "epoch": 81.03741701870851, "grad_norm": 4.506906509399414, "learning_rate": 1.999838168696814e-05, "loss": 0.0458, "step": 67180 }, { "epoch": 81.04948702474351, "grad_norm": 5.041962146759033, "learning_rate": 1.9998381445713348e-05, "loss": 0.0505, "step": 67190 }, { "epoch": 81.06155703077852, "grad_norm": 4.587237358093262, "learning_rate": 1.9998381204458554e-05, "loss": 0.0477, "step": 67200 }, { "epoch": 81.07362703681352, "grad_norm": 4.837600231170654, "learning_rate": 1.999838096320376e-05, "loss": 0.0482, "step": 67210 }, { "epoch": 81.08569704284852, "grad_norm": 5.009627819061279, "learning_rate": 1.9998380721948963e-05, "loss": 0.0488, "step": 67220 }, { "epoch": 81.09776704888353, "grad_norm": 4.6441473960876465, "learning_rate": 1.999838048069417e-05, "loss": 0.0499, "step": 67230 }, { "epoch": 81.10983705491853, "grad_norm": 4.682608604431152, "learning_rate": 1.9998380239439375e-05, "loss": 0.0476, "step": 67240 }, { "epoch": 81.12190706095353, "grad_norm": 4.619714736938477, "learning_rate": 1.999837999818458e-05, "loss": 0.0475, "step": 67250 }, { "epoch": 81.13397706698854, "grad_norm": 5.320340633392334, "learning_rate": 1.9998379756929788e-05, "loss": 0.0506, "step": 67260 }, { "epoch": 81.14604707302354, "grad_norm": 
4.698822021484375, "learning_rate": 1.9998379515674994e-05, "loss": 0.0488, "step": 67270 }, { "epoch": 81.15811707905854, "grad_norm": 4.884844779968262, "learning_rate": 1.99983792744202e-05, "loss": 0.0495, "step": 67280 }, { "epoch": 81.17018708509354, "grad_norm": 4.56401252746582, "learning_rate": 1.9998379033165406e-05, "loss": 0.0484, "step": 67290 }, { "epoch": 81.18225709112855, "grad_norm": 5.240931034088135, "learning_rate": 1.9998378791910613e-05, "loss": 0.0534, "step": 67300 }, { "epoch": 81.19432709716355, "grad_norm": 4.856487274169922, "learning_rate": 1.999837855065582e-05, "loss": 0.048, "step": 67310 }, { "epoch": 81.20639710319855, "grad_norm": 5.267258167266846, "learning_rate": 1.9998378309401025e-05, "loss": 0.0505, "step": 67320 }, { "epoch": 81.21846710923356, "grad_norm": 5.177761554718018, "learning_rate": 1.999837806814623e-05, "loss": 0.0516, "step": 67330 }, { "epoch": 81.23053711526856, "grad_norm": 5.438549995422363, "learning_rate": 1.9998377826891437e-05, "loss": 0.0532, "step": 67340 }, { "epoch": 81.24260712130356, "grad_norm": 4.973447799682617, "learning_rate": 1.9998377585636644e-05, "loss": 0.0518, "step": 67350 }, { "epoch": 81.25467712733857, "grad_norm": 4.933896541595459, "learning_rate": 1.9998377344381853e-05, "loss": 0.0504, "step": 67360 }, { "epoch": 81.26674713337357, "grad_norm": 4.8730854988098145, "learning_rate": 1.999837710312706e-05, "loss": 0.0515, "step": 67370 }, { "epoch": 81.27881713940857, "grad_norm": 5.731990337371826, "learning_rate": 1.9998376861872266e-05, "loss": 0.0506, "step": 67380 }, { "epoch": 81.29088714544358, "grad_norm": 5.539745330810547, "learning_rate": 1.999837662061747e-05, "loss": 0.0564, "step": 67390 }, { "epoch": 81.30295715147858, "grad_norm": 5.095299243927002, "learning_rate": 1.9998376379362675e-05, "loss": 0.0544, "step": 67400 }, { "epoch": 81.31502715751358, "grad_norm": 5.236217021942139, "learning_rate": 1.999837613810788e-05, "loss": 0.0542, "step": 67410 }, { "epoch": 
81.32709716354859, "grad_norm": 5.17903995513916, "learning_rate": 1.9998375896853087e-05, "loss": 0.0512, "step": 67420 }, { "epoch": 81.33916716958359, "grad_norm": 4.741039276123047, "learning_rate": 1.9998375655598293e-05, "loss": 0.0541, "step": 67430 }, { "epoch": 81.35123717561859, "grad_norm": 5.390334606170654, "learning_rate": 1.99983754143435e-05, "loss": 0.0524, "step": 67440 }, { "epoch": 81.3633071816536, "grad_norm": 5.069419860839844, "learning_rate": 1.9998375173088706e-05, "loss": 0.0538, "step": 67450 }, { "epoch": 81.3753771876886, "grad_norm": 5.154407978057861, "learning_rate": 1.9998374931833912e-05, "loss": 0.0527, "step": 67460 }, { "epoch": 81.3874471937236, "grad_norm": 5.151788711547852, "learning_rate": 1.9998374690579118e-05, "loss": 0.0551, "step": 67470 }, { "epoch": 81.3995171997586, "grad_norm": 5.343837738037109, "learning_rate": 1.9998374449324324e-05, "loss": 0.0539, "step": 67480 }, { "epoch": 81.41158720579361, "grad_norm": 5.096736431121826, "learning_rate": 1.999837420806953e-05, "loss": 0.0542, "step": 67490 }, { "epoch": 81.42365721182861, "grad_norm": 5.7244415283203125, "learning_rate": 1.9998373966814737e-05, "loss": 0.0546, "step": 67500 }, { "epoch": 81.42365721182861, "eval_loss": 12.549853324890137, "eval_runtime": 8.134, "eval_samples_per_second": 85.69, "eval_steps_per_second": 10.819, "step": 67500 }, { "epoch": 81.43572721786362, "grad_norm": 4.965661525726318, "learning_rate": 1.9998373725559943e-05, "loss": 0.0542, "step": 67510 }, { "epoch": 81.44779722389862, "grad_norm": 5.481508731842041, "learning_rate": 1.999837348430515e-05, "loss": 0.0531, "step": 67520 }, { "epoch": 81.45986722993362, "grad_norm": 5.721755504608154, "learning_rate": 1.9998373243050356e-05, "loss": 0.0567, "step": 67530 }, { "epoch": 81.47193723596862, "grad_norm": 5.714839935302734, "learning_rate": 1.9998373001795562e-05, "loss": 0.0551, "step": 67540 }, { "epoch": 81.48400724200363, "grad_norm": 5.352684497833252, "learning_rate": 
1.9998372760540768e-05, "loss": 0.0559, "step": 67550 }, { "epoch": 81.49607724803863, "grad_norm": 5.372300624847412, "learning_rate": 1.9998372519285974e-05, "loss": 0.0565, "step": 67560 }, { "epoch": 81.50814725407362, "grad_norm": 5.085289001464844, "learning_rate": 1.999837227803118e-05, "loss": 0.0581, "step": 67570 }, { "epoch": 81.52021726010862, "grad_norm": 5.273970603942871, "learning_rate": 1.9998372036776387e-05, "loss": 0.0559, "step": 67580 }, { "epoch": 81.53228726614363, "grad_norm": 5.354611396789551, "learning_rate": 1.9998371795521593e-05, "loss": 0.0582, "step": 67590 }, { "epoch": 81.54435727217863, "grad_norm": 5.320944309234619, "learning_rate": 1.99983715542668e-05, "loss": 0.0583, "step": 67600 }, { "epoch": 81.55642727821363, "grad_norm": 5.6064677238464355, "learning_rate": 1.9998371313012005e-05, "loss": 0.0565, "step": 67610 }, { "epoch": 81.56849728424864, "grad_norm": 5.334126949310303, "learning_rate": 1.999837107175721e-05, "loss": 0.0559, "step": 67620 }, { "epoch": 81.58056729028364, "grad_norm": 5.375230312347412, "learning_rate": 1.9998370830502418e-05, "loss": 0.059, "step": 67630 }, { "epoch": 81.59263729631864, "grad_norm": 5.353614807128906, "learning_rate": 1.999837058924762e-05, "loss": 0.056, "step": 67640 }, { "epoch": 81.60470730235365, "grad_norm": 5.5587053298950195, "learning_rate": 1.9998370347992827e-05, "loss": 0.0573, "step": 67650 }, { "epoch": 81.61677730838865, "grad_norm": 4.9998250007629395, "learning_rate": 1.9998370106738033e-05, "loss": 0.0603, "step": 67660 }, { "epoch": 81.62884731442365, "grad_norm": 4.681044578552246, "learning_rate": 1.999836986548324e-05, "loss": 0.0574, "step": 67670 }, { "epoch": 81.64091732045866, "grad_norm": 5.744080066680908, "learning_rate": 1.9998369624228445e-05, "loss": 0.0582, "step": 67680 }, { "epoch": 81.65298732649366, "grad_norm": 5.395355701446533, "learning_rate": 1.999836938297365e-05, "loss": 0.0594, "step": 67690 }, { "epoch": 81.66505733252866, "grad_norm": 
5.634134292602539, "learning_rate": 1.9998369141718858e-05, "loss": 0.0613, "step": 67700 }, { "epoch": 81.67712733856366, "grad_norm": 5.134435176849365, "learning_rate": 1.9998368900464064e-05, "loss": 0.0585, "step": 67710 }, { "epoch": 81.68919734459867, "grad_norm": 5.298087120056152, "learning_rate": 1.999836865920927e-05, "loss": 0.0607, "step": 67720 }, { "epoch": 81.70126735063367, "grad_norm": 5.218008995056152, "learning_rate": 1.9998368417954476e-05, "loss": 0.0573, "step": 67730 }, { "epoch": 81.71333735666867, "grad_norm": 5.754040718078613, "learning_rate": 1.9998368176699683e-05, "loss": 0.0598, "step": 67740 }, { "epoch": 81.72540736270368, "grad_norm": 5.428292274475098, "learning_rate": 1.999836793544489e-05, "loss": 0.0585, "step": 67750 }, { "epoch": 81.73747736873868, "grad_norm": 5.119063377380371, "learning_rate": 1.9998367694190095e-05, "loss": 0.0565, "step": 67760 }, { "epoch": 81.74954737477368, "grad_norm": 5.498736381530762, "learning_rate": 1.99983674529353e-05, "loss": 0.0555, "step": 67770 }, { "epoch": 81.76161738080869, "grad_norm": 6.104394435882568, "learning_rate": 1.9998367211680508e-05, "loss": 0.061, "step": 67780 }, { "epoch": 81.77368738684369, "grad_norm": 4.820679187774658, "learning_rate": 1.9998366970425714e-05, "loss": 0.0578, "step": 67790 }, { "epoch": 81.7857573928787, "grad_norm": 5.144402980804443, "learning_rate": 1.999836672917092e-05, "loss": 0.0609, "step": 67800 }, { "epoch": 81.7978273989137, "grad_norm": 5.110511302947998, "learning_rate": 1.9998366487916126e-05, "loss": 0.058, "step": 67810 }, { "epoch": 81.8098974049487, "grad_norm": 5.72288703918457, "learning_rate": 1.9998366246661332e-05, "loss": 0.0612, "step": 67820 }, { "epoch": 81.8219674109837, "grad_norm": 4.992381572723389, "learning_rate": 1.999836600540654e-05, "loss": 0.0592, "step": 67830 }, { "epoch": 81.8340374170187, "grad_norm": 5.539302825927734, "learning_rate": 1.9998365764151745e-05, "loss": 0.0625, "step": 67840 }, { "epoch": 
81.84610742305371, "grad_norm": 5.943055629730225, "learning_rate": 1.999836552289695e-05, "loss": 0.062, "step": 67850 }, { "epoch": 81.85817742908871, "grad_norm": 5.720965385437012, "learning_rate": 1.9998365281642157e-05, "loss": 0.0582, "step": 67860 }, { "epoch": 81.87024743512372, "grad_norm": 5.627582550048828, "learning_rate": 1.9998365040387363e-05, "loss": 0.0599, "step": 67870 }, { "epoch": 81.88231744115872, "grad_norm": 5.207618713378906, "learning_rate": 1.999836479913257e-05, "loss": 0.0605, "step": 67880 }, { "epoch": 81.89438744719372, "grad_norm": 5.477315425872803, "learning_rate": 1.9998364557877773e-05, "loss": 0.0606, "step": 67890 }, { "epoch": 81.90645745322873, "grad_norm": 5.659132957458496, "learning_rate": 1.9998364316622982e-05, "loss": 0.0592, "step": 67900 }, { "epoch": 81.91852745926373, "grad_norm": 5.6460065841674805, "learning_rate": 1.999836407536819e-05, "loss": 0.062, "step": 67910 }, { "epoch": 81.93059746529873, "grad_norm": 5.183572292327881, "learning_rate": 1.9998363834113395e-05, "loss": 0.0602, "step": 67920 }, { "epoch": 81.94266747133373, "grad_norm": 5.606837272644043, "learning_rate": 1.99983635928586e-05, "loss": 0.0623, "step": 67930 }, { "epoch": 81.95473747736874, "grad_norm": 5.713935375213623, "learning_rate": 1.9998363351603807e-05, "loss": 0.0621, "step": 67940 }, { "epoch": 81.96680748340374, "grad_norm": 5.143772125244141, "learning_rate": 1.9998363110349013e-05, "loss": 0.0611, "step": 67950 }, { "epoch": 81.97887748943874, "grad_norm": 5.646153450012207, "learning_rate": 1.999836286909422e-05, "loss": 0.0604, "step": 67960 }, { "epoch": 81.99094749547375, "grad_norm": 5.63379430770874, "learning_rate": 1.9998362627839426e-05, "loss": 0.0624, "step": 67970 }, { "epoch": 82.002414001207, "grad_norm": 4.73036003112793, "learning_rate": 1.9998362386584632e-05, "loss": 0.059, "step": 67980 }, { "epoch": 82.014484007242, "grad_norm": 4.388375759124756, "learning_rate": 1.9998362145329838e-05, "loss": 0.0412, 
"step": 67990 }, { "epoch": 82.026554013277, "grad_norm": 4.785322666168213, "learning_rate": 1.9998361904075044e-05, "loss": 0.0447, "step": 68000 }, { "epoch": 82.026554013277, "eval_loss": 12.536049842834473, "eval_runtime": 8.1309, "eval_samples_per_second": 85.723, "eval_steps_per_second": 10.823, "step": 68000 }, { "epoch": 82.03862401931201, "grad_norm": 5.2772955894470215, "learning_rate": 1.999836166282025e-05, "loss": 0.0456, "step": 68010 }, { "epoch": 82.05069402534701, "grad_norm": 4.696455955505371, "learning_rate": 1.9998361421565457e-05, "loss": 0.0461, "step": 68020 }, { "epoch": 82.06276403138202, "grad_norm": 5.167632102966309, "learning_rate": 1.9998361180310663e-05, "loss": 0.051, "step": 68030 }, { "epoch": 82.07483403741702, "grad_norm": 5.264662265777588, "learning_rate": 1.999836093905587e-05, "loss": 0.0499, "step": 68040 }, { "epoch": 82.08690404345202, "grad_norm": 5.066656112670898, "learning_rate": 1.9998360697801072e-05, "loss": 0.0488, "step": 68050 }, { "epoch": 82.09897404948703, "grad_norm": 4.619762420654297, "learning_rate": 1.9998360456546278e-05, "loss": 0.049, "step": 68060 }, { "epoch": 82.11104405552203, "grad_norm": 4.359014511108398, "learning_rate": 1.9998360215291484e-05, "loss": 0.0498, "step": 68070 }, { "epoch": 82.12311406155703, "grad_norm": 4.785669803619385, "learning_rate": 1.999835997403669e-05, "loss": 0.048, "step": 68080 }, { "epoch": 82.13518406759204, "grad_norm": 5.175955772399902, "learning_rate": 1.9998359732781897e-05, "loss": 0.0497, "step": 68090 }, { "epoch": 82.14725407362704, "grad_norm": 4.411837577819824, "learning_rate": 1.9998359491527103e-05, "loss": 0.0498, "step": 68100 }, { "epoch": 82.15932407966204, "grad_norm": 5.100308418273926, "learning_rate": 1.999835925027231e-05, "loss": 0.0508, "step": 68110 }, { "epoch": 82.17139408569705, "grad_norm": 4.846618175506592, "learning_rate": 1.9998359009017515e-05, "loss": 0.0494, "step": 68120 }, { "epoch": 82.18346409173205, "grad_norm": 
4.802457809448242, "learning_rate": 1.999835876776272e-05, "loss": 0.0519, "step": 68130 }, { "epoch": 82.19553409776705, "grad_norm": 4.603299617767334, "learning_rate": 1.9998358526507928e-05, "loss": 0.05, "step": 68140 }, { "epoch": 82.20760410380205, "grad_norm": 5.660634517669678, "learning_rate": 1.9998358285253134e-05, "loss": 0.0496, "step": 68150 }, { "epoch": 82.21967410983706, "grad_norm": 4.568277359008789, "learning_rate": 1.999835804399834e-05, "loss": 0.0491, "step": 68160 }, { "epoch": 82.23174411587206, "grad_norm": 5.311522006988525, "learning_rate": 1.9998357802743547e-05, "loss": 0.0504, "step": 68170 }, { "epoch": 82.24381412190706, "grad_norm": 5.157040596008301, "learning_rate": 1.9998357561488753e-05, "loss": 0.051, "step": 68180 }, { "epoch": 82.25588412794207, "grad_norm": 4.695820331573486, "learning_rate": 1.999835732023396e-05, "loss": 0.0528, "step": 68190 }, { "epoch": 82.26795413397707, "grad_norm": 4.730595111846924, "learning_rate": 1.9998357078979165e-05, "loss": 0.0496, "step": 68200 }, { "epoch": 82.28002414001207, "grad_norm": 4.74273681640625, "learning_rate": 1.999835683772437e-05, "loss": 0.0516, "step": 68210 }, { "epoch": 82.29209414604708, "grad_norm": 4.610339164733887, "learning_rate": 1.9998356596469578e-05, "loss": 0.0553, "step": 68220 }, { "epoch": 82.30416415208208, "grad_norm": 5.088507175445557, "learning_rate": 1.9998356355214784e-05, "loss": 0.0538, "step": 68230 }, { "epoch": 82.31623415811708, "grad_norm": 5.087621688842773, "learning_rate": 1.999835611395999e-05, "loss": 0.055, "step": 68240 }, { "epoch": 82.32830416415209, "grad_norm": 4.836313247680664, "learning_rate": 1.9998355872705196e-05, "loss": 0.0551, "step": 68250 }, { "epoch": 82.34037417018709, "grad_norm": 5.717047214508057, "learning_rate": 1.9998355631450402e-05, "loss": 0.0547, "step": 68260 }, { "epoch": 82.3524441762221, "grad_norm": 5.505490779876709, "learning_rate": 1.999835539019561e-05, "loss": 0.0549, "step": 68270 }, { "epoch": 
82.3645141822571, "grad_norm": 4.994042873382568, "learning_rate": 1.9998355148940815e-05, "loss": 0.0542, "step": 68280 }, { "epoch": 82.3765841882921, "grad_norm": 5.134110450744629, "learning_rate": 1.999835490768602e-05, "loss": 0.0546, "step": 68290 }, { "epoch": 82.3886541943271, "grad_norm": 5.238455772399902, "learning_rate": 1.9998354666431224e-05, "loss": 0.0548, "step": 68300 }, { "epoch": 82.4007242003621, "grad_norm": 4.755467891693115, "learning_rate": 1.999835442517643e-05, "loss": 0.0556, "step": 68310 }, { "epoch": 82.41279420639711, "grad_norm": 5.073837757110596, "learning_rate": 1.9998354183921636e-05, "loss": 0.0534, "step": 68320 }, { "epoch": 82.42486421243211, "grad_norm": 4.835320472717285, "learning_rate": 1.9998353942666843e-05, "loss": 0.0544, "step": 68330 }, { "epoch": 82.43693421846712, "grad_norm": 5.118345737457275, "learning_rate": 1.999835370141205e-05, "loss": 0.0532, "step": 68340 }, { "epoch": 82.44900422450212, "grad_norm": 4.848635673522949, "learning_rate": 1.9998353460157255e-05, "loss": 0.0539, "step": 68350 }, { "epoch": 82.46107423053712, "grad_norm": 4.986633777618408, "learning_rate": 1.999835321890246e-05, "loss": 0.0554, "step": 68360 }, { "epoch": 82.47314423657213, "grad_norm": 5.301085948944092, "learning_rate": 1.9998352977647667e-05, "loss": 0.0541, "step": 68370 }, { "epoch": 82.48521424260713, "grad_norm": 5.1482720375061035, "learning_rate": 1.9998352736392874e-05, "loss": 0.0544, "step": 68380 }, { "epoch": 82.49728424864213, "grad_norm": 4.668992519378662, "learning_rate": 1.999835249513808e-05, "loss": 0.0548, "step": 68390 }, { "epoch": 82.50935425467712, "grad_norm": 4.8442301750183105, "learning_rate": 1.9998352253883286e-05, "loss": 0.053, "step": 68400 }, { "epoch": 82.52142426071212, "grad_norm": 4.66680908203125, "learning_rate": 1.9998352012628492e-05, "loss": 0.0543, "step": 68410 }, { "epoch": 82.53349426674713, "grad_norm": 5.100085258483887, "learning_rate": 1.99983517713737e-05, "loss": 
0.0547, "step": 68420 }, { "epoch": 82.54556427278213, "grad_norm": 5.413245677947998, "learning_rate": 1.9998351530118905e-05, "loss": 0.0557, "step": 68430 }, { "epoch": 82.55763427881713, "grad_norm": 4.87555456161499, "learning_rate": 1.9998351288864114e-05, "loss": 0.0573, "step": 68440 }, { "epoch": 82.56970428485214, "grad_norm": 5.11168909072876, "learning_rate": 1.999835104760932e-05, "loss": 0.0554, "step": 68450 }, { "epoch": 82.58177429088714, "grad_norm": 5.3752760887146, "learning_rate": 1.9998350806354523e-05, "loss": 0.0575, "step": 68460 }, { "epoch": 82.59384429692214, "grad_norm": 5.056478977203369, "learning_rate": 1.999835056509973e-05, "loss": 0.0546, "step": 68470 }, { "epoch": 82.60591430295715, "grad_norm": 4.8330793380737305, "learning_rate": 1.9998350323844936e-05, "loss": 0.0579, "step": 68480 }, { "epoch": 82.61798430899215, "grad_norm": 5.156948089599609, "learning_rate": 1.9998350082590142e-05, "loss": 0.0577, "step": 68490 }, { "epoch": 82.63005431502715, "grad_norm": 5.261240005493164, "learning_rate": 1.9998349841335348e-05, "loss": 0.0548, "step": 68500 }, { "epoch": 82.63005431502715, "eval_loss": 12.568981170654297, "eval_runtime": 8.1308, "eval_samples_per_second": 85.724, "eval_steps_per_second": 10.823, "step": 68500 }, { "epoch": 82.64212432106216, "grad_norm": 5.851475238800049, "learning_rate": 1.9998349600080554e-05, "loss": 0.0558, "step": 68510 }, { "epoch": 82.65419432709716, "grad_norm": 5.474430561065674, "learning_rate": 1.999834935882576e-05, "loss": 0.0585, "step": 68520 }, { "epoch": 82.66626433313216, "grad_norm": 5.27010440826416, "learning_rate": 1.9998349117570967e-05, "loss": 0.0597, "step": 68530 }, { "epoch": 82.67833433916717, "grad_norm": 5.128119468688965, "learning_rate": 1.9998348876316173e-05, "loss": 0.0591, "step": 68540 }, { "epoch": 82.69040434520217, "grad_norm": 5.7232489585876465, "learning_rate": 1.999834863506138e-05, "loss": 0.0562, "step": 68550 }, { "epoch": 82.70247435123717, 
"grad_norm": 5.295707702636719, "learning_rate": 1.9998348393806586e-05, "loss": 0.0563, "step": 68560 }, { "epoch": 82.71454435727217, "grad_norm": 5.007434844970703, "learning_rate": 1.9998348152551792e-05, "loss": 0.0561, "step": 68570 }, { "epoch": 82.72661436330718, "grad_norm": 5.3647966384887695, "learning_rate": 1.9998347911296998e-05, "loss": 0.0565, "step": 68580 }, { "epoch": 82.73868436934218, "grad_norm": 5.625888347625732, "learning_rate": 1.9998347670042204e-05, "loss": 0.0573, "step": 68590 }, { "epoch": 82.75075437537718, "grad_norm": 5.216412544250488, "learning_rate": 1.999834742878741e-05, "loss": 0.0573, "step": 68600 }, { "epoch": 82.76282438141219, "grad_norm": 5.360692024230957, "learning_rate": 1.9998347187532617e-05, "loss": 0.0578, "step": 68610 }, { "epoch": 82.77489438744719, "grad_norm": 5.410419464111328, "learning_rate": 1.9998346946277823e-05, "loss": 0.0565, "step": 68620 }, { "epoch": 82.7869643934822, "grad_norm": 5.423883438110352, "learning_rate": 1.999834670502303e-05, "loss": 0.0604, "step": 68630 }, { "epoch": 82.7990343995172, "grad_norm": 5.284257888793945, "learning_rate": 1.9998346463768235e-05, "loss": 0.0583, "step": 68640 }, { "epoch": 82.8111044055522, "grad_norm": 5.233118534088135, "learning_rate": 1.999834622251344e-05, "loss": 0.0561, "step": 68650 }, { "epoch": 82.8231744115872, "grad_norm": 5.164747714996338, "learning_rate": 1.9998345981258648e-05, "loss": 0.0614, "step": 68660 }, { "epoch": 82.8352444176222, "grad_norm": 5.411870002746582, "learning_rate": 1.9998345740003854e-05, "loss": 0.06, "step": 68670 }, { "epoch": 82.84731442365721, "grad_norm": 5.275054931640625, "learning_rate": 1.999834549874906e-05, "loss": 0.0607, "step": 68680 }, { "epoch": 82.85938442969221, "grad_norm": 5.273706436157227, "learning_rate": 1.9998345257494266e-05, "loss": 0.0583, "step": 68690 }, { "epoch": 82.87145443572722, "grad_norm": 5.017655372619629, "learning_rate": 1.9998345016239473e-05, "loss": 0.0579, "step": 68700 }, 
{ "epoch": 82.88352444176222, "grad_norm": 5.369320392608643, "learning_rate": 1.9998344774984675e-05, "loss": 0.0602, "step": 68710 }, { "epoch": 82.89559444779722, "grad_norm": 5.468875885009766, "learning_rate": 1.999834453372988e-05, "loss": 0.0593, "step": 68720 }, { "epoch": 82.90766445383223, "grad_norm": 4.971333026885986, "learning_rate": 1.9998344292475088e-05, "loss": 0.0586, "step": 68730 }, { "epoch": 82.91973445986723, "grad_norm": 5.495071887969971, "learning_rate": 1.9998344051220294e-05, "loss": 0.0602, "step": 68740 }, { "epoch": 82.93180446590223, "grad_norm": 5.9874653816223145, "learning_rate": 1.99983438099655e-05, "loss": 0.0613, "step": 68750 }, { "epoch": 82.94387447193724, "grad_norm": 5.382701396942139, "learning_rate": 1.9998343568710706e-05, "loss": 0.0593, "step": 68760 }, { "epoch": 82.95594447797224, "grad_norm": 6.391441345214844, "learning_rate": 1.9998343327455913e-05, "loss": 0.0618, "step": 68770 }, { "epoch": 82.96801448400724, "grad_norm": 5.692450523376465, "learning_rate": 1.999834308620112e-05, "loss": 0.0619, "step": 68780 }, { "epoch": 82.98008449004224, "grad_norm": 5.566617965698242, "learning_rate": 1.9998342844946325e-05, "loss": 0.0578, "step": 68790 }, { "epoch": 82.99215449607725, "grad_norm": 5.006233215332031, "learning_rate": 1.999834260369153e-05, "loss": 0.057, "step": 68800 }, { "epoch": 83.0036210018105, "grad_norm": 4.268877029418945, "learning_rate": 1.9998342362436738e-05, "loss": 0.0538, "step": 68810 }, { "epoch": 83.0156910078455, "grad_norm": 4.896791458129883, "learning_rate": 1.9998342121181944e-05, "loss": 0.0425, "step": 68820 }, { "epoch": 83.02776101388051, "grad_norm": 4.906429767608643, "learning_rate": 1.999834187992715e-05, "loss": 0.0412, "step": 68830 }, { "epoch": 83.03983101991551, "grad_norm": 5.414362907409668, "learning_rate": 1.9998341638672356e-05, "loss": 0.0443, "step": 68840 }, { "epoch": 83.05190102595051, "grad_norm": 4.551591396331787, "learning_rate": 1.9998341397417562e-05, 
"loss": 0.0451, "step": 68850 }, { "epoch": 83.06397103198552, "grad_norm": 4.5759758949279785, "learning_rate": 1.999834115616277e-05, "loss": 0.0464, "step": 68860 }, { "epoch": 83.07604103802052, "grad_norm": 5.031744480133057, "learning_rate": 1.9998340914907975e-05, "loss": 0.0448, "step": 68870 }, { "epoch": 83.08811104405552, "grad_norm": 4.154232501983643, "learning_rate": 1.999834067365318e-05, "loss": 0.049, "step": 68880 }, { "epoch": 83.10018105009053, "grad_norm": 4.548018932342529, "learning_rate": 1.9998340432398387e-05, "loss": 0.0474, "step": 68890 }, { "epoch": 83.11225105612553, "grad_norm": 4.993680477142334, "learning_rate": 1.9998340191143593e-05, "loss": 0.0484, "step": 68900 }, { "epoch": 83.12432106216053, "grad_norm": 4.889462471008301, "learning_rate": 1.99983399498888e-05, "loss": 0.0493, "step": 68910 }, { "epoch": 83.13639106819554, "grad_norm": 5.137351036071777, "learning_rate": 1.9998339708634006e-05, "loss": 0.0483, "step": 68920 }, { "epoch": 83.14846107423054, "grad_norm": 5.058670997619629, "learning_rate": 1.9998339467379212e-05, "loss": 0.0501, "step": 68930 }, { "epoch": 83.16053108026554, "grad_norm": 4.871838092803955, "learning_rate": 1.999833922612442e-05, "loss": 0.0502, "step": 68940 }, { "epoch": 83.17260108630055, "grad_norm": 4.650781631469727, "learning_rate": 1.9998338984869625e-05, "loss": 0.0487, "step": 68950 }, { "epoch": 83.18467109233555, "grad_norm": 4.8576579093933105, "learning_rate": 1.9998338743614827e-05, "loss": 0.0489, "step": 68960 }, { "epoch": 83.19674109837055, "grad_norm": 4.696033477783203, "learning_rate": 1.9998338502360034e-05, "loss": 0.0491, "step": 68970 }, { "epoch": 83.20881110440556, "grad_norm": 4.993026256561279, "learning_rate": 1.9998338261105243e-05, "loss": 0.0523, "step": 68980 }, { "epoch": 83.22088111044056, "grad_norm": 4.755360126495361, "learning_rate": 1.999833801985045e-05, "loss": 0.0483, "step": 68990 }, { "epoch": 83.23295111647556, "grad_norm": 5.074746608734131, 
"learning_rate": 1.9998337778595656e-05, "loss": 0.0501, "step": 69000 }, { "epoch": 83.23295111647556, "eval_loss": 12.585859298706055, "eval_runtime": 8.118, "eval_samples_per_second": 85.859, "eval_steps_per_second": 10.84, "step": 69000 }, { "epoch": 83.24502112251056, "grad_norm": 5.200024127960205, "learning_rate": 1.9998337537340862e-05, "loss": 0.0546, "step": 69010 }, { "epoch": 83.25709112854557, "grad_norm": 4.994756698608398, "learning_rate": 1.9998337296086068e-05, "loss": 0.0513, "step": 69020 }, { "epoch": 83.26916113458057, "grad_norm": 5.054103851318359, "learning_rate": 1.9998337054831274e-05, "loss": 0.0534, "step": 69030 }, { "epoch": 83.28123114061557, "grad_norm": 4.934669017791748, "learning_rate": 1.999833681357648e-05, "loss": 0.0506, "step": 69040 }, { "epoch": 83.29330114665058, "grad_norm": 5.565598964691162, "learning_rate": 1.9998336572321687e-05, "loss": 0.0543, "step": 69050 }, { "epoch": 83.30537115268558, "grad_norm": 5.149264812469482, "learning_rate": 1.9998336331066893e-05, "loss": 0.0529, "step": 69060 }, { "epoch": 83.31744115872058, "grad_norm": 5.300600051879883, "learning_rate": 1.99983360898121e-05, "loss": 0.054, "step": 69070 }, { "epoch": 83.32951116475559, "grad_norm": 5.017077922821045, "learning_rate": 1.9998335848557305e-05, "loss": 0.0542, "step": 69080 }, { "epoch": 83.34158117079059, "grad_norm": 4.914315700531006, "learning_rate": 1.999833560730251e-05, "loss": 0.0526, "step": 69090 }, { "epoch": 83.3536511768256, "grad_norm": 5.064779281616211, "learning_rate": 1.9998335366047718e-05, "loss": 0.0535, "step": 69100 }, { "epoch": 83.3657211828606, "grad_norm": 4.71847677230835, "learning_rate": 1.9998335124792924e-05, "loss": 0.0522, "step": 69110 }, { "epoch": 83.3777911888956, "grad_norm": 5.492131233215332, "learning_rate": 1.999833488353813e-05, "loss": 0.0532, "step": 69120 }, { "epoch": 83.3898611949306, "grad_norm": 4.9501190185546875, "learning_rate": 1.9998334642283333e-05, "loss": 0.0531, "step": 69130 
}, { "epoch": 83.4019312009656, "grad_norm": 4.770423412322998, "learning_rate": 1.999833440102854e-05, "loss": 0.0535, "step": 69140 }, { "epoch": 83.41400120700061, "grad_norm": 4.945001125335693, "learning_rate": 1.9998334159773745e-05, "loss": 0.0527, "step": 69150 }, { "epoch": 83.42607121303561, "grad_norm": 5.13511323928833, "learning_rate": 1.9998333918518952e-05, "loss": 0.0531, "step": 69160 }, { "epoch": 83.43814121907062, "grad_norm": 6.069188594818115, "learning_rate": 1.9998333677264158e-05, "loss": 0.0554, "step": 69170 }, { "epoch": 83.45021122510562, "grad_norm": 5.6572957038879395, "learning_rate": 1.9998333436009364e-05, "loss": 0.0545, "step": 69180 }, { "epoch": 83.46228123114062, "grad_norm": 4.924929618835449, "learning_rate": 1.999833319475457e-05, "loss": 0.055, "step": 69190 }, { "epoch": 83.47435123717563, "grad_norm": 4.934643745422363, "learning_rate": 1.9998332953499777e-05, "loss": 0.0533, "step": 69200 }, { "epoch": 83.48642124321063, "grad_norm": 4.797537803649902, "learning_rate": 1.9998332712244983e-05, "loss": 0.0557, "step": 69210 }, { "epoch": 83.49849124924563, "grad_norm": 5.453699111938477, "learning_rate": 1.999833247099019e-05, "loss": 0.0531, "step": 69220 }, { "epoch": 83.51056125528062, "grad_norm": 5.176029682159424, "learning_rate": 1.9998332229735395e-05, "loss": 0.0533, "step": 69230 }, { "epoch": 83.52263126131562, "grad_norm": 5.2503228187561035, "learning_rate": 1.99983319884806e-05, "loss": 0.0567, "step": 69240 }, { "epoch": 83.53470126735063, "grad_norm": 5.236601829528809, "learning_rate": 1.9998331747225808e-05, "loss": 0.0558, "step": 69250 }, { "epoch": 83.54677127338563, "grad_norm": 5.092426776885986, "learning_rate": 1.9998331505971014e-05, "loss": 0.0555, "step": 69260 }, { "epoch": 83.55884127942063, "grad_norm": 4.7240471839904785, "learning_rate": 1.999833126471622e-05, "loss": 0.0544, "step": 69270 }, { "epoch": 83.57091128545564, "grad_norm": 5.7743821144104, "learning_rate": 
1.9998331023461426e-05, "loss": 0.053, "step": 69280 }, { "epoch": 83.58298129149064, "grad_norm": 5.391125679016113, "learning_rate": 1.9998330782206633e-05, "loss": 0.0569, "step": 69290 }, { "epoch": 83.59505129752564, "grad_norm": 4.9393463134765625, "learning_rate": 1.999833054095184e-05, "loss": 0.0555, "step": 69300 }, { "epoch": 83.60712130356065, "grad_norm": 5.256810188293457, "learning_rate": 1.9998330299697045e-05, "loss": 0.056, "step": 69310 }, { "epoch": 83.61919130959565, "grad_norm": 4.64094352722168, "learning_rate": 1.999833005844225e-05, "loss": 0.0535, "step": 69320 }, { "epoch": 83.63126131563065, "grad_norm": 5.281157970428467, "learning_rate": 1.9998329817187457e-05, "loss": 0.0561, "step": 69330 }, { "epoch": 83.64333132166566, "grad_norm": 5.693446636199951, "learning_rate": 1.9998329575932664e-05, "loss": 0.0542, "step": 69340 }, { "epoch": 83.65540132770066, "grad_norm": 5.354091644287109, "learning_rate": 1.999832933467787e-05, "loss": 0.0567, "step": 69350 }, { "epoch": 83.66747133373566, "grad_norm": 5.696505069732666, "learning_rate": 1.9998329093423076e-05, "loss": 0.0567, "step": 69360 }, { "epoch": 83.67954133977067, "grad_norm": 5.771193981170654, "learning_rate": 1.9998328852168282e-05, "loss": 0.0589, "step": 69370 }, { "epoch": 83.69161134580567, "grad_norm": 4.958669662475586, "learning_rate": 1.9998328610913485e-05, "loss": 0.0557, "step": 69380 }, { "epoch": 83.70368135184067, "grad_norm": 5.425338268280029, "learning_rate": 1.999832836965869e-05, "loss": 0.0592, "step": 69390 }, { "epoch": 83.71575135787567, "grad_norm": 5.773736953735352, "learning_rate": 1.9998328128403897e-05, "loss": 0.0563, "step": 69400 }, { "epoch": 83.72782136391068, "grad_norm": 5.611748218536377, "learning_rate": 1.9998327887149104e-05, "loss": 0.059, "step": 69410 }, { "epoch": 83.73989136994568, "grad_norm": 5.550783634185791, "learning_rate": 1.999832764589431e-05, "loss": 0.0597, "step": 69420 }, { "epoch": 83.75196137598068, "grad_norm": 
4.681576251983643, "learning_rate": 1.9998327404639516e-05, "loss": 0.0593, "step": 69430 }, { "epoch": 83.76403138201569, "grad_norm": 5.081993103027344, "learning_rate": 1.9998327163384722e-05, "loss": 0.0557, "step": 69440 }, { "epoch": 83.77610138805069, "grad_norm": 5.447761535644531, "learning_rate": 1.999832692212993e-05, "loss": 0.0555, "step": 69450 }, { "epoch": 83.7881713940857, "grad_norm": 4.479170322418213, "learning_rate": 1.9998326680875135e-05, "loss": 0.0548, "step": 69460 }, { "epoch": 83.8002414001207, "grad_norm": 5.174189567565918, "learning_rate": 1.999832643962034e-05, "loss": 0.0556, "step": 69470 }, { "epoch": 83.8123114061557, "grad_norm": 5.448447227478027, "learning_rate": 1.9998326198365547e-05, "loss": 0.0602, "step": 69480 }, { "epoch": 83.8243814121907, "grad_norm": 5.603274822235107, "learning_rate": 1.9998325957110753e-05, "loss": 0.0589, "step": 69490 }, { "epoch": 83.8364514182257, "grad_norm": 5.1630940437316895, "learning_rate": 1.999832571585596e-05, "loss": 0.06, "step": 69500 }, { "epoch": 83.8364514182257, "eval_loss": 12.586185455322266, "eval_runtime": 8.1319, "eval_samples_per_second": 85.712, "eval_steps_per_second": 10.822, "step": 69500 }, { "epoch": 83.84852142426071, "grad_norm": 5.643831729888916, "learning_rate": 1.9998325474601166e-05, "loss": 0.0594, "step": 69510 }, { "epoch": 83.86059143029571, "grad_norm": 4.7727155685424805, "learning_rate": 1.9998325233346375e-05, "loss": 0.0568, "step": 69520 }, { "epoch": 83.87266143633072, "grad_norm": 5.4369425773620605, "learning_rate": 1.999832499209158e-05, "loss": 0.0566, "step": 69530 }, { "epoch": 83.88473144236572, "grad_norm": 5.295812129974365, "learning_rate": 1.9998324750836785e-05, "loss": 0.0588, "step": 69540 }, { "epoch": 83.89680144840072, "grad_norm": 5.544506549835205, "learning_rate": 1.999832450958199e-05, "loss": 0.0582, "step": 69550 }, { "epoch": 83.90887145443573, "grad_norm": 5.909781455993652, "learning_rate": 1.9998324268327197e-05, "loss": 
0.0595, "step": 69560 }, { "epoch": 83.92094146047073, "grad_norm": 5.556373119354248, "learning_rate": 1.9998324027072403e-05, "loss": 0.0627, "step": 69570 }, { "epoch": 83.93301146650573, "grad_norm": 5.435447692871094, "learning_rate": 1.999832378581761e-05, "loss": 0.0634, "step": 69580 }, { "epoch": 83.94508147254074, "grad_norm": 5.07047700881958, "learning_rate": 1.9998323544562816e-05, "loss": 0.0599, "step": 69590 }, { "epoch": 83.95715147857574, "grad_norm": 5.822970867156982, "learning_rate": 1.9998323303308022e-05, "loss": 0.0585, "step": 69600 }, { "epoch": 83.96922148461074, "grad_norm": 5.145660877227783, "learning_rate": 1.9998323062053228e-05, "loss": 0.0587, "step": 69610 }, { "epoch": 83.98129149064575, "grad_norm": 4.946815490722656, "learning_rate": 1.9998322820798434e-05, "loss": 0.0589, "step": 69620 }, { "epoch": 83.99336149668075, "grad_norm": 5.366283416748047, "learning_rate": 1.999832257954364e-05, "loss": 0.0597, "step": 69630 }, { "epoch": 84.004828002414, "grad_norm": 4.56962251663208, "learning_rate": 1.9998322338288847e-05, "loss": 0.054, "step": 69640 }, { "epoch": 84.016898008449, "grad_norm": 4.386937141418457, "learning_rate": 1.9998322097034053e-05, "loss": 0.039, "step": 69650 }, { "epoch": 84.02896801448401, "grad_norm": 4.0451340675354, "learning_rate": 1.999832185577926e-05, "loss": 0.0413, "step": 69660 }, { "epoch": 84.04103802051901, "grad_norm": 4.688874244689941, "learning_rate": 1.9998321614524465e-05, "loss": 0.0444, "step": 69670 }, { "epoch": 84.05310802655401, "grad_norm": 4.098537921905518, "learning_rate": 1.999832137326967e-05, "loss": 0.0449, "step": 69680 }, { "epoch": 84.06517803258902, "grad_norm": 5.303950309753418, "learning_rate": 1.9998321132014878e-05, "loss": 0.0474, "step": 69690 }, { "epoch": 84.07724803862402, "grad_norm": 4.181321620941162, "learning_rate": 1.9998320890760084e-05, "loss": 0.0461, "step": 69700 }, { "epoch": 84.08931804465902, "grad_norm": 4.810417652130127, "learning_rate": 
1.999832064950529e-05, "loss": 0.0464, "step": 69710 }, { "epoch": 84.10138805069403, "grad_norm": 4.662810802459717, "learning_rate": 1.9998320408250496e-05, "loss": 0.0477, "step": 69720 }, { "epoch": 84.11345805672903, "grad_norm": 4.591186046600342, "learning_rate": 1.9998320166995703e-05, "loss": 0.0485, "step": 69730 }, { "epoch": 84.12552806276403, "grad_norm": 4.7648420333862305, "learning_rate": 1.999831992574091e-05, "loss": 0.0473, "step": 69740 }, { "epoch": 84.13759806879904, "grad_norm": 4.296531677246094, "learning_rate": 1.9998319684486115e-05, "loss": 0.05, "step": 69750 }, { "epoch": 84.14966807483404, "grad_norm": 4.778243064880371, "learning_rate": 1.999831944323132e-05, "loss": 0.0472, "step": 69760 }, { "epoch": 84.16173808086904, "grad_norm": 5.0329060554504395, "learning_rate": 1.9998319201976527e-05, "loss": 0.0472, "step": 69770 }, { "epoch": 84.17380808690405, "grad_norm": 4.954878807067871, "learning_rate": 1.9998318960721734e-05, "loss": 0.0499, "step": 69780 }, { "epoch": 84.18587809293905, "grad_norm": 4.486057758331299, "learning_rate": 1.9998318719466937e-05, "loss": 0.0482, "step": 69790 }, { "epoch": 84.19794809897405, "grad_norm": 5.4357075691223145, "learning_rate": 1.9998318478212143e-05, "loss": 0.0498, "step": 69800 }, { "epoch": 84.21001810500906, "grad_norm": 5.108859062194824, "learning_rate": 1.999831823695735e-05, "loss": 0.0487, "step": 69810 }, { "epoch": 84.22208811104406, "grad_norm": 4.470118999481201, "learning_rate": 1.9998317995702555e-05, "loss": 0.0478, "step": 69820 }, { "epoch": 84.23415811707906, "grad_norm": 5.484651565551758, "learning_rate": 1.999831775444776e-05, "loss": 0.0516, "step": 69830 }, { "epoch": 84.24622812311407, "grad_norm": 4.7272748947143555, "learning_rate": 1.9998317513192968e-05, "loss": 0.0488, "step": 69840 }, { "epoch": 84.25829812914907, "grad_norm": 4.976243495941162, "learning_rate": 1.9998317271938174e-05, "loss": 0.0499, "step": 69850 }, { "epoch": 84.27036813518407, 
"grad_norm": 5.111591339111328, "learning_rate": 1.999831703068338e-05, "loss": 0.0509, "step": 69860 }, { "epoch": 84.28243814121907, "grad_norm": 4.988166332244873, "learning_rate": 1.9998316789428586e-05, "loss": 0.0506, "step": 69870 }, { "epoch": 84.29450814725408, "grad_norm": 5.033985137939453, "learning_rate": 1.9998316548173792e-05, "loss": 0.0524, "step": 69880 }, { "epoch": 84.30657815328908, "grad_norm": 5.130568027496338, "learning_rate": 1.9998316306919e-05, "loss": 0.0499, "step": 69890 }, { "epoch": 84.31864815932408, "grad_norm": 5.386241912841797, "learning_rate": 1.9998316065664205e-05, "loss": 0.052, "step": 69900 }, { "epoch": 84.33071816535909, "grad_norm": 5.090318202972412, "learning_rate": 1.999831582440941e-05, "loss": 0.0539, "step": 69910 }, { "epoch": 84.34278817139409, "grad_norm": 5.0616607666015625, "learning_rate": 1.9998315583154617e-05, "loss": 0.0535, "step": 69920 }, { "epoch": 84.3548581774291, "grad_norm": 4.664544582366943, "learning_rate": 1.9998315341899824e-05, "loss": 0.0523, "step": 69930 }, { "epoch": 84.3669281834641, "grad_norm": 4.7885613441467285, "learning_rate": 1.999831510064503e-05, "loss": 0.0518, "step": 69940 }, { "epoch": 84.3789981894991, "grad_norm": 4.850630760192871, "learning_rate": 1.9998314859390236e-05, "loss": 0.0523, "step": 69950 }, { "epoch": 84.3910681955341, "grad_norm": 5.762831687927246, "learning_rate": 1.9998314618135442e-05, "loss": 0.0536, "step": 69960 }, { "epoch": 84.4031382015691, "grad_norm": 5.283523082733154, "learning_rate": 1.999831437688065e-05, "loss": 0.0537, "step": 69970 }, { "epoch": 84.41520820760411, "grad_norm": 4.944746017456055, "learning_rate": 1.9998314135625855e-05, "loss": 0.0536, "step": 69980 }, { "epoch": 84.42727821363911, "grad_norm": 5.147722244262695, "learning_rate": 1.999831389437106e-05, "loss": 0.0534, "step": 69990 }, { "epoch": 84.43934821967412, "grad_norm": 4.716782569885254, "learning_rate": 1.9998313653116267e-05, "loss": 0.0511, "step": 70000 }, { 
"epoch": 84.43934821967412, "eval_loss": 12.594963073730469, "eval_runtime": 8.1274, "eval_samples_per_second": 85.759, "eval_steps_per_second": 10.828, "step": 70000 }, { "epoch": 84.45141822570912, "grad_norm": 5.040719985961914, "learning_rate": 1.9998313411861473e-05, "loss": 0.053, "step": 70010 }, { "epoch": 84.46348823174412, "grad_norm": 4.650675296783447, "learning_rate": 1.999831317060668e-05, "loss": 0.0516, "step": 70020 }, { "epoch": 84.47555823777913, "grad_norm": 5.464195728302002, "learning_rate": 1.9998312929351886e-05, "loss": 0.0552, "step": 70030 }, { "epoch": 84.48762824381413, "grad_norm": 5.16123104095459, "learning_rate": 1.999831268809709e-05, "loss": 0.0538, "step": 70040 }, { "epoch": 84.49969824984913, "grad_norm": 4.631096363067627, "learning_rate": 1.9998312446842295e-05, "loss": 0.0532, "step": 70050 }, { "epoch": 84.51176825588412, "grad_norm": 5.495206832885742, "learning_rate": 1.9998312205587504e-05, "loss": 0.0557, "step": 70060 }, { "epoch": 84.52383826191912, "grad_norm": 5.351572513580322, "learning_rate": 1.999831196433271e-05, "loss": 0.0547, "step": 70070 }, { "epoch": 84.53590826795413, "grad_norm": 5.319433212280273, "learning_rate": 1.9998311723077917e-05, "loss": 0.0556, "step": 70080 }, { "epoch": 84.54797827398913, "grad_norm": 5.189785957336426, "learning_rate": 1.9998311481823123e-05, "loss": 0.0553, "step": 70090 }, { "epoch": 84.56004828002413, "grad_norm": 4.896689414978027, "learning_rate": 1.999831124056833e-05, "loss": 0.0545, "step": 70100 }, { "epoch": 84.57211828605914, "grad_norm": 5.364083290100098, "learning_rate": 1.9998310999313535e-05, "loss": 0.0549, "step": 70110 }, { "epoch": 84.58418829209414, "grad_norm": 4.78892707824707, "learning_rate": 1.999831075805874e-05, "loss": 0.0551, "step": 70120 }, { "epoch": 84.59625829812914, "grad_norm": 5.151998996734619, "learning_rate": 1.9998310516803948e-05, "loss": 0.0569, "step": 70130 }, { "epoch": 84.60832830416415, "grad_norm": 4.9591851234436035, 
"learning_rate": 1.9998310275549154e-05, "loss": 0.0567, "step": 70140 }, { "epoch": 84.62039831019915, "grad_norm": 5.320375442504883, "learning_rate": 1.999831003429436e-05, "loss": 0.0526, "step": 70150 }, { "epoch": 84.63246831623415, "grad_norm": 5.300334930419922, "learning_rate": 1.9998309793039566e-05, "loss": 0.0522, "step": 70160 }, { "epoch": 84.64453832226916, "grad_norm": 4.891429901123047, "learning_rate": 1.9998309551784773e-05, "loss": 0.0538, "step": 70170 }, { "epoch": 84.65660832830416, "grad_norm": 5.750734806060791, "learning_rate": 1.999830931052998e-05, "loss": 0.0568, "step": 70180 }, { "epoch": 84.66867833433916, "grad_norm": 5.455581188201904, "learning_rate": 1.9998309069275185e-05, "loss": 0.056, "step": 70190 }, { "epoch": 84.68074834037417, "grad_norm": 5.142576217651367, "learning_rate": 1.999830882802039e-05, "loss": 0.0564, "step": 70200 }, { "epoch": 84.69281834640917, "grad_norm": 5.220257759094238, "learning_rate": 1.9998308586765594e-05, "loss": 0.0577, "step": 70210 }, { "epoch": 84.70488835244417, "grad_norm": 6.033132076263428, "learning_rate": 1.99983083455108e-05, "loss": 0.0569, "step": 70220 }, { "epoch": 84.71695835847918, "grad_norm": 5.543970108032227, "learning_rate": 1.9998308104256007e-05, "loss": 0.0588, "step": 70230 }, { "epoch": 84.72902836451418, "grad_norm": 5.390600204467773, "learning_rate": 1.9998307863001213e-05, "loss": 0.0596, "step": 70240 }, { "epoch": 84.74109837054918, "grad_norm": 5.698572635650635, "learning_rate": 1.999830762174642e-05, "loss": 0.0582, "step": 70250 }, { "epoch": 84.75316837658418, "grad_norm": 5.229294776916504, "learning_rate": 1.9998307380491625e-05, "loss": 0.055, "step": 70260 }, { "epoch": 84.76523838261919, "grad_norm": 5.1724019050598145, "learning_rate": 1.999830713923683e-05, "loss": 0.0566, "step": 70270 }, { "epoch": 84.77730838865419, "grad_norm": 5.873648166656494, "learning_rate": 1.9998306897982038e-05, "loss": 0.0595, "step": 70280 }, { "epoch": 84.7893783946892, 
"grad_norm": 5.5429229736328125, "learning_rate": 1.9998306656727244e-05, "loss": 0.057, "step": 70290 }, { "epoch": 84.8014484007242, "grad_norm": 5.468005657196045, "learning_rate": 1.999830641547245e-05, "loss": 0.0577, "step": 70300 }, { "epoch": 84.8135184067592, "grad_norm": 4.953578948974609, "learning_rate": 1.9998306174217656e-05, "loss": 0.057, "step": 70310 }, { "epoch": 84.8255884127942, "grad_norm": 5.058003902435303, "learning_rate": 1.9998305932962863e-05, "loss": 0.0578, "step": 70320 }, { "epoch": 84.83765841882921, "grad_norm": 5.037931442260742, "learning_rate": 1.999830569170807e-05, "loss": 0.0589, "step": 70330 }, { "epoch": 84.84972842486421, "grad_norm": 5.392440319061279, "learning_rate": 1.9998305450453275e-05, "loss": 0.0568, "step": 70340 }, { "epoch": 84.86179843089921, "grad_norm": 5.2247724533081055, "learning_rate": 1.999830520919848e-05, "loss": 0.0602, "step": 70350 }, { "epoch": 84.87386843693422, "grad_norm": 5.2308149337768555, "learning_rate": 1.9998304967943687e-05, "loss": 0.0585, "step": 70360 }, { "epoch": 84.88593844296922, "grad_norm": 5.276071548461914, "learning_rate": 1.9998304726688894e-05, "loss": 0.0562, "step": 70370 }, { "epoch": 84.89800844900422, "grad_norm": 5.683687210083008, "learning_rate": 1.99983044854341e-05, "loss": 0.0588, "step": 70380 }, { "epoch": 84.91007845503923, "grad_norm": 5.405019760131836, "learning_rate": 1.9998304244179306e-05, "loss": 0.0591, "step": 70390 }, { "epoch": 84.92214846107423, "grad_norm": 5.92366886138916, "learning_rate": 1.9998304002924512e-05, "loss": 0.0607, "step": 70400 }, { "epoch": 84.93421846710923, "grad_norm": 5.264595031738281, "learning_rate": 1.999830376166972e-05, "loss": 0.0602, "step": 70410 }, { "epoch": 84.94628847314424, "grad_norm": 5.612883567810059, "learning_rate": 1.9998303520414925e-05, "loss": 0.0586, "step": 70420 }, { "epoch": 84.95835847917924, "grad_norm": 4.970304012298584, "learning_rate": 1.999830327916013e-05, "loss": 0.0585, "step": 70430 }, 
{ "epoch": 84.97042848521424, "grad_norm": 4.712219715118408, "learning_rate": 1.9998303037905337e-05, "loss": 0.0579, "step": 70440 }, { "epoch": 84.98249849124925, "grad_norm": 5.3078107833862305, "learning_rate": 1.9998302796650543e-05, "loss": 0.0585, "step": 70450 }, { "epoch": 84.99456849728425, "grad_norm": 5.709197521209717, "learning_rate": 1.9998302555395746e-05, "loss": 0.0628, "step": 70460 }, { "epoch": 85.0060350030175, "grad_norm": 4.2268829345703125, "learning_rate": 1.9998302314140952e-05, "loss": 0.0502, "step": 70470 }, { "epoch": 85.0181050090525, "grad_norm": 4.708034515380859, "learning_rate": 1.999830207288616e-05, "loss": 0.041, "step": 70480 }, { "epoch": 85.03017501508751, "grad_norm": 4.8753461837768555, "learning_rate": 1.9998301831631365e-05, "loss": 0.0435, "step": 70490 }, { "epoch": 85.04224502112251, "grad_norm": 4.760623931884766, "learning_rate": 1.999830159037657e-05, "loss": 0.0453, "step": 70500 }, { "epoch": 85.04224502112251, "eval_loss": 12.593851089477539, "eval_runtime": 8.1276, "eval_samples_per_second": 85.757, "eval_steps_per_second": 10.827, "step": 70500 }, { "epoch": 85.05431502715751, "grad_norm": 4.943490028381348, "learning_rate": 1.9998301349121777e-05, "loss": 0.0458, "step": 70510 }, { "epoch": 85.06638503319252, "grad_norm": 4.74222993850708, "learning_rate": 1.9998301107866983e-05, "loss": 0.044, "step": 70520 }, { "epoch": 85.07845503922752, "grad_norm": 5.163647651672363, "learning_rate": 1.999830086661219e-05, "loss": 0.0465, "step": 70530 }, { "epoch": 85.09052504526252, "grad_norm": 4.843979835510254, "learning_rate": 1.9998300625357396e-05, "loss": 0.0461, "step": 70540 }, { "epoch": 85.10259505129753, "grad_norm": 4.992773532867432, "learning_rate": 1.9998300384102602e-05, "loss": 0.0491, "step": 70550 }, { "epoch": 85.11466505733253, "grad_norm": 4.818084239959717, "learning_rate": 1.999830014284781e-05, "loss": 0.0479, "step": 70560 }, { "epoch": 85.12673506336753, "grad_norm": 4.857024669647217, 
"learning_rate": 1.9998299901593015e-05, "loss": 0.0494, "step": 70570 }, { "epoch": 85.13880506940254, "grad_norm": 4.561071872711182, "learning_rate": 1.999829966033822e-05, "loss": 0.0503, "step": 70580 }, { "epoch": 85.15087507543754, "grad_norm": 5.175675392150879, "learning_rate": 1.9998299419083427e-05, "loss": 0.0494, "step": 70590 }, { "epoch": 85.16294508147254, "grad_norm": 4.700107574462891, "learning_rate": 1.9998299177828637e-05, "loss": 0.0503, "step": 70600 }, { "epoch": 85.17501508750755, "grad_norm": 4.530763626098633, "learning_rate": 1.9998298936573843e-05, "loss": 0.0492, "step": 70610 }, { "epoch": 85.18708509354255, "grad_norm": 4.639326572418213, "learning_rate": 1.9998298695319046e-05, "loss": 0.049, "step": 70620 }, { "epoch": 85.19915509957755, "grad_norm": 4.87137508392334, "learning_rate": 1.9998298454064252e-05, "loss": 0.0512, "step": 70630 }, { "epoch": 85.21122510561256, "grad_norm": 4.766262054443359, "learning_rate": 1.9998298212809458e-05, "loss": 0.048, "step": 70640 }, { "epoch": 85.22329511164756, "grad_norm": 4.633093357086182, "learning_rate": 1.9998297971554664e-05, "loss": 0.0486, "step": 70650 }, { "epoch": 85.23536511768256, "grad_norm": 4.583652973175049, "learning_rate": 1.999829773029987e-05, "loss": 0.049, "step": 70660 }, { "epoch": 85.24743512371757, "grad_norm": 5.188004970550537, "learning_rate": 1.9998297489045077e-05, "loss": 0.0496, "step": 70670 }, { "epoch": 85.25950512975257, "grad_norm": 4.777129650115967, "learning_rate": 1.9998297247790283e-05, "loss": 0.0507, "step": 70680 }, { "epoch": 85.27157513578757, "grad_norm": 4.851781368255615, "learning_rate": 1.999829700653549e-05, "loss": 0.0506, "step": 70690 }, { "epoch": 85.28364514182257, "grad_norm": 5.034247875213623, "learning_rate": 1.9998296765280695e-05, "loss": 0.0499, "step": 70700 }, { "epoch": 85.29571514785758, "grad_norm": 4.829607963562012, "learning_rate": 1.99982965240259e-05, "loss": 0.052, "step": 70710 }, { "epoch": 85.30778515389258, 
"grad_norm": 5.297936916351318, "learning_rate": 1.9998296282771108e-05, "loss": 0.0514, "step": 70720 }, { "epoch": 85.31985515992758, "grad_norm": 4.8175835609436035, "learning_rate": 1.9998296041516314e-05, "loss": 0.0519, "step": 70730 }, { "epoch": 85.33192516596259, "grad_norm": 4.6177659034729, "learning_rate": 1.999829580026152e-05, "loss": 0.0521, "step": 70740 }, { "epoch": 85.34399517199759, "grad_norm": 4.837204933166504, "learning_rate": 1.9998295559006726e-05, "loss": 0.0504, "step": 70750 }, { "epoch": 85.3560651780326, "grad_norm": 5.09120512008667, "learning_rate": 1.9998295317751933e-05, "loss": 0.0508, "step": 70760 }, { "epoch": 85.3681351840676, "grad_norm": 4.865938186645508, "learning_rate": 1.999829507649714e-05, "loss": 0.0512, "step": 70770 }, { "epoch": 85.3802051901026, "grad_norm": 5.202056407928467, "learning_rate": 1.9998294835242345e-05, "loss": 0.0511, "step": 70780 }, { "epoch": 85.3922751961376, "grad_norm": 5.499136924743652, "learning_rate": 1.999829459398755e-05, "loss": 0.0533, "step": 70790 }, { "epoch": 85.4043452021726, "grad_norm": 5.093447208404541, "learning_rate": 1.9998294352732757e-05, "loss": 0.0512, "step": 70800 }, { "epoch": 85.41641520820761, "grad_norm": 5.132645606994629, "learning_rate": 1.9998294111477964e-05, "loss": 0.0545, "step": 70810 }, { "epoch": 85.42848521424261, "grad_norm": 5.277790069580078, "learning_rate": 1.999829387022317e-05, "loss": 0.0546, "step": 70820 }, { "epoch": 85.44055522027762, "grad_norm": 4.897423267364502, "learning_rate": 1.9998293628968376e-05, "loss": 0.0521, "step": 70830 }, { "epoch": 85.45262522631262, "grad_norm": 5.207908630371094, "learning_rate": 1.9998293387713582e-05, "loss": 0.052, "step": 70840 }, { "epoch": 85.46469523234762, "grad_norm": 5.144831657409668, "learning_rate": 1.999829314645879e-05, "loss": 0.0521, "step": 70850 }, { "epoch": 85.47676523838263, "grad_norm": 5.225340843200684, "learning_rate": 1.9998292905203995e-05, "loss": 0.0521, "step": 70860 }, { 
"epoch": 85.48883524441763, "grad_norm": 5.358889102935791, "learning_rate": 1.9998292663949198e-05, "loss": 0.0533, "step": 70870 }, { "epoch": 85.50090525045263, "grad_norm": 5.0970025062561035, "learning_rate": 1.9998292422694404e-05, "loss": 0.0536, "step": 70880 }, { "epoch": 85.51297525648762, "grad_norm": 5.581485748291016, "learning_rate": 1.999829218143961e-05, "loss": 0.0533, "step": 70890 }, { "epoch": 85.52504526252262, "grad_norm": 5.13602352142334, "learning_rate": 1.9998291940184816e-05, "loss": 0.0539, "step": 70900 }, { "epoch": 85.53711526855763, "grad_norm": 5.081990718841553, "learning_rate": 1.9998291698930022e-05, "loss": 0.0536, "step": 70910 }, { "epoch": 85.54918527459263, "grad_norm": 5.206764221191406, "learning_rate": 1.999829145767523e-05, "loss": 0.056, "step": 70920 }, { "epoch": 85.56125528062763, "grad_norm": 5.214202404022217, "learning_rate": 1.9998291216420435e-05, "loss": 0.0549, "step": 70930 }, { "epoch": 85.57332528666264, "grad_norm": 5.190173625946045, "learning_rate": 1.999829097516564e-05, "loss": 0.0535, "step": 70940 }, { "epoch": 85.58539529269764, "grad_norm": 4.979319095611572, "learning_rate": 1.9998290733910847e-05, "loss": 0.0552, "step": 70950 }, { "epoch": 85.59746529873264, "grad_norm": 5.276096820831299, "learning_rate": 1.9998290492656054e-05, "loss": 0.054, "step": 70960 }, { "epoch": 85.60953530476765, "grad_norm": 5.932394027709961, "learning_rate": 1.999829025140126e-05, "loss": 0.0547, "step": 70970 }, { "epoch": 85.62160531080265, "grad_norm": 4.881147384643555, "learning_rate": 1.9998290010146466e-05, "loss": 0.0541, "step": 70980 }, { "epoch": 85.63367531683765, "grad_norm": 6.01585054397583, "learning_rate": 1.9998289768891672e-05, "loss": 0.0556, "step": 70990 }, { "epoch": 85.64574532287266, "grad_norm": 4.954843997955322, "learning_rate": 1.999828952763688e-05, "loss": 0.0551, "step": 71000 }, { "epoch": 85.64574532287266, "eval_loss": 12.613746643066406, "eval_runtime": 8.1297, 
"eval_samples_per_second": 85.735, "eval_steps_per_second": 10.824, "step": 71000 }, { "epoch": 85.65781532890766, "grad_norm": 5.131587982177734, "learning_rate": 1.9998289286382085e-05, "loss": 0.0524, "step": 71010 }, { "epoch": 85.66988533494266, "grad_norm": 5.069258213043213, "learning_rate": 1.999828904512729e-05, "loss": 0.0569, "step": 71020 }, { "epoch": 85.68195534097767, "grad_norm": 5.017118453979492, "learning_rate": 1.9998288803872497e-05, "loss": 0.0561, "step": 71030 }, { "epoch": 85.69402534701267, "grad_norm": 5.477418899536133, "learning_rate": 1.9998288562617703e-05, "loss": 0.0568, "step": 71040 }, { "epoch": 85.70609535304767, "grad_norm": 4.49566650390625, "learning_rate": 1.999828832136291e-05, "loss": 0.0542, "step": 71050 }, { "epoch": 85.71816535908268, "grad_norm": 5.0609893798828125, "learning_rate": 1.9998288080108116e-05, "loss": 0.0552, "step": 71060 }, { "epoch": 85.73023536511768, "grad_norm": 4.838519096374512, "learning_rate": 1.9998287838853322e-05, "loss": 0.0548, "step": 71070 }, { "epoch": 85.74230537115268, "grad_norm": 5.185921669006348, "learning_rate": 1.9998287597598528e-05, "loss": 0.0559, "step": 71080 }, { "epoch": 85.75437537718769, "grad_norm": 5.734113693237305, "learning_rate": 1.9998287356343734e-05, "loss": 0.0569, "step": 71090 }, { "epoch": 85.76644538322269, "grad_norm": 5.434546947479248, "learning_rate": 1.999828711508894e-05, "loss": 0.0574, "step": 71100 }, { "epoch": 85.77851538925769, "grad_norm": 5.2566118240356445, "learning_rate": 1.9998286873834147e-05, "loss": 0.0556, "step": 71110 }, { "epoch": 85.7905853952927, "grad_norm": 4.975744247436523, "learning_rate": 1.999828663257935e-05, "loss": 0.0563, "step": 71120 }, { "epoch": 85.8026554013277, "grad_norm": 5.61521053314209, "learning_rate": 1.9998286391324556e-05, "loss": 0.0573, "step": 71130 }, { "epoch": 85.8147254073627, "grad_norm": 5.665817737579346, "learning_rate": 1.9998286150069765e-05, "loss": 0.058, "step": 71140 }, { "epoch": 
85.8267954133977, "grad_norm": 5.455501079559326, "learning_rate": 1.999828590881497e-05, "loss": 0.0615, "step": 71150 }, { "epoch": 85.83886541943271, "grad_norm": null, "learning_rate": 1.9998285667560178e-05, "loss": 0.0589, "step": 71160 }, { "epoch": 85.85093542546771, "grad_norm": 5.935067176818848, "learning_rate": 1.9998285426305384e-05, "loss": 0.0571, "step": 71170 }, { "epoch": 85.86300543150271, "grad_norm": 5.623991966247559, "learning_rate": 1.999828518505059e-05, "loss": 0.0588, "step": 71180 }, { "epoch": 85.87507543753772, "grad_norm": 5.509769916534424, "learning_rate": 1.9998284943795796e-05, "loss": 0.059, "step": 71190 }, { "epoch": 85.88714544357272, "grad_norm": 5.956339359283447, "learning_rate": 1.9998284702541003e-05, "loss": 0.0584, "step": 71200 }, { "epoch": 85.89921544960772, "grad_norm": 4.871735095977783, "learning_rate": 1.999828446128621e-05, "loss": 0.0573, "step": 71210 }, { "epoch": 85.91128545564273, "grad_norm": 5.423430919647217, "learning_rate": 1.9998284220031415e-05, "loss": 0.0604, "step": 71220 }, { "epoch": 85.92335546167773, "grad_norm": 5.299644947052002, "learning_rate": 1.999828397877662e-05, "loss": 0.0574, "step": 71230 }, { "epoch": 85.93542546771273, "grad_norm": 4.9329938888549805, "learning_rate": 1.9998283737521828e-05, "loss": 0.0581, "step": 71240 }, { "epoch": 85.94749547374774, "grad_norm": 5.219168663024902, "learning_rate": 1.9998283496267034e-05, "loss": 0.0577, "step": 71250 }, { "epoch": 85.95956547978274, "grad_norm": 5.312048435211182, "learning_rate": 1.999828325501224e-05, "loss": 0.0575, "step": 71260 }, { "epoch": 85.97163548581774, "grad_norm": 5.160652160644531, "learning_rate": 1.9998283013757446e-05, "loss": 0.06, "step": 71270 }, { "epoch": 85.98370549185275, "grad_norm": 5.08075475692749, "learning_rate": 1.999828277250265e-05, "loss": 0.0612, "step": 71280 }, { "epoch": 85.99577549788775, "grad_norm": 5.499906063079834, "learning_rate": 1.9998282531247855e-05, "loss": 0.0591, 
"step": 71290 }, { "epoch": 86.007242003621, "grad_norm": 4.737781524658203, "learning_rate": 1.999828228999306e-05, "loss": 0.0477, "step": 71300 }, { "epoch": 86.019312009656, "grad_norm": 4.696320533752441, "learning_rate": 1.9998282048738268e-05, "loss": 0.0407, "step": 71310 }, { "epoch": 86.03138201569101, "grad_norm": 4.156368732452393, "learning_rate": 1.9998281807483474e-05, "loss": 0.0427, "step": 71320 }, { "epoch": 86.04345202172601, "grad_norm": 4.667436122894287, "learning_rate": 1.999828156622868e-05, "loss": 0.0455, "step": 71330 }, { "epoch": 86.05552202776101, "grad_norm": 4.726617336273193, "learning_rate": 1.9998281324973886e-05, "loss": 0.0458, "step": 71340 }, { "epoch": 86.06759203379602, "grad_norm": 4.355136394500732, "learning_rate": 1.9998281083719093e-05, "loss": 0.0458, "step": 71350 }, { "epoch": 86.07966203983102, "grad_norm": 3.8066632747650146, "learning_rate": 1.99982808424643e-05, "loss": 0.0464, "step": 71360 }, { "epoch": 86.09173204586602, "grad_norm": 4.887198448181152, "learning_rate": 1.9998280601209505e-05, "loss": 0.0457, "step": 71370 }, { "epoch": 86.10380205190103, "grad_norm": 4.7789835929870605, "learning_rate": 1.999828035995471e-05, "loss": 0.0453, "step": 71380 }, { "epoch": 86.11587205793603, "grad_norm": 4.590598106384277, "learning_rate": 1.9998280118699917e-05, "loss": 0.0465, "step": 71390 }, { "epoch": 86.12794206397103, "grad_norm": 4.456676006317139, "learning_rate": 1.9998279877445124e-05, "loss": 0.0456, "step": 71400 }, { "epoch": 86.14001207000604, "grad_norm": 5.006324768066406, "learning_rate": 1.999827963619033e-05, "loss": 0.0479, "step": 71410 }, { "epoch": 86.15208207604104, "grad_norm": 5.046788215637207, "learning_rate": 1.9998279394935536e-05, "loss": 0.0485, "step": 71420 }, { "epoch": 86.16415208207604, "grad_norm": 4.462083339691162, "learning_rate": 1.9998279153680742e-05, "loss": 0.0484, "step": 71430 }, { "epoch": 86.17622208811105, "grad_norm": 5.383782386779785, "learning_rate": 
1.999827891242595e-05, "loss": 0.0477, "step": 71440 }, { "epoch": 86.18829209414605, "grad_norm": 5.111294269561768, "learning_rate": 1.9998278671171155e-05, "loss": 0.0471, "step": 71450 }, { "epoch": 86.20036210018105, "grad_norm": 5.058508396148682, "learning_rate": 1.999827842991636e-05, "loss": 0.0492, "step": 71460 }, { "epoch": 86.21243210621606, "grad_norm": 5.586070537567139, "learning_rate": 1.9998278188661567e-05, "loss": 0.0516, "step": 71470 }, { "epoch": 86.22450211225106, "grad_norm": 5.008844375610352, "learning_rate": 1.9998277947406773e-05, "loss": 0.049, "step": 71480 }, { "epoch": 86.23657211828606, "grad_norm": 4.585065841674805, "learning_rate": 1.999827770615198e-05, "loss": 0.0472, "step": 71490 }, { "epoch": 86.24864212432107, "grad_norm": 4.82794189453125, "learning_rate": 1.9998277464897186e-05, "loss": 0.0494, "step": 71500 }, { "epoch": 86.24864212432107, "eval_loss": 12.62394905090332, "eval_runtime": 8.1494, "eval_samples_per_second": 85.528, "eval_steps_per_second": 10.798, "step": 71500 }, { "epoch": 86.26071213035607, "grad_norm": 4.631082057952881, "learning_rate": 1.9998277223642392e-05, "loss": 0.0487, "step": 71510 }, { "epoch": 86.27278213639107, "grad_norm": 4.706298828125, "learning_rate": 1.9998276982387598e-05, "loss": 0.0497, "step": 71520 }, { "epoch": 86.28485214242608, "grad_norm": 5.136537075042725, "learning_rate": 1.99982767411328e-05, "loss": 0.0541, "step": 71530 }, { "epoch": 86.29692214846108, "grad_norm": 4.621399402618408, "learning_rate": 1.9998276499878007e-05, "loss": 0.049, "step": 71540 }, { "epoch": 86.30899215449608, "grad_norm": 4.85302209854126, "learning_rate": 1.9998276258623213e-05, "loss": 0.0504, "step": 71550 }, { "epoch": 86.32106216053108, "grad_norm": 5.068403244018555, "learning_rate": 1.999827601736842e-05, "loss": 0.0504, "step": 71560 }, { "epoch": 86.33313216656609, "grad_norm": 5.23268461227417, "learning_rate": 1.9998275776113626e-05, "loss": 0.0508, "step": 71570 }, { "epoch": 
86.34520217260109, "grad_norm": 5.339344501495361, "learning_rate": 1.9998275534858832e-05, "loss": 0.0518, "step": 71580 }, { "epoch": 86.3572721786361, "grad_norm": 5.557727336883545, "learning_rate": 1.999827529360404e-05, "loss": 0.0526, "step": 71590 }, { "epoch": 86.3693421846711, "grad_norm": 5.12903356552124, "learning_rate": 1.9998275052349245e-05, "loss": 0.0506, "step": 71600 }, { "epoch": 86.3814121907061, "grad_norm": 5.06486177444458, "learning_rate": 1.999827481109445e-05, "loss": 0.0531, "step": 71610 }, { "epoch": 86.3934821967411, "grad_norm": 4.84412956237793, "learning_rate": 1.9998274569839657e-05, "loss": 0.049, "step": 71620 }, { "epoch": 86.40555220277611, "grad_norm": 5.756929397583008, "learning_rate": 1.9998274328584863e-05, "loss": 0.052, "step": 71630 }, { "epoch": 86.41762220881111, "grad_norm": 5.221804618835449, "learning_rate": 1.999827408733007e-05, "loss": 0.054, "step": 71640 }, { "epoch": 86.42969221484611, "grad_norm": 4.841829776763916, "learning_rate": 1.9998273846075276e-05, "loss": 0.0514, "step": 71650 }, { "epoch": 86.44176222088112, "grad_norm": 4.42153263092041, "learning_rate": 1.9998273604820482e-05, "loss": 0.0527, "step": 71660 }, { "epoch": 86.45383222691612, "grad_norm": 5.4810309410095215, "learning_rate": 1.9998273363565688e-05, "loss": 0.0504, "step": 71670 }, { "epoch": 86.46590223295112, "grad_norm": 4.980533599853516, "learning_rate": 1.9998273122310898e-05, "loss": 0.0531, "step": 71680 }, { "epoch": 86.47797223898613, "grad_norm": 5.317887783050537, "learning_rate": 1.9998272881056104e-05, "loss": 0.054, "step": 71690 }, { "epoch": 86.49004224502113, "grad_norm": 5.167504787445068, "learning_rate": 1.9998272639801307e-05, "loss": 0.0549, "step": 71700 }, { "epoch": 86.50211225105613, "grad_norm": 5.1198577880859375, "learning_rate": 1.9998272398546513e-05, "loss": 0.0533, "step": 71710 }, { "epoch": 86.51418225709112, "grad_norm": 4.665541172027588, "learning_rate": 1.999827215729172e-05, "loss": 0.0513, 
"step": 71720 }, { "epoch": 86.52625226312612, "grad_norm": 5.113002777099609, "learning_rate": 1.9998271916036925e-05, "loss": 0.054, "step": 71730 }, { "epoch": 86.53832226916113, "grad_norm": 5.403487205505371, "learning_rate": 1.999827167478213e-05, "loss": 0.0537, "step": 71740 }, { "epoch": 86.55039227519613, "grad_norm": 5.407042980194092, "learning_rate": 1.9998271433527338e-05, "loss": 0.0539, "step": 71750 }, { "epoch": 86.56246228123113, "grad_norm": 5.077506065368652, "learning_rate": 1.9998271192272544e-05, "loss": 0.0525, "step": 71760 }, { "epoch": 86.57453228726614, "grad_norm": 5.016091823577881, "learning_rate": 1.999827095101775e-05, "loss": 0.0526, "step": 71770 }, { "epoch": 86.58660229330114, "grad_norm": 5.269848346710205, "learning_rate": 1.9998270709762956e-05, "loss": 0.052, "step": 71780 }, { "epoch": 86.59867229933614, "grad_norm": 5.238850116729736, "learning_rate": 1.9998270468508163e-05, "loss": 0.0548, "step": 71790 }, { "epoch": 86.61074230537115, "grad_norm": 4.7605366706848145, "learning_rate": 1.999827022725337e-05, "loss": 0.0556, "step": 71800 }, { "epoch": 86.62281231140615, "grad_norm": 5.351512908935547, "learning_rate": 1.9998269985998575e-05, "loss": 0.0536, "step": 71810 }, { "epoch": 86.63488231744115, "grad_norm": 5.094408988952637, "learning_rate": 1.999826974474378e-05, "loss": 0.0553, "step": 71820 }, { "epoch": 86.64695232347616, "grad_norm": 4.780093669891357, "learning_rate": 1.9998269503488988e-05, "loss": 0.0556, "step": 71830 }, { "epoch": 86.65902232951116, "grad_norm": 6.076395034790039, "learning_rate": 1.9998269262234194e-05, "loss": 0.0565, "step": 71840 }, { "epoch": 86.67109233554616, "grad_norm": 5.545893669128418, "learning_rate": 1.99982690209794e-05, "loss": 0.0554, "step": 71850 }, { "epoch": 86.68316234158117, "grad_norm": 4.953128814697266, "learning_rate": 1.9998268779724606e-05, "loss": 0.0558, "step": 71860 }, { "epoch": 86.69523234761617, "grad_norm": 6.190186500549316, "learning_rate": 
1.9998268538469812e-05, "loss": 0.0543, "step": 71870 }, { "epoch": 86.70730235365117, "grad_norm": 5.448315620422363, "learning_rate": 1.999826829721502e-05, "loss": 0.0561, "step": 71880 }, { "epoch": 86.71937235968618, "grad_norm": 5.286009788513184, "learning_rate": 1.9998268055960225e-05, "loss": 0.0573, "step": 71890 }, { "epoch": 86.73144236572118, "grad_norm": 5.643766403198242, "learning_rate": 1.999826781470543e-05, "loss": 0.0561, "step": 71900 }, { "epoch": 86.74351237175618, "grad_norm": 6.092132568359375, "learning_rate": 1.9998267573450637e-05, "loss": 0.0575, "step": 71910 }, { "epoch": 86.75558237779119, "grad_norm": 4.941643238067627, "learning_rate": 1.9998267332195843e-05, "loss": 0.0554, "step": 71920 }, { "epoch": 86.76765238382619, "grad_norm": 5.719244480133057, "learning_rate": 1.999826709094105e-05, "loss": 0.0577, "step": 71930 }, { "epoch": 86.77972238986119, "grad_norm": 5.02743673324585, "learning_rate": 1.9998266849686256e-05, "loss": 0.0576, "step": 71940 }, { "epoch": 86.7917923958962, "grad_norm": 5.016444206237793, "learning_rate": 1.999826660843146e-05, "loss": 0.0566, "step": 71950 }, { "epoch": 86.8038624019312, "grad_norm": 5.066845417022705, "learning_rate": 1.9998266367176665e-05, "loss": 0.057, "step": 71960 }, { "epoch": 86.8159324079662, "grad_norm": 5.294569969177246, "learning_rate": 1.999826612592187e-05, "loss": 0.0568, "step": 71970 }, { "epoch": 86.8280024140012, "grad_norm": 5.267810344696045, "learning_rate": 1.9998265884667077e-05, "loss": 0.0573, "step": 71980 }, { "epoch": 86.84007242003621, "grad_norm": 5.217939853668213, "learning_rate": 1.9998265643412284e-05, "loss": 0.058, "step": 71990 }, { "epoch": 86.85214242607121, "grad_norm": 5.48293399810791, "learning_rate": 1.999826540215749e-05, "loss": 0.0564, "step": 72000 }, { "epoch": 86.85214242607121, "eval_loss": 12.652044296264648, "eval_runtime": 8.1333, "eval_samples_per_second": 85.697, "eval_steps_per_second": 10.82, "step": 72000 }, { "epoch": 
86.86421243210621, "grad_norm": 5.157077789306641, "learning_rate": 1.9998265160902696e-05, "loss": 0.0578, "step": 72010 }, { "epoch": 86.87628243814122, "grad_norm": 5.268298625946045, "learning_rate": 1.9998264919647902e-05, "loss": 0.0563, "step": 72020 }, { "epoch": 86.88835244417622, "grad_norm": 5.371480941772461, "learning_rate": 1.999826467839311e-05, "loss": 0.0571, "step": 72030 }, { "epoch": 86.90042245021122, "grad_norm": 5.8838934898376465, "learning_rate": 1.9998264437138315e-05, "loss": 0.0566, "step": 72040 }, { "epoch": 86.91249245624623, "grad_norm": 4.865788459777832, "learning_rate": 1.999826419588352e-05, "loss": 0.0553, "step": 72050 }, { "epoch": 86.92456246228123, "grad_norm": 5.29450798034668, "learning_rate": 1.9998263954628727e-05, "loss": 0.0566, "step": 72060 }, { "epoch": 86.93663246831623, "grad_norm": 5.278465270996094, "learning_rate": 1.9998263713373933e-05, "loss": 0.0577, "step": 72070 }, { "epoch": 86.94870247435124, "grad_norm": 5.602022647857666, "learning_rate": 1.999826347211914e-05, "loss": 0.0577, "step": 72080 }, { "epoch": 86.96077248038624, "grad_norm": 5.3499298095703125, "learning_rate": 1.9998263230864346e-05, "loss": 0.0585, "step": 72090 }, { "epoch": 86.97284248642124, "grad_norm": 5.505865097045898, "learning_rate": 1.9998262989609552e-05, "loss": 0.0561, "step": 72100 }, { "epoch": 86.98491249245625, "grad_norm": 4.888979434967041, "learning_rate": 1.9998262748354758e-05, "loss": 0.0553, "step": 72110 }, { "epoch": 86.99698249849125, "grad_norm": 5.317998886108398, "learning_rate": 1.9998262507099964e-05, "loss": 0.0561, "step": 72120 }, { "epoch": 87.0084490042245, "grad_norm": 4.3961405754089355, "learning_rate": 1.999826226584517e-05, "loss": 0.0436, "step": 72130 }, { "epoch": 87.0205190102595, "grad_norm": 4.041713237762451, "learning_rate": 1.9998262024590377e-05, "loss": 0.039, "step": 72140 }, { "epoch": 87.03258901629451, "grad_norm": 4.088405132293701, "learning_rate": 1.9998261783335583e-05, "loss": 
0.0416, "step": 72150 }, { "epoch": 87.04465902232951, "grad_norm": 4.89187479019165, "learning_rate": 1.999826154208079e-05, "loss": 0.0407, "step": 72160 }, { "epoch": 87.05672902836451, "grad_norm": 4.500971794128418, "learning_rate": 1.9998261300825995e-05, "loss": 0.0436, "step": 72170 }, { "epoch": 87.06879903439952, "grad_norm": 4.443504810333252, "learning_rate": 1.99982610595712e-05, "loss": 0.0468, "step": 72180 }, { "epoch": 87.08086904043452, "grad_norm": 5.216516971588135, "learning_rate": 1.9998260818316408e-05, "loss": 0.0472, "step": 72190 }, { "epoch": 87.09293904646952, "grad_norm": 4.689235687255859, "learning_rate": 1.999826057706161e-05, "loss": 0.0449, "step": 72200 }, { "epoch": 87.10500905250453, "grad_norm": 4.8853583335876465, "learning_rate": 1.9998260335806817e-05, "loss": 0.0446, "step": 72210 }, { "epoch": 87.11707905853953, "grad_norm": 4.889583110809326, "learning_rate": 1.9998260094552027e-05, "loss": 0.0437, "step": 72220 }, { "epoch": 87.12914906457453, "grad_norm": 5.1956963539123535, "learning_rate": 1.9998259853297233e-05, "loss": 0.0459, "step": 72230 }, { "epoch": 87.14121907060954, "grad_norm": 4.5701985359191895, "learning_rate": 1.999825961204244e-05, "loss": 0.0459, "step": 72240 }, { "epoch": 87.15328907664454, "grad_norm": 4.737488269805908, "learning_rate": 1.9998259370787645e-05, "loss": 0.0446, "step": 72250 }, { "epoch": 87.16535908267954, "grad_norm": 4.904539585113525, "learning_rate": 1.999825912953285e-05, "loss": 0.0487, "step": 72260 }, { "epoch": 87.17742908871455, "grad_norm": 5.445257663726807, "learning_rate": 1.9998258888278058e-05, "loss": 0.0501, "step": 72270 }, { "epoch": 87.18949909474955, "grad_norm": 4.901302337646484, "learning_rate": 1.9998258647023264e-05, "loss": 0.0488, "step": 72280 }, { "epoch": 87.20156910078455, "grad_norm": 4.7103657722473145, "learning_rate": 1.999825840576847e-05, "loss": 0.0472, "step": 72290 }, { "epoch": 87.21363910681956, "grad_norm": 4.3403120040893555, 
"learning_rate": 1.9998258164513676e-05, "loss": 0.0474, "step": 72300 }, { "epoch": 87.22570911285456, "grad_norm": 4.937482833862305, "learning_rate": 1.9998257923258882e-05, "loss": 0.0473, "step": 72310 }, { "epoch": 87.23777911888956, "grad_norm": 4.262750625610352, "learning_rate": 1.999825768200409e-05, "loss": 0.0473, "step": 72320 }, { "epoch": 87.24984912492457, "grad_norm": 5.039493083953857, "learning_rate": 1.9998257440749295e-05, "loss": 0.0498, "step": 72330 }, { "epoch": 87.26191913095957, "grad_norm": 5.025540351867676, "learning_rate": 1.99982571994945e-05, "loss": 0.0493, "step": 72340 }, { "epoch": 87.27398913699457, "grad_norm": 4.45260763168335, "learning_rate": 1.9998256958239707e-05, "loss": 0.0513, "step": 72350 }, { "epoch": 87.28605914302958, "grad_norm": 4.211001873016357, "learning_rate": 1.999825671698491e-05, "loss": 0.0481, "step": 72360 }, { "epoch": 87.29812914906458, "grad_norm": 5.273646354675293, "learning_rate": 1.9998256475730116e-05, "loss": 0.0502, "step": 72370 }, { "epoch": 87.31019915509958, "grad_norm": 5.3702712059021, "learning_rate": 1.9998256234475323e-05, "loss": 0.0506, "step": 72380 }, { "epoch": 87.32226916113459, "grad_norm": 4.935483455657959, "learning_rate": 1.999825599322053e-05, "loss": 0.0501, "step": 72390 }, { "epoch": 87.33433916716959, "grad_norm": 4.386733055114746, "learning_rate": 1.9998255751965735e-05, "loss": 0.0517, "step": 72400 }, { "epoch": 87.34640917320459, "grad_norm": 4.759774208068848, "learning_rate": 1.999825551071094e-05, "loss": 0.051, "step": 72410 }, { "epoch": 87.3584791792396, "grad_norm": 5.852808952331543, "learning_rate": 1.9998255269456147e-05, "loss": 0.0491, "step": 72420 }, { "epoch": 87.3705491852746, "grad_norm": 5.507535934448242, "learning_rate": 1.9998255028201354e-05, "loss": 0.0531, "step": 72430 }, { "epoch": 87.3826191913096, "grad_norm": 5.019320964813232, "learning_rate": 1.999825478694656e-05, "loss": 0.0509, "step": 72440 }, { "epoch": 87.3946891973446, 
"grad_norm": 5.353975296020508, "learning_rate": 1.9998254545691766e-05, "loss": 0.0501, "step": 72450 }, { "epoch": 87.40675920337961, "grad_norm": 4.700850486755371, "learning_rate": 1.9998254304436972e-05, "loss": 0.0493, "step": 72460 }, { "epoch": 87.41882920941461, "grad_norm": 4.909725189208984, "learning_rate": 1.999825406318218e-05, "loss": 0.0514, "step": 72470 }, { "epoch": 87.43089921544961, "grad_norm": 5.119602203369141, "learning_rate": 1.9998253821927385e-05, "loss": 0.0514, "step": 72480 }, { "epoch": 87.44296922148462, "grad_norm": 4.781346321105957, "learning_rate": 1.999825358067259e-05, "loss": 0.0507, "step": 72490 }, { "epoch": 87.45503922751962, "grad_norm": 5.133533000946045, "learning_rate": 1.9998253339417797e-05, "loss": 0.0532, "step": 72500 }, { "epoch": 87.45503922751962, "eval_loss": 12.668484687805176, "eval_runtime": 8.138, "eval_samples_per_second": 85.648, "eval_steps_per_second": 10.813, "step": 72500 }, { "epoch": 87.46710923355462, "grad_norm": 5.371032238006592, "learning_rate": 1.9998253098163003e-05, "loss": 0.0525, "step": 72510 }, { "epoch": 87.47917923958963, "grad_norm": 4.851141452789307, "learning_rate": 1.999825285690821e-05, "loss": 0.0537, "step": 72520 }, { "epoch": 87.49124924562463, "grad_norm": 4.898065567016602, "learning_rate": 1.9998252615653416e-05, "loss": 0.0514, "step": 72530 }, { "epoch": 87.50331925165963, "grad_norm": 5.096607685089111, "learning_rate": 1.9998252374398622e-05, "loss": 0.0523, "step": 72540 }, { "epoch": 87.51538925769462, "grad_norm": 4.971444606781006, "learning_rate": 1.9998252133143828e-05, "loss": 0.0531, "step": 72550 }, { "epoch": 87.52745926372963, "grad_norm": 5.130185127258301, "learning_rate": 1.9998251891889034e-05, "loss": 0.0549, "step": 72560 }, { "epoch": 87.53952926976463, "grad_norm": 5.058588981628418, "learning_rate": 1.999825165063424e-05, "loss": 0.0519, "step": 72570 }, { "epoch": 87.55159927579963, "grad_norm": 4.787028789520264, "learning_rate": 
1.9998251409379447e-05, "loss": 0.0508, "step": 72580 }, { "epoch": 87.56366928183463, "grad_norm": 5.324141025543213, "learning_rate": 1.9998251168124653e-05, "loss": 0.0546, "step": 72590 }, { "epoch": 87.57573928786964, "grad_norm": 4.822300434112549, "learning_rate": 1.999825092686986e-05, "loss": 0.0549, "step": 72600 }, { "epoch": 87.58780929390464, "grad_norm": 4.979687690734863, "learning_rate": 1.9998250685615062e-05, "loss": 0.0552, "step": 72610 }, { "epoch": 87.59987929993964, "grad_norm": 5.059196472167969, "learning_rate": 1.999825044436027e-05, "loss": 0.0538, "step": 72620 }, { "epoch": 87.61194930597465, "grad_norm": 5.455411911010742, "learning_rate": 1.9998250203105475e-05, "loss": 0.0542, "step": 72630 }, { "epoch": 87.62401931200965, "grad_norm": 5.576635837554932, "learning_rate": 1.999824996185068e-05, "loss": 0.0533, "step": 72640 }, { "epoch": 87.63608931804465, "grad_norm": 5.996700763702393, "learning_rate": 1.9998249720595887e-05, "loss": 0.0575, "step": 72650 }, { "epoch": 87.64815932407966, "grad_norm": 5.110058307647705, "learning_rate": 1.9998249479341093e-05, "loss": 0.055, "step": 72660 }, { "epoch": 87.66022933011466, "grad_norm": 5.214412689208984, "learning_rate": 1.99982492380863e-05, "loss": 0.056, "step": 72670 }, { "epoch": 87.67229933614966, "grad_norm": 5.1991190910339355, "learning_rate": 1.9998248996831506e-05, "loss": 0.0554, "step": 72680 }, { "epoch": 87.68436934218467, "grad_norm": 4.94088888168335, "learning_rate": 1.9998248755576712e-05, "loss": 0.0548, "step": 72690 }, { "epoch": 87.69643934821967, "grad_norm": 5.857967376708984, "learning_rate": 1.9998248514321918e-05, "loss": 0.0554, "step": 72700 }, { "epoch": 87.70850935425467, "grad_norm": 4.789823532104492, "learning_rate": 1.9998248273067124e-05, "loss": 0.0537, "step": 72710 }, { "epoch": 87.72057936028968, "grad_norm": 4.861485004425049, "learning_rate": 1.999824803181233e-05, "loss": 0.0564, "step": 72720 }, { "epoch": 87.73264936632468, "grad_norm": 
5.237661838531494, "learning_rate": 1.9998247790557537e-05, "loss": 0.0568, "step": 72730 }, { "epoch": 87.74471937235968, "grad_norm": 4.7786865234375, "learning_rate": 1.9998247549302743e-05, "loss": 0.054, "step": 72740 }, { "epoch": 87.75678937839469, "grad_norm": 5.617940902709961, "learning_rate": 1.999824730804795e-05, "loss": 0.0567, "step": 72750 }, { "epoch": 87.76885938442969, "grad_norm": 5.50827693939209, "learning_rate": 1.999824706679316e-05, "loss": 0.0566, "step": 72760 }, { "epoch": 87.78092939046469, "grad_norm": 5.204529762268066, "learning_rate": 1.999824682553836e-05, "loss": 0.0535, "step": 72770 }, { "epoch": 87.7929993964997, "grad_norm": 5.04777717590332, "learning_rate": 1.9998246584283568e-05, "loss": 0.0545, "step": 72780 }, { "epoch": 87.8050694025347, "grad_norm": 5.1518874168396, "learning_rate": 1.9998246343028774e-05, "loss": 0.057, "step": 72790 }, { "epoch": 87.8171394085697, "grad_norm": 4.780630111694336, "learning_rate": 1.999824610177398e-05, "loss": 0.0551, "step": 72800 }, { "epoch": 87.8292094146047, "grad_norm": 5.569289684295654, "learning_rate": 1.9998245860519186e-05, "loss": 0.057, "step": 72810 }, { "epoch": 87.84127942063971, "grad_norm": 5.243703365325928, "learning_rate": 1.9998245619264393e-05, "loss": 0.0571, "step": 72820 }, { "epoch": 87.85334942667471, "grad_norm": 5.550778388977051, "learning_rate": 1.99982453780096e-05, "loss": 0.0585, "step": 72830 }, { "epoch": 87.86541943270971, "grad_norm": 5.25465202331543, "learning_rate": 1.9998245136754805e-05, "loss": 0.0558, "step": 72840 }, { "epoch": 87.87748943874472, "grad_norm": 5.358205795288086, "learning_rate": 1.999824489550001e-05, "loss": 0.057, "step": 72850 }, { "epoch": 87.88955944477972, "grad_norm": 4.768321990966797, "learning_rate": 1.9998244654245218e-05, "loss": 0.0567, "step": 72860 }, { "epoch": 87.90162945081472, "grad_norm": 5.223747730255127, "learning_rate": 1.9998244412990424e-05, "loss": 0.058, "step": 72870 }, { "epoch": 
87.91369945684973, "grad_norm": 5.140872001647949, "learning_rate": 1.999824417173563e-05, "loss": 0.0585, "step": 72880 }, { "epoch": 87.92576946288473, "grad_norm": 5.231062412261963, "learning_rate": 1.9998243930480836e-05, "loss": 0.0554, "step": 72890 }, { "epoch": 87.93783946891973, "grad_norm": 5.116702556610107, "learning_rate": 1.9998243689226042e-05, "loss": 0.0556, "step": 72900 }, { "epoch": 87.94990947495474, "grad_norm": 5.281896591186523, "learning_rate": 1.999824344797125e-05, "loss": 0.0593, "step": 72910 }, { "epoch": 87.96197948098974, "grad_norm": 5.337218761444092, "learning_rate": 1.9998243206716455e-05, "loss": 0.0567, "step": 72920 }, { "epoch": 87.97404948702474, "grad_norm": 5.4201812744140625, "learning_rate": 1.999824296546166e-05, "loss": 0.0585, "step": 72930 }, { "epoch": 87.98611949305975, "grad_norm": 5.023910999298096, "learning_rate": 1.9998242724206867e-05, "loss": 0.0558, "step": 72940 }, { "epoch": 87.99818949909475, "grad_norm": 5.574783802032471, "learning_rate": 1.9998242482952073e-05, "loss": 0.0553, "step": 72950 }, { "epoch": 88.009656004828, "grad_norm": 4.523532390594482, "learning_rate": 1.999824224169728e-05, "loss": 0.0414, "step": 72960 }, { "epoch": 88.021726010863, "grad_norm": 4.356777191162109, "learning_rate": 1.9998242000442486e-05, "loss": 0.0412, "step": 72970 }, { "epoch": 88.03379601689801, "grad_norm": 4.001343250274658, "learning_rate": 1.9998241759187692e-05, "loss": 0.042, "step": 72980 }, { "epoch": 88.04586602293301, "grad_norm": 4.807476043701172, "learning_rate": 1.99982415179329e-05, "loss": 0.0414, "step": 72990 }, { "epoch": 88.05793602896802, "grad_norm": 4.68324089050293, "learning_rate": 1.9998241276678105e-05, "loss": 0.0431, "step": 73000 }, { "epoch": 88.05793602896802, "eval_loss": 12.671979904174805, "eval_runtime": 8.1209, "eval_samples_per_second": 85.828, "eval_steps_per_second": 10.836, "step": 73000 }, { "epoch": 88.07000603500302, "grad_norm": 5.106143951416016, "learning_rate": 
1.999824103542331e-05, "loss": 0.046, "step": 73010 }, { "epoch": 88.08207604103802, "grad_norm": 4.57693338394165, "learning_rate": 1.9998240794168514e-05, "loss": 0.0451, "step": 73020 }, { "epoch": 88.09414604707302, "grad_norm": 4.756309986114502, "learning_rate": 1.999824055291372e-05, "loss": 0.044, "step": 73030 }, { "epoch": 88.10621605310803, "grad_norm": 4.351072788238525, "learning_rate": 1.9998240311658926e-05, "loss": 0.0467, "step": 73040 }, { "epoch": 88.11828605914303, "grad_norm": 4.390578269958496, "learning_rate": 1.9998240070404132e-05, "loss": 0.0445, "step": 73050 }, { "epoch": 88.13035606517803, "grad_norm": 4.552156925201416, "learning_rate": 1.999823982914934e-05, "loss": 0.0446, "step": 73060 }, { "epoch": 88.14242607121304, "grad_norm": 4.898573398590088, "learning_rate": 1.9998239587894545e-05, "loss": 0.0505, "step": 73070 }, { "epoch": 88.15449607724804, "grad_norm": 4.215783596038818, "learning_rate": 1.999823934663975e-05, "loss": 0.0459, "step": 73080 }, { "epoch": 88.16656608328304, "grad_norm": 4.996152400970459, "learning_rate": 1.9998239105384957e-05, "loss": 0.0458, "step": 73090 }, { "epoch": 88.17863608931805, "grad_norm": 4.559503078460693, "learning_rate": 1.9998238864130163e-05, "loss": 0.0468, "step": 73100 }, { "epoch": 88.19070609535305, "grad_norm": 4.631608963012695, "learning_rate": 1.999823862287537e-05, "loss": 0.047, "step": 73110 }, { "epoch": 88.20277610138805, "grad_norm": 4.08887243270874, "learning_rate": 1.9998238381620576e-05, "loss": 0.0463, "step": 73120 }, { "epoch": 88.21484610742306, "grad_norm": 5.023643970489502, "learning_rate": 1.9998238140365782e-05, "loss": 0.0482, "step": 73130 }, { "epoch": 88.22691611345806, "grad_norm": 4.737005710601807, "learning_rate": 1.9998237899110988e-05, "loss": 0.0477, "step": 73140 }, { "epoch": 88.23898611949306, "grad_norm": 5.142590045928955, "learning_rate": 1.9998237657856194e-05, "loss": 0.0495, "step": 73150 }, { "epoch": 88.25105612552807, "grad_norm": 
4.501081466674805, "learning_rate": 1.99982374166014e-05, "loss": 0.0487, "step": 73160 }, { "epoch": 88.26312613156307, "grad_norm": 4.464102745056152, "learning_rate": 1.9998237175346607e-05, "loss": 0.0477, "step": 73170 }, { "epoch": 88.27519613759807, "grad_norm": 4.877764701843262, "learning_rate": 1.9998236934091813e-05, "loss": 0.0447, "step": 73180 }, { "epoch": 88.28726614363308, "grad_norm": 4.889609336853027, "learning_rate": 1.999823669283702e-05, "loss": 0.0508, "step": 73190 }, { "epoch": 88.29933614966808, "grad_norm": 5.0022454261779785, "learning_rate": 1.9998236451582225e-05, "loss": 0.0488, "step": 73200 }, { "epoch": 88.31140615570308, "grad_norm": 4.957468509674072, "learning_rate": 1.999823621032743e-05, "loss": 0.0524, "step": 73210 }, { "epoch": 88.32347616173809, "grad_norm": 5.047135829925537, "learning_rate": 1.9998235969072638e-05, "loss": 0.05, "step": 73220 }, { "epoch": 88.33554616777309, "grad_norm": 4.573697090148926, "learning_rate": 1.9998235727817844e-05, "loss": 0.0501, "step": 73230 }, { "epoch": 88.34761617380809, "grad_norm": 4.75182580947876, "learning_rate": 1.999823548656305e-05, "loss": 0.0489, "step": 73240 }, { "epoch": 88.3596861798431, "grad_norm": 5.393363952636719, "learning_rate": 1.9998235245308257e-05, "loss": 0.0498, "step": 73250 }, { "epoch": 88.3717561858781, "grad_norm": 4.783815383911133, "learning_rate": 1.9998235004053463e-05, "loss": 0.0519, "step": 73260 }, { "epoch": 88.3838261919131, "grad_norm": 5.014332294464111, "learning_rate": 1.9998234762798666e-05, "loss": 0.0513, "step": 73270 }, { "epoch": 88.3958961979481, "grad_norm": 5.159853458404541, "learning_rate": 1.9998234521543872e-05, "loss": 0.0518, "step": 73280 }, { "epoch": 88.40796620398311, "grad_norm": 5.18233060836792, "learning_rate": 1.9998234280289078e-05, "loss": 0.0535, "step": 73290 }, { "epoch": 88.42003621001811, "grad_norm": 4.955976486206055, "learning_rate": 1.9998234039034288e-05, "loss": 0.0513, "step": 73300 }, { "epoch": 
88.43210621605311, "grad_norm": 5.42468786239624, "learning_rate": 1.9998233797779494e-05, "loss": 0.0499, "step": 73310 }, { "epoch": 88.44417622208812, "grad_norm": 4.730503559112549, "learning_rate": 1.99982335565247e-05, "loss": 0.0499, "step": 73320 }, { "epoch": 88.45624622812312, "grad_norm": 4.854334831237793, "learning_rate": 1.9998233315269906e-05, "loss": 0.053, "step": 73330 }, { "epoch": 88.46831623415812, "grad_norm": 5.565651893615723, "learning_rate": 1.9998233074015112e-05, "loss": 0.0539, "step": 73340 }, { "epoch": 88.48038624019313, "grad_norm": 4.710357666015625, "learning_rate": 1.999823283276032e-05, "loss": 0.0522, "step": 73350 }, { "epoch": 88.49245624622813, "grad_norm": 5.461997985839844, "learning_rate": 1.9998232591505525e-05, "loss": 0.0507, "step": 73360 }, { "epoch": 88.50452625226312, "grad_norm": 5.130308628082275, "learning_rate": 1.999823235025073e-05, "loss": 0.0537, "step": 73370 }, { "epoch": 88.51659625829812, "grad_norm": 5.466922760009766, "learning_rate": 1.9998232108995937e-05, "loss": 0.0533, "step": 73380 }, { "epoch": 88.52866626433313, "grad_norm": 4.364869117736816, "learning_rate": 1.9998231867741144e-05, "loss": 0.0525, "step": 73390 }, { "epoch": 88.54073627036813, "grad_norm": 5.212472915649414, "learning_rate": 1.999823162648635e-05, "loss": 0.0525, "step": 73400 }, { "epoch": 88.55280627640313, "grad_norm": 4.798239707946777, "learning_rate": 1.9998231385231556e-05, "loss": 0.053, "step": 73410 }, { "epoch": 88.56487628243814, "grad_norm": 5.196662425994873, "learning_rate": 1.9998231143976762e-05, "loss": 0.0518, "step": 73420 }, { "epoch": 88.57694628847314, "grad_norm": 5.291032314300537, "learning_rate": 1.999823090272197e-05, "loss": 0.0503, "step": 73430 }, { "epoch": 88.58901629450814, "grad_norm": 4.446304798126221, "learning_rate": 1.999823066146717e-05, "loss": 0.0519, "step": 73440 }, { "epoch": 88.60108630054314, "grad_norm": 4.77467155456543, "learning_rate": 1.9998230420212377e-05, "loss": 0.053, 
"step": 73450 }, { "epoch": 88.61315630657815, "grad_norm": 5.038043975830078, "learning_rate": 1.9998230178957584e-05, "loss": 0.0531, "step": 73460 }, { "epoch": 88.62522631261315, "grad_norm": 5.1554718017578125, "learning_rate": 1.999822993770279e-05, "loss": 0.0536, "step": 73470 }, { "epoch": 88.63729631864815, "grad_norm": 4.594894886016846, "learning_rate": 1.9998229696447996e-05, "loss": 0.0514, "step": 73480 }, { "epoch": 88.64936632468316, "grad_norm": 5.192210674285889, "learning_rate": 1.9998229455193202e-05, "loss": 0.052, "step": 73490 }, { "epoch": 88.66143633071816, "grad_norm": 5.381495952606201, "learning_rate": 1.999822921393841e-05, "loss": 0.0515, "step": 73500 }, { "epoch": 88.66143633071816, "eval_loss": 12.667529106140137, "eval_runtime": 8.1305, "eval_samples_per_second": 85.727, "eval_steps_per_second": 10.823, "step": 73500 }, { "epoch": 88.67350633675316, "grad_norm": 4.900136947631836, "learning_rate": 1.9998228972683615e-05, "loss": 0.0556, "step": 73510 }, { "epoch": 88.68557634278817, "grad_norm": 5.461044788360596, "learning_rate": 1.999822873142882e-05, "loss": 0.0534, "step": 73520 }, { "epoch": 88.69764634882317, "grad_norm": 5.229474067687988, "learning_rate": 1.9998228490174027e-05, "loss": 0.0535, "step": 73530 }, { "epoch": 88.70971635485817, "grad_norm": 5.754058837890625, "learning_rate": 1.9998228248919233e-05, "loss": 0.0536, "step": 73540 }, { "epoch": 88.72178636089318, "grad_norm": 4.904781818389893, "learning_rate": 1.999822800766444e-05, "loss": 0.0549, "step": 73550 }, { "epoch": 88.73385636692818, "grad_norm": 4.902160167694092, "learning_rate": 1.9998227766409646e-05, "loss": 0.0559, "step": 73560 }, { "epoch": 88.74592637296318, "grad_norm": 5.285794734954834, "learning_rate": 1.9998227525154852e-05, "loss": 0.0534, "step": 73570 }, { "epoch": 88.75799637899819, "grad_norm": 4.791637420654297, "learning_rate": 1.9998227283900058e-05, "loss": 0.0522, "step": 73580 }, { "epoch": 88.77006638503319, "grad_norm": 
4.791713237762451, "learning_rate": 1.9998227042645264e-05, "loss": 0.0545, "step": 73590 }, { "epoch": 88.78213639106819, "grad_norm": 4.831859111785889, "learning_rate": 1.999822680139047e-05, "loss": 0.0536, "step": 73600 }, { "epoch": 88.7942063971032, "grad_norm": 5.066999912261963, "learning_rate": 1.9998226560135677e-05, "loss": 0.0569, "step": 73610 }, { "epoch": 88.8062764031382, "grad_norm": 5.389687538146973, "learning_rate": 1.9998226318880883e-05, "loss": 0.0557, "step": 73620 }, { "epoch": 88.8183464091732, "grad_norm": 4.7058563232421875, "learning_rate": 1.999822607762609e-05, "loss": 0.0537, "step": 73630 }, { "epoch": 88.8304164152082, "grad_norm": 4.374016284942627, "learning_rate": 1.9998225836371296e-05, "loss": 0.0539, "step": 73640 }, { "epoch": 88.84248642124321, "grad_norm": 5.3617634773254395, "learning_rate": 1.9998225595116502e-05, "loss": 0.054, "step": 73650 }, { "epoch": 88.85455642727821, "grad_norm": 5.494887828826904, "learning_rate": 1.9998225353861708e-05, "loss": 0.0554, "step": 73660 }, { "epoch": 88.86662643331321, "grad_norm": 4.682824611663818, "learning_rate": 1.9998225112606914e-05, "loss": 0.0538, "step": 73670 }, { "epoch": 88.87869643934822, "grad_norm": 5.848526954650879, "learning_rate": 1.999822487135212e-05, "loss": 0.0593, "step": 73680 }, { "epoch": 88.89076644538322, "grad_norm": 5.015262603759766, "learning_rate": 1.9998224630097323e-05, "loss": 0.0543, "step": 73690 }, { "epoch": 88.90283645141822, "grad_norm": 4.741687297821045, "learning_rate": 1.999822438884253e-05, "loss": 0.0547, "step": 73700 }, { "epoch": 88.91490645745323, "grad_norm": 5.543375492095947, "learning_rate": 1.9998224147587736e-05, "loss": 0.0554, "step": 73710 }, { "epoch": 88.92697646348823, "grad_norm": 5.4309282302856445, "learning_rate": 1.9998223906332942e-05, "loss": 0.0547, "step": 73720 }, { "epoch": 88.93904646952323, "grad_norm": 5.705482482910156, "learning_rate": 1.9998223665078148e-05, "loss": 0.0562, "step": 73730 }, { 
"epoch": 88.95111647555824, "grad_norm": 5.738610744476318, "learning_rate": 1.9998223423823354e-05, "loss": 0.0548, "step": 73740 }, { "epoch": 88.96318648159324, "grad_norm": 4.572318077087402, "learning_rate": 1.999822318256856e-05, "loss": 0.0562, "step": 73750 }, { "epoch": 88.97525648762824, "grad_norm": 4.631450176239014, "learning_rate": 1.9998222941313767e-05, "loss": 0.0542, "step": 73760 }, { "epoch": 88.98732649366325, "grad_norm": 5.008683681488037, "learning_rate": 1.9998222700058973e-05, "loss": 0.0564, "step": 73770 }, { "epoch": 88.99939649969825, "grad_norm": 5.611876010894775, "learning_rate": 1.999822245880418e-05, "loss": 0.0592, "step": 73780 }, { "epoch": 89.0108630054315, "grad_norm": 4.145177841186523, "learning_rate": 1.9998222217549385e-05, "loss": 0.0387, "step": 73790 }, { "epoch": 89.0229330114665, "grad_norm": 4.04073429107666, "learning_rate": 1.999822197629459e-05, "loss": 0.0398, "step": 73800 }, { "epoch": 89.03500301750151, "grad_norm": 4.140591144561768, "learning_rate": 1.9998221735039798e-05, "loss": 0.0389, "step": 73810 }, { "epoch": 89.04707302353651, "grad_norm": 4.281661033630371, "learning_rate": 1.9998221493785004e-05, "loss": 0.0407, "step": 73820 }, { "epoch": 89.05914302957152, "grad_norm": 4.182556629180908, "learning_rate": 1.999822125253021e-05, "loss": 0.0439, "step": 73830 }, { "epoch": 89.07121303560652, "grad_norm": 3.896073818206787, "learning_rate": 1.999822101127542e-05, "loss": 0.0423, "step": 73840 }, { "epoch": 89.08328304164152, "grad_norm": 4.568140983581543, "learning_rate": 1.9998220770020623e-05, "loss": 0.0427, "step": 73850 }, { "epoch": 89.09535304767653, "grad_norm": 5.186277866363525, "learning_rate": 1.999822052876583e-05, "loss": 0.0466, "step": 73860 }, { "epoch": 89.10742305371153, "grad_norm": 4.477360725402832, "learning_rate": 1.9998220287511035e-05, "loss": 0.0446, "step": 73870 }, { "epoch": 89.11949305974653, "grad_norm": 4.893312454223633, "learning_rate": 1.999822004625624e-05, 
"loss": 0.0463, "step": 73880 }, { "epoch": 89.13156306578153, "grad_norm": 4.348491191864014, "learning_rate": 1.9998219805001448e-05, "loss": 0.0462, "step": 73890 }, { "epoch": 89.14363307181654, "grad_norm": 4.991367816925049, "learning_rate": 1.9998219563746654e-05, "loss": 0.0462, "step": 73900 }, { "epoch": 89.15570307785154, "grad_norm": 3.9771761894226074, "learning_rate": 1.999821932249186e-05, "loss": 0.0449, "step": 73910 }, { "epoch": 89.16777308388654, "grad_norm": 4.50124454498291, "learning_rate": 1.9998219081237066e-05, "loss": 0.0454, "step": 73920 }, { "epoch": 89.17984308992155, "grad_norm": 4.106451511383057, "learning_rate": 1.9998218839982272e-05, "loss": 0.0442, "step": 73930 }, { "epoch": 89.19191309595655, "grad_norm": 5.523388862609863, "learning_rate": 1.999821859872748e-05, "loss": 0.0471, "step": 73940 }, { "epoch": 89.20398310199155, "grad_norm": 4.922745704650879, "learning_rate": 1.9998218357472685e-05, "loss": 0.0491, "step": 73950 }, { "epoch": 89.21605310802656, "grad_norm": 5.085726737976074, "learning_rate": 1.999821811621789e-05, "loss": 0.0483, "step": 73960 }, { "epoch": 89.22812311406156, "grad_norm": 4.629494667053223, "learning_rate": 1.9998217874963097e-05, "loss": 0.0476, "step": 73970 }, { "epoch": 89.24019312009656, "grad_norm": 4.4039459228515625, "learning_rate": 1.9998217633708303e-05, "loss": 0.0484, "step": 73980 }, { "epoch": 89.25226312613157, "grad_norm": 4.605224609375, "learning_rate": 1.999821739245351e-05, "loss": 0.0476, "step": 73990 }, { "epoch": 89.26433313216657, "grad_norm": 4.870157718658447, "learning_rate": 1.9998217151198716e-05, "loss": 0.047, "step": 74000 }, { "epoch": 89.26433313216657, "eval_loss": 12.641827583312988, "eval_runtime": 8.1439, "eval_samples_per_second": 85.586, "eval_steps_per_second": 10.806, "step": 74000 }, { "epoch": 89.27640313820157, "grad_norm": 4.848653793334961, "learning_rate": 1.9998216909943922e-05, "loss": 0.0483, "step": 74010 }, { "epoch": 89.28847314423658, 
"grad_norm": 4.582911491394043, "learning_rate": 1.999821666868913e-05, "loss": 0.0471, "step": 74020 }, { "epoch": 89.30054315027158, "grad_norm": 4.717400074005127, "learning_rate": 1.9998216427434335e-05, "loss": 0.0463, "step": 74030 }, { "epoch": 89.31261315630658, "grad_norm": 4.630971908569336, "learning_rate": 1.999821618617954e-05, "loss": 0.0485, "step": 74040 }, { "epoch": 89.32468316234159, "grad_norm": 5.023049354553223, "learning_rate": 1.9998215944924747e-05, "loss": 0.0487, "step": 74050 }, { "epoch": 89.33675316837659, "grad_norm": 5.157466888427734, "learning_rate": 1.9998215703669953e-05, "loss": 0.0525, "step": 74060 }, { "epoch": 89.34882317441159, "grad_norm": 4.835382461547852, "learning_rate": 1.999821546241516e-05, "loss": 0.0491, "step": 74070 }, { "epoch": 89.3608931804466, "grad_norm": 5.392271995544434, "learning_rate": 1.9998215221160366e-05, "loss": 0.0513, "step": 74080 }, { "epoch": 89.3729631864816, "grad_norm": 4.673041820526123, "learning_rate": 1.9998214979905572e-05, "loss": 0.0481, "step": 74090 }, { "epoch": 89.3850331925166, "grad_norm": 4.983357906341553, "learning_rate": 1.9998214738650775e-05, "loss": 0.0508, "step": 74100 }, { "epoch": 89.3971031985516, "grad_norm": 5.27943229675293, "learning_rate": 1.999821449739598e-05, "loss": 0.0519, "step": 74110 }, { "epoch": 89.40917320458661, "grad_norm": 5.06630277633667, "learning_rate": 1.9998214256141187e-05, "loss": 0.0506, "step": 74120 }, { "epoch": 89.42124321062161, "grad_norm": 4.782111167907715, "learning_rate": 1.9998214014886393e-05, "loss": 0.0502, "step": 74130 }, { "epoch": 89.43331321665661, "grad_norm": 5.549253940582275, "learning_rate": 1.99982137736316e-05, "loss": 0.0505, "step": 74140 }, { "epoch": 89.44538322269162, "grad_norm": 5.121172904968262, "learning_rate": 1.9998213532376806e-05, "loss": 0.0515, "step": 74150 }, { "epoch": 89.45745322872662, "grad_norm": 4.934232711791992, "learning_rate": 1.9998213291122012e-05, "loss": 0.0514, "step": 74160 }, { 
"epoch": 89.46952323476162, "grad_norm": 5.240180492401123, "learning_rate": 1.9998213049867218e-05, "loss": 0.0511, "step": 74170 }, { "epoch": 89.48159324079663, "grad_norm": 5.396851539611816, "learning_rate": 1.9998212808612424e-05, "loss": 0.0508, "step": 74180 }, { "epoch": 89.49366324683163, "grad_norm": 4.932616710662842, "learning_rate": 1.999821256735763e-05, "loss": 0.0537, "step": 74190 }, { "epoch": 89.50573325286662, "grad_norm": 5.528519630432129, "learning_rate": 1.9998212326102837e-05, "loss": 0.0534, "step": 74200 }, { "epoch": 89.51780325890162, "grad_norm": 5.054306507110596, "learning_rate": 1.9998212084848043e-05, "loss": 0.0529, "step": 74210 }, { "epoch": 89.52987326493663, "grad_norm": 5.669320583343506, "learning_rate": 1.999821184359325e-05, "loss": 0.0511, "step": 74220 }, { "epoch": 89.54194327097163, "grad_norm": 5.267635822296143, "learning_rate": 1.9998211602338455e-05, "loss": 0.0528, "step": 74230 }, { "epoch": 89.55401327700663, "grad_norm": 4.909688472747803, "learning_rate": 1.9998211361083662e-05, "loss": 0.0528, "step": 74240 }, { "epoch": 89.56608328304164, "grad_norm": 4.918980598449707, "learning_rate": 1.9998211119828868e-05, "loss": 0.0524, "step": 74250 }, { "epoch": 89.57815328907664, "grad_norm": 4.796065807342529, "learning_rate": 1.9998210878574074e-05, "loss": 0.0518, "step": 74260 }, { "epoch": 89.59022329511164, "grad_norm": 5.374955654144287, "learning_rate": 1.999821063731928e-05, "loss": 0.0497, "step": 74270 }, { "epoch": 89.60229330114664, "grad_norm": 4.945618629455566, "learning_rate": 1.9998210396064487e-05, "loss": 0.0532, "step": 74280 }, { "epoch": 89.61436330718165, "grad_norm": 4.954288482666016, "learning_rate": 1.9998210154809693e-05, "loss": 0.0532, "step": 74290 }, { "epoch": 89.62643331321665, "grad_norm": 5.720640182495117, "learning_rate": 1.99982099135549e-05, "loss": 0.0527, "step": 74300 }, { "epoch": 89.63850331925165, "grad_norm": 5.235449314117432, "learning_rate": 1.9998209672300105e-05, 
"loss": 0.0513, "step": 74310 }, { "epoch": 89.65057332528666, "grad_norm": 4.649712562561035, "learning_rate": 1.999820943104531e-05, "loss": 0.0513, "step": 74320 }, { "epoch": 89.66264333132166, "grad_norm": 5.15107536315918, "learning_rate": 1.9998209189790518e-05, "loss": 0.0542, "step": 74330 }, { "epoch": 89.67471333735666, "grad_norm": 4.670315742492676, "learning_rate": 1.9998208948535724e-05, "loss": 0.0543, "step": 74340 }, { "epoch": 89.68678334339167, "grad_norm": 4.686694622039795, "learning_rate": 1.9998208707280927e-05, "loss": 0.0513, "step": 74350 }, { "epoch": 89.69885334942667, "grad_norm": 5.546543121337891, "learning_rate": 1.9998208466026133e-05, "loss": 0.0543, "step": 74360 }, { "epoch": 89.71092335546167, "grad_norm": 5.810861110687256, "learning_rate": 1.999820822477134e-05, "loss": 0.0549, "step": 74370 }, { "epoch": 89.72299336149668, "grad_norm": 4.930561065673828, "learning_rate": 1.999820798351655e-05, "loss": 0.0546, "step": 74380 }, { "epoch": 89.73506336753168, "grad_norm": 4.984217643737793, "learning_rate": 1.9998207742261755e-05, "loss": 0.054, "step": 74390 }, { "epoch": 89.74713337356668, "grad_norm": 5.28961706161499, "learning_rate": 1.999820750100696e-05, "loss": 0.0564, "step": 74400 }, { "epoch": 89.75920337960169, "grad_norm": 5.453330993652344, "learning_rate": 1.9998207259752167e-05, "loss": 0.0576, "step": 74410 }, { "epoch": 89.77127338563669, "grad_norm": 5.107643127441406, "learning_rate": 1.9998207018497374e-05, "loss": 0.0542, "step": 74420 }, { "epoch": 89.78334339167169, "grad_norm": 5.490441799163818, "learning_rate": 1.999820677724258e-05, "loss": 0.0554, "step": 74430 }, { "epoch": 89.7954133977067, "grad_norm": 4.606841087341309, "learning_rate": 1.9998206535987786e-05, "loss": 0.0529, "step": 74440 }, { "epoch": 89.8074834037417, "grad_norm": 5.097328186035156, "learning_rate": 1.9998206294732992e-05, "loss": 0.057, "step": 74450 }, { "epoch": 89.8195534097767, "grad_norm": 4.755586624145508, 
"learning_rate": 1.99982060534782e-05, "loss": 0.0521, "step": 74460 }, { "epoch": 89.8316234158117, "grad_norm": 4.974583625793457, "learning_rate": 1.9998205812223405e-05, "loss": 0.0535, "step": 74470 }, { "epoch": 89.84369342184671, "grad_norm": 5.200186729431152, "learning_rate": 1.999820557096861e-05, "loss": 0.0554, "step": 74480 }, { "epoch": 89.85576342788171, "grad_norm": 4.920487403869629, "learning_rate": 1.9998205329713817e-05, "loss": 0.0537, "step": 74490 }, { "epoch": 89.86783343391672, "grad_norm": 4.934639930725098, "learning_rate": 1.9998205088459023e-05, "loss": 0.0556, "step": 74500 }, { "epoch": 89.86783343391672, "eval_loss": 12.701250076293945, "eval_runtime": 8.1268, "eval_samples_per_second": 85.766, "eval_steps_per_second": 10.828, "step": 74500 }, { "epoch": 89.87990343995172, "grad_norm": 5.002471446990967, "learning_rate": 1.999820484720423e-05, "loss": 0.0546, "step": 74510 }, { "epoch": 89.89197344598672, "grad_norm": 5.2241129875183105, "learning_rate": 1.9998204605949432e-05, "loss": 0.0545, "step": 74520 }, { "epoch": 89.90404345202172, "grad_norm": 5.019325256347656, "learning_rate": 1.999820436469464e-05, "loss": 0.0548, "step": 74530 }, { "epoch": 89.91611345805673, "grad_norm": 5.347335338592529, "learning_rate": 1.9998204123439845e-05, "loss": 0.0579, "step": 74540 }, { "epoch": 89.92818346409173, "grad_norm": 5.435967445373535, "learning_rate": 1.999820388218505e-05, "loss": 0.0554, "step": 74550 }, { "epoch": 89.94025347012673, "grad_norm": 5.298644542694092, "learning_rate": 1.9998203640930257e-05, "loss": 0.0579, "step": 74560 }, { "epoch": 89.95232347616174, "grad_norm": 4.892943382263184, "learning_rate": 1.9998203399675463e-05, "loss": 0.0549, "step": 74570 }, { "epoch": 89.96439348219674, "grad_norm": 5.250544548034668, "learning_rate": 1.999820315842067e-05, "loss": 0.0552, "step": 74580 }, { "epoch": 89.97646348823174, "grad_norm": 5.199073791503906, "learning_rate": 1.9998202917165876e-05, "loss": 0.0563, "step": 
74590 }, { "epoch": 89.98853349426675, "grad_norm": 4.76865291595459, "learning_rate": 1.9998202675911082e-05, "loss": 0.0553, "step": 74600 }, { "epoch": 90.0, "grad_norm": 8.44915771484375, "learning_rate": 1.9998202434656288e-05, "loss": 0.0572, "step": 74610 }, { "epoch": 90.012070006035, "grad_norm": 4.051071643829346, "learning_rate": 1.9998202193401494e-05, "loss": 0.0402, "step": 74620 }, { "epoch": 90.02414001207, "grad_norm": 4.392811298370361, "learning_rate": 1.99982019521467e-05, "loss": 0.0383, "step": 74630 }, { "epoch": 90.03621001810501, "grad_norm": 4.186489105224609, "learning_rate": 1.9998201710891907e-05, "loss": 0.0402, "step": 74640 }, { "epoch": 90.04828002414001, "grad_norm": 4.391363143920898, "learning_rate": 1.9998201469637113e-05, "loss": 0.0423, "step": 74650 }, { "epoch": 90.06035003017502, "grad_norm": 4.212307929992676, "learning_rate": 1.999820122838232e-05, "loss": 0.0435, "step": 74660 }, { "epoch": 90.07242003621002, "grad_norm": 4.428682804107666, "learning_rate": 1.9998200987127526e-05, "loss": 0.0417, "step": 74670 }, { "epoch": 90.08449004224502, "grad_norm": 4.320430755615234, "learning_rate": 1.9998200745872732e-05, "loss": 0.0425, "step": 74680 }, { "epoch": 90.09656004828003, "grad_norm": 4.6475114822387695, "learning_rate": 1.9998200504617938e-05, "loss": 0.0448, "step": 74690 }, { "epoch": 90.10863005431503, "grad_norm": 4.395731449127197, "learning_rate": 1.9998200263363144e-05, "loss": 0.0435, "step": 74700 }, { "epoch": 90.12070006035003, "grad_norm": 4.686784267425537, "learning_rate": 1.999820002210835e-05, "loss": 0.0422, "step": 74710 }, { "epoch": 90.13277006638504, "grad_norm": 5.035346984863281, "learning_rate": 1.9998199780853557e-05, "loss": 0.0456, "step": 74720 }, { "epoch": 90.14484007242004, "grad_norm": 4.411844253540039, "learning_rate": 1.9998199539598763e-05, "loss": 0.0444, "step": 74730 }, { "epoch": 90.15691007845504, "grad_norm": 4.755009174346924, "learning_rate": 1.999819929834397e-05, "loss": 
0.0447, "step": 74740 }, { "epoch": 90.16898008449004, "grad_norm": 4.877163410186768, "learning_rate": 1.9998199057089175e-05, "loss": 0.0464, "step": 74750 }, { "epoch": 90.18105009052505, "grad_norm": 4.71695613861084, "learning_rate": 1.999819881583438e-05, "loss": 0.0448, "step": 74760 }, { "epoch": 90.19312009656005, "grad_norm": 4.222686290740967, "learning_rate": 1.9998198574579584e-05, "loss": 0.0457, "step": 74770 }, { "epoch": 90.20519010259505, "grad_norm": 4.48451566696167, "learning_rate": 1.999819833332479e-05, "loss": 0.0463, "step": 74780 }, { "epoch": 90.21726010863006, "grad_norm": 4.784078121185303, "learning_rate": 1.9998198092069997e-05, "loss": 0.0478, "step": 74790 }, { "epoch": 90.22933011466506, "grad_norm": 4.460050106048584, "learning_rate": 1.9998197850815203e-05, "loss": 0.0484, "step": 74800 }, { "epoch": 90.24140012070006, "grad_norm": 4.583319664001465, "learning_rate": 1.999819760956041e-05, "loss": 0.0453, "step": 74810 }, { "epoch": 90.25347012673507, "grad_norm": 4.73393440246582, "learning_rate": 1.9998197368305615e-05, "loss": 0.0447, "step": 74820 }, { "epoch": 90.26554013277007, "grad_norm": 4.466148376464844, "learning_rate": 1.999819712705082e-05, "loss": 0.0483, "step": 74830 }, { "epoch": 90.27761013880507, "grad_norm": 5.001640796661377, "learning_rate": 1.9998196885796028e-05, "loss": 0.0483, "step": 74840 }, { "epoch": 90.28968014484008, "grad_norm": 4.5627121925354, "learning_rate": 1.9998196644541234e-05, "loss": 0.0482, "step": 74850 }, { "epoch": 90.30175015087508, "grad_norm": 4.815404415130615, "learning_rate": 1.999819640328644e-05, "loss": 0.0495, "step": 74860 }, { "epoch": 90.31382015691008, "grad_norm": 4.765660762786865, "learning_rate": 1.9998196162031646e-05, "loss": 0.0491, "step": 74870 }, { "epoch": 90.32589016294509, "grad_norm": 4.516696929931641, "learning_rate": 1.9998195920776853e-05, "loss": 0.0483, "step": 74880 }, { "epoch": 90.33796016898009, "grad_norm": 4.843338489532471, "learning_rate": 
1.999819567952206e-05, "loss": 0.0499, "step": 74890 }, { "epoch": 90.35003017501509, "grad_norm": 4.979863166809082, "learning_rate": 1.9998195438267265e-05, "loss": 0.0484, "step": 74900 }, { "epoch": 90.3621001810501, "grad_norm": 4.741300106048584, "learning_rate": 1.999819519701247e-05, "loss": 0.0463, "step": 74910 }, { "epoch": 90.3741701870851, "grad_norm": 5.128345489501953, "learning_rate": 1.999819495575768e-05, "loss": 0.0513, "step": 74920 }, { "epoch": 90.3862401931201, "grad_norm": 4.786336898803711, "learning_rate": 1.9998194714502884e-05, "loss": 0.0482, "step": 74930 }, { "epoch": 90.3983101991551, "grad_norm": 4.65226936340332, "learning_rate": 1.999819447324809e-05, "loss": 0.0469, "step": 74940 }, { "epoch": 90.41038020519011, "grad_norm": 4.9135355949401855, "learning_rate": 1.9998194231993296e-05, "loss": 0.0482, "step": 74950 }, { "epoch": 90.42245021122511, "grad_norm": 4.5685200691223145, "learning_rate": 1.9998193990738502e-05, "loss": 0.0489, "step": 74960 }, { "epoch": 90.43452021726011, "grad_norm": 4.730215549468994, "learning_rate": 1.999819374948371e-05, "loss": 0.0496, "step": 74970 }, { "epoch": 90.44659022329512, "grad_norm": 4.581263542175293, "learning_rate": 1.9998193508228915e-05, "loss": 0.0505, "step": 74980 }, { "epoch": 90.45866022933012, "grad_norm": 4.749341011047363, "learning_rate": 1.999819326697412e-05, "loss": 0.0489, "step": 74990 }, { "epoch": 90.47073023536512, "grad_norm": 5.050985813140869, "learning_rate": 1.9998193025719327e-05, "loss": 0.0501, "step": 75000 }, { "epoch": 90.47073023536512, "eval_loss": 12.695817947387695, "eval_runtime": 8.1329, "eval_samples_per_second": 85.702, "eval_steps_per_second": 10.82, "step": 75000 }, { "epoch": 90.48280024140013, "grad_norm": 5.221072196960449, "learning_rate": 1.9998192784464534e-05, "loss": 0.0521, "step": 75010 }, { "epoch": 90.49487024743513, "grad_norm": 5.233987331390381, "learning_rate": 1.999819254320974e-05, "loss": 0.0516, "step": 75020 }, { "epoch": 
90.50694025347012, "grad_norm": 4.916378021240234, "learning_rate": 1.9998192301954946e-05, "loss": 0.0528, "step": 75030 }, { "epoch": 90.51901025950512, "grad_norm": 4.455299377441406, "learning_rate": 1.9998192060700152e-05, "loss": 0.0497, "step": 75040 }, { "epoch": 90.53108026554013, "grad_norm": 5.767943382263184, "learning_rate": 1.999819181944536e-05, "loss": 0.0523, "step": 75050 }, { "epoch": 90.54315027157513, "grad_norm": 5.135562896728516, "learning_rate": 1.9998191578190565e-05, "loss": 0.0503, "step": 75060 }, { "epoch": 90.55522027761013, "grad_norm": 5.243748664855957, "learning_rate": 1.999819133693577e-05, "loss": 0.0534, "step": 75070 }, { "epoch": 90.56729028364514, "grad_norm": 5.192594528198242, "learning_rate": 1.9998191095680977e-05, "loss": 0.0534, "step": 75080 }, { "epoch": 90.57936028968014, "grad_norm": 4.7128095626831055, "learning_rate": 1.9998190854426183e-05, "loss": 0.0514, "step": 75090 }, { "epoch": 90.59143029571514, "grad_norm": 5.471073150634766, "learning_rate": 1.999819061317139e-05, "loss": 0.0531, "step": 75100 }, { "epoch": 90.60350030175015, "grad_norm": 4.592616558074951, "learning_rate": 1.9998190371916596e-05, "loss": 0.0519, "step": 75110 }, { "epoch": 90.61557030778515, "grad_norm": 5.325773239135742, "learning_rate": 1.9998190130661802e-05, "loss": 0.0537, "step": 75120 }, { "epoch": 90.62764031382015, "grad_norm": 4.604344844818115, "learning_rate": 1.9998189889407008e-05, "loss": 0.0533, "step": 75130 }, { "epoch": 90.63971031985515, "grad_norm": 5.638186454772949, "learning_rate": 1.9998189648152214e-05, "loss": 0.0515, "step": 75140 }, { "epoch": 90.65178032589016, "grad_norm": 4.713808059692383, "learning_rate": 1.999818940689742e-05, "loss": 0.052, "step": 75150 }, { "epoch": 90.66385033192516, "grad_norm": 4.8130316734313965, "learning_rate": 1.9998189165642627e-05, "loss": 0.0527, "step": 75160 }, { "epoch": 90.67592033796016, "grad_norm": 4.647985458374023, "learning_rate": 1.9998188924387833e-05, 
"loss": 0.0528, "step": 75170 }, { "epoch": 90.68799034399517, "grad_norm": 4.872984886169434, "learning_rate": 1.9998188683133036e-05, "loss": 0.0552, "step": 75180 }, { "epoch": 90.70006035003017, "grad_norm": 4.739133834838867, "learning_rate": 1.9998188441878242e-05, "loss": 0.0508, "step": 75190 }, { "epoch": 90.71213035606517, "grad_norm": 5.4084906578063965, "learning_rate": 1.9998188200623448e-05, "loss": 0.0545, "step": 75200 }, { "epoch": 90.72420036210018, "grad_norm": 4.8767170906066895, "learning_rate": 1.9998187959368654e-05, "loss": 0.0514, "step": 75210 }, { "epoch": 90.73627036813518, "grad_norm": 4.635343074798584, "learning_rate": 1.999818771811386e-05, "loss": 0.0554, "step": 75220 }, { "epoch": 90.74834037417018, "grad_norm": 5.159951210021973, "learning_rate": 1.9998187476859067e-05, "loss": 0.0546, "step": 75230 }, { "epoch": 90.76041038020519, "grad_norm": 4.814116477966309, "learning_rate": 1.9998187235604273e-05, "loss": 0.0532, "step": 75240 }, { "epoch": 90.77248038624019, "grad_norm": 5.412287712097168, "learning_rate": 1.999818699434948e-05, "loss": 0.054, "step": 75250 }, { "epoch": 90.7845503922752, "grad_norm": 5.092670917510986, "learning_rate": 1.9998186753094686e-05, "loss": 0.0538, "step": 75260 }, { "epoch": 90.7966203983102, "grad_norm": 5.381921291351318, "learning_rate": 1.9998186511839892e-05, "loss": 0.0518, "step": 75270 }, { "epoch": 90.8086904043452, "grad_norm": 5.112898349761963, "learning_rate": 1.9998186270585098e-05, "loss": 0.0532, "step": 75280 }, { "epoch": 90.8207604103802, "grad_norm": 4.861541748046875, "learning_rate": 1.9998186029330304e-05, "loss": 0.0507, "step": 75290 }, { "epoch": 90.8328304164152, "grad_norm": 5.373168468475342, "learning_rate": 1.999818578807551e-05, "loss": 0.0537, "step": 75300 }, { "epoch": 90.84490042245021, "grad_norm": 5.316362380981445, "learning_rate": 1.9998185546820717e-05, "loss": 0.0549, "step": 75310 }, { "epoch": 90.85697042848521, "grad_norm": 4.838957786560059, 
"learning_rate": 1.9998185305565923e-05, "loss": 0.0522, "step": 75320 }, { "epoch": 90.86904043452022, "grad_norm": 5.052577972412109, "learning_rate": 1.999818506431113e-05, "loss": 0.056, "step": 75330 }, { "epoch": 90.88111044055522, "grad_norm": 5.709005832672119, "learning_rate": 1.9998184823056335e-05, "loss": 0.0545, "step": 75340 }, { "epoch": 90.89318044659022, "grad_norm": 5.4321208000183105, "learning_rate": 1.999818458180154e-05, "loss": 0.0548, "step": 75350 }, { "epoch": 90.90525045262522, "grad_norm": 5.680521011352539, "learning_rate": 1.9998184340546748e-05, "loss": 0.0553, "step": 75360 }, { "epoch": 90.91732045866023, "grad_norm": 5.729650020599365, "learning_rate": 1.9998184099291954e-05, "loss": 0.0556, "step": 75370 }, { "epoch": 90.92939046469523, "grad_norm": 4.498037338256836, "learning_rate": 1.999818385803716e-05, "loss": 0.0539, "step": 75380 }, { "epoch": 90.94146047073023, "grad_norm": 5.407823085784912, "learning_rate": 1.9998183616782366e-05, "loss": 0.0541, "step": 75390 }, { "epoch": 90.95353047676524, "grad_norm": 5.247045993804932, "learning_rate": 1.9998183375527573e-05, "loss": 0.0559, "step": 75400 }, { "epoch": 90.96560048280024, "grad_norm": 5.104113578796387, "learning_rate": 1.999818313427278e-05, "loss": 0.0555, "step": 75410 }, { "epoch": 90.97767048883524, "grad_norm": 5.1253790855407715, "learning_rate": 1.9998182893017985e-05, "loss": 0.0567, "step": 75420 }, { "epoch": 90.98974049487025, "grad_norm": 5.2555341720581055, "learning_rate": 1.9998182651763188e-05, "loss": 0.0548, "step": 75430 }, { "epoch": 91.0012070006035, "grad_norm": 4.370479106903076, "learning_rate": 1.9998182410508394e-05, "loss": 0.0566, "step": 75440 }, { "epoch": 91.0132770066385, "grad_norm": 3.982452392578125, "learning_rate": 1.99981821692536e-05, "loss": 0.0384, "step": 75450 }, { "epoch": 91.0253470126735, "grad_norm": 3.8637404441833496, "learning_rate": 1.999818192799881e-05, "loss": 0.0423, "step": 75460 }, { "epoch": 
91.03741701870851, "grad_norm": 4.978765964508057, "learning_rate": 1.9998181686744016e-05, "loss": 0.0423, "step": 75470 }, { "epoch": 91.04948702474351, "grad_norm": 4.690715312957764, "learning_rate": 1.9998181445489222e-05, "loss": 0.0415, "step": 75480 }, { "epoch": 91.06155703077852, "grad_norm": 4.758978366851807, "learning_rate": 1.999818120423443e-05, "loss": 0.0419, "step": 75490 }, { "epoch": 91.07362703681352, "grad_norm": 4.275250434875488, "learning_rate": 1.9998180962979635e-05, "loss": 0.0435, "step": 75500 }, { "epoch": 91.07362703681352, "eval_loss": 12.71070384979248, "eval_runtime": 8.1333, "eval_samples_per_second": 85.697, "eval_steps_per_second": 10.82, "step": 75500 }, { "epoch": 91.08569704284852, "grad_norm": 4.648966312408447, "learning_rate": 1.999818072172484e-05, "loss": 0.0421, "step": 75510 }, { "epoch": 91.09776704888353, "grad_norm": 4.225071430206299, "learning_rate": 1.9998180480470047e-05, "loss": 0.0434, "step": 75520 }, { "epoch": 91.10983705491853, "grad_norm": 3.9418649673461914, "learning_rate": 1.9998180239215253e-05, "loss": 0.0433, "step": 75530 }, { "epoch": 91.12190706095353, "grad_norm": 4.9614481925964355, "learning_rate": 1.999817999796046e-05, "loss": 0.0435, "step": 75540 }, { "epoch": 91.13397706698854, "grad_norm": 4.081015110015869, "learning_rate": 1.9998179756705666e-05, "loss": 0.044, "step": 75550 }, { "epoch": 91.14604707302354, "grad_norm": 3.8228890895843506, "learning_rate": 1.9998179515450872e-05, "loss": 0.0438, "step": 75560 }, { "epoch": 91.15811707905854, "grad_norm": 4.769464015960693, "learning_rate": 1.9998179274196078e-05, "loss": 0.0445, "step": 75570 }, { "epoch": 91.17018708509354, "grad_norm": 4.469377040863037, "learning_rate": 1.9998179032941284e-05, "loss": 0.0461, "step": 75580 }, { "epoch": 91.18225709112855, "grad_norm": 5.692521572113037, "learning_rate": 1.9998178791686487e-05, "loss": 0.0464, "step": 75590 }, { "epoch": 91.19432709716355, "grad_norm": 4.799747467041016, 
"learning_rate": 1.9998178550431693e-05, "loss": 0.0454, "step": 75600 }, { "epoch": 91.20639710319855, "grad_norm": 4.5464911460876465, "learning_rate": 1.99981783091769e-05, "loss": 0.0456, "step": 75610 }, { "epoch": 91.21846710923356, "grad_norm": 4.5774431228637695, "learning_rate": 1.9998178067922106e-05, "loss": 0.0465, "step": 75620 }, { "epoch": 91.23053711526856, "grad_norm": 4.827582836151123, "learning_rate": 1.9998177826667312e-05, "loss": 0.0499, "step": 75630 }, { "epoch": 91.24260712130356, "grad_norm": 4.663225173950195, "learning_rate": 1.9998177585412518e-05, "loss": 0.0494, "step": 75640 }, { "epoch": 91.25467712733857, "grad_norm": 4.620285511016846, "learning_rate": 1.9998177344157725e-05, "loss": 0.0469, "step": 75650 }, { "epoch": 91.26674713337357, "grad_norm": 4.962461471557617, "learning_rate": 1.999817710290293e-05, "loss": 0.0462, "step": 75660 }, { "epoch": 91.27881713940857, "grad_norm": 5.288265705108643, "learning_rate": 1.9998176861648137e-05, "loss": 0.0493, "step": 75670 }, { "epoch": 91.29088714544358, "grad_norm": 4.683196067810059, "learning_rate": 1.9998176620393343e-05, "loss": 0.0497, "step": 75680 }, { "epoch": 91.30295715147858, "grad_norm": 5.302506446838379, "learning_rate": 1.999817637913855e-05, "loss": 0.0481, "step": 75690 }, { "epoch": 91.31502715751358, "grad_norm": 4.0471272468566895, "learning_rate": 1.9998176137883756e-05, "loss": 0.0486, "step": 75700 }, { "epoch": 91.32709716354859, "grad_norm": 5.075802326202393, "learning_rate": 1.9998175896628962e-05, "loss": 0.0489, "step": 75710 }, { "epoch": 91.33916716958359, "grad_norm": 4.968672752380371, "learning_rate": 1.9998175655374168e-05, "loss": 0.0481, "step": 75720 }, { "epoch": 91.35123717561859, "grad_norm": 4.436751365661621, "learning_rate": 1.9998175414119374e-05, "loss": 0.049, "step": 75730 }, { "epoch": 91.3633071816536, "grad_norm": 4.747427463531494, "learning_rate": 1.999817517286458e-05, "loss": 0.0503, "step": 75740 }, { "epoch": 
91.3753771876886, "grad_norm": 5.073152542114258, "learning_rate": 1.9998174931609787e-05, "loss": 0.0497, "step": 75750 }, { "epoch": 91.3874471937236, "grad_norm": 5.025700092315674, "learning_rate": 1.9998174690354993e-05, "loss": 0.0511, "step": 75760 }, { "epoch": 91.3995171997586, "grad_norm": 4.260638236999512, "learning_rate": 1.99981744491002e-05, "loss": 0.0477, "step": 75770 }, { "epoch": 91.41158720579361, "grad_norm": 5.17510461807251, "learning_rate": 1.9998174207845405e-05, "loss": 0.0503, "step": 75780 }, { "epoch": 91.42365721182861, "grad_norm": 4.652052879333496, "learning_rate": 1.999817396659061e-05, "loss": 0.0503, "step": 75790 }, { "epoch": 91.43572721786362, "grad_norm": 5.387064456939697, "learning_rate": 1.9998173725335818e-05, "loss": 0.0503, "step": 75800 }, { "epoch": 91.44779722389862, "grad_norm": 4.606984615325928, "learning_rate": 1.9998173484081024e-05, "loss": 0.0493, "step": 75810 }, { "epoch": 91.45986722993362, "grad_norm": 4.69240665435791, "learning_rate": 1.999817324282623e-05, "loss": 0.0509, "step": 75820 }, { "epoch": 91.47193723596862, "grad_norm": 4.752045631408691, "learning_rate": 1.9998173001571436e-05, "loss": 0.0482, "step": 75830 }, { "epoch": 91.48400724200363, "grad_norm": 4.796882152557373, "learning_rate": 1.999817276031664e-05, "loss": 0.0494, "step": 75840 }, { "epoch": 91.49607724803863, "grad_norm": 5.564022064208984, "learning_rate": 1.9998172519061845e-05, "loss": 0.0509, "step": 75850 }, { "epoch": 91.50814725407362, "grad_norm": 4.655203819274902, "learning_rate": 1.999817227780705e-05, "loss": 0.0511, "step": 75860 }, { "epoch": 91.52021726010862, "grad_norm": 4.73676872253418, "learning_rate": 1.9998172036552258e-05, "loss": 0.049, "step": 75870 }, { "epoch": 91.53228726614363, "grad_norm": 4.86414098739624, "learning_rate": 1.9998171795297464e-05, "loss": 0.051, "step": 75880 }, { "epoch": 91.54435727217863, "grad_norm": 5.451549053192139, "learning_rate": 1.999817155404267e-05, "loss": 0.0503, 
"step": 75890 }, { "epoch": 91.55642727821363, "grad_norm": 5.283411979675293, "learning_rate": 1.9998171312787877e-05, "loss": 0.0513, "step": 75900 }, { "epoch": 91.56849728424864, "grad_norm": 4.628707408905029, "learning_rate": 1.9998171071533083e-05, "loss": 0.0532, "step": 75910 }, { "epoch": 91.58056729028364, "grad_norm": 4.679083824157715, "learning_rate": 1.999817083027829e-05, "loss": 0.0485, "step": 75920 }, { "epoch": 91.59263729631864, "grad_norm": 4.734099388122559, "learning_rate": 1.9998170589023495e-05, "loss": 0.0511, "step": 75930 }, { "epoch": 91.60470730235365, "grad_norm": 4.795932292938232, "learning_rate": 1.99981703477687e-05, "loss": 0.0498, "step": 75940 }, { "epoch": 91.61677730838865, "grad_norm": 4.328112602233887, "learning_rate": 1.9998170106513908e-05, "loss": 0.051, "step": 75950 }, { "epoch": 91.62884731442365, "grad_norm": 5.258892059326172, "learning_rate": 1.9998169865259114e-05, "loss": 0.0535, "step": 75960 }, { "epoch": 91.64091732045866, "grad_norm": 5.462223529815674, "learning_rate": 1.999816962400432e-05, "loss": 0.0522, "step": 75970 }, { "epoch": 91.65298732649366, "grad_norm": 4.53530216217041, "learning_rate": 1.9998169382749526e-05, "loss": 0.0504, "step": 75980 }, { "epoch": 91.66505733252866, "grad_norm": 4.6105427742004395, "learning_rate": 1.9998169141494732e-05, "loss": 0.0513, "step": 75990 }, { "epoch": 91.67712733856366, "grad_norm": 4.372708320617676, "learning_rate": 1.9998168900239942e-05, "loss": 0.0524, "step": 76000 }, { "epoch": 91.67712733856366, "eval_loss": 12.731620788574219, "eval_runtime": 8.1386, "eval_samples_per_second": 85.641, "eval_steps_per_second": 10.813, "step": 76000 }, { "epoch": 91.68919734459867, "grad_norm": 5.012252330780029, "learning_rate": 1.9998168658985145e-05, "loss": 0.0536, "step": 76010 }, { "epoch": 91.70126735063367, "grad_norm": 5.336565017700195, "learning_rate": 1.999816841773035e-05, "loss": 0.0532, "step": 76020 }, { "epoch": 91.71333735666867, "grad_norm": 
5.347482681274414, "learning_rate": 1.9998168176475557e-05, "loss": 0.0542, "step": 76030 }, { "epoch": 91.72540736270368, "grad_norm": 4.50571870803833, "learning_rate": 1.9998167935220764e-05, "loss": 0.0529, "step": 76040 }, { "epoch": 91.73747736873868, "grad_norm": 5.0140838623046875, "learning_rate": 1.999816769396597e-05, "loss": 0.0525, "step": 76050 }, { "epoch": 91.74954737477368, "grad_norm": 5.329376697540283, "learning_rate": 1.9998167452711176e-05, "loss": 0.052, "step": 76060 }, { "epoch": 91.76161738080869, "grad_norm": 4.759077072143555, "learning_rate": 1.9998167211456382e-05, "loss": 0.0516, "step": 76070 }, { "epoch": 91.77368738684369, "grad_norm": 5.140562534332275, "learning_rate": 1.999816697020159e-05, "loss": 0.0558, "step": 76080 }, { "epoch": 91.7857573928787, "grad_norm": 4.594762802124023, "learning_rate": 1.9998166728946795e-05, "loss": 0.0519, "step": 76090 }, { "epoch": 91.7978273989137, "grad_norm": 4.603973388671875, "learning_rate": 1.9998166487692e-05, "loss": 0.0525, "step": 76100 }, { "epoch": 91.8098974049487, "grad_norm": 5.23793888092041, "learning_rate": 1.9998166246437207e-05, "loss": 0.0519, "step": 76110 }, { "epoch": 91.8219674109837, "grad_norm": 5.026486396789551, "learning_rate": 1.9998166005182413e-05, "loss": 0.0512, "step": 76120 }, { "epoch": 91.8340374170187, "grad_norm": 4.955781936645508, "learning_rate": 1.999816576392762e-05, "loss": 0.054, "step": 76130 }, { "epoch": 91.84610742305371, "grad_norm": 5.293256759643555, "learning_rate": 1.9998165522672826e-05, "loss": 0.0551, "step": 76140 }, { "epoch": 91.85817742908871, "grad_norm": 5.0725836753845215, "learning_rate": 1.9998165281418032e-05, "loss": 0.0554, "step": 76150 }, { "epoch": 91.87024743512372, "grad_norm": 5.652973175048828, "learning_rate": 1.9998165040163238e-05, "loss": 0.0549, "step": 76160 }, { "epoch": 91.88231744115872, "grad_norm": 6.028936862945557, "learning_rate": 1.9998164798908444e-05, "loss": 0.0519, "step": 76170 }, { "epoch": 
91.89438744719372, "grad_norm": 5.685543060302734, "learning_rate": 1.999816455765365e-05, "loss": 0.0565, "step": 76180 }, { "epoch": 91.90645745322873, "grad_norm": 5.556987762451172, "learning_rate": 1.9998164316398857e-05, "loss": 0.055, "step": 76190 }, { "epoch": 91.91852745926373, "grad_norm": 5.47572135925293, "learning_rate": 1.9998164075144063e-05, "loss": 0.0546, "step": 76200 }, { "epoch": 91.93059746529873, "grad_norm": 5.539262294769287, "learning_rate": 1.999816383388927e-05, "loss": 0.0549, "step": 76210 }, { "epoch": 91.94266747133373, "grad_norm": 5.318014144897461, "learning_rate": 1.9998163592634475e-05, "loss": 0.0566, "step": 76220 }, { "epoch": 91.95473747736874, "grad_norm": 5.361958026885986, "learning_rate": 1.999816335137968e-05, "loss": 0.0568, "step": 76230 }, { "epoch": 91.96680748340374, "grad_norm": 5.3523454666137695, "learning_rate": 1.9998163110124888e-05, "loss": 0.0522, "step": 76240 }, { "epoch": 91.97887748943874, "grad_norm": 4.824337959289551, "learning_rate": 1.9998162868870094e-05, "loss": 0.0547, "step": 76250 }, { "epoch": 91.99094749547375, "grad_norm": 5.149827480316162, "learning_rate": 1.9998162627615297e-05, "loss": 0.0551, "step": 76260 }, { "epoch": 92.002414001207, "grad_norm": 4.420403957366943, "learning_rate": 1.9998162386360503e-05, "loss": 0.0523, "step": 76270 }, { "epoch": 92.014484007242, "grad_norm": 4.1343231201171875, "learning_rate": 1.999816214510571e-05, "loss": 0.0398, "step": 76280 }, { "epoch": 92.026554013277, "grad_norm": 3.7454066276550293, "learning_rate": 1.9998161903850916e-05, "loss": 0.041, "step": 76290 }, { "epoch": 92.03862401931201, "grad_norm": 4.237465858459473, "learning_rate": 1.9998161662596122e-05, "loss": 0.0405, "step": 76300 }, { "epoch": 92.05069402534701, "grad_norm": 3.8227744102478027, "learning_rate": 1.9998161421341328e-05, "loss": 0.0392, "step": 76310 }, { "epoch": 92.06276403138202, "grad_norm": 4.752642631530762, "learning_rate": 1.9998161180086534e-05, "loss": 
0.0425, "step": 76320 }, { "epoch": 92.07483403741702, "grad_norm": 5.377229690551758, "learning_rate": 1.999816093883174e-05, "loss": 0.0411, "step": 76330 }, { "epoch": 92.08690404345202, "grad_norm": 4.391862392425537, "learning_rate": 1.9998160697576947e-05, "loss": 0.0422, "step": 76340 }, { "epoch": 92.09897404948703, "grad_norm": 4.18865966796875, "learning_rate": 1.9998160456322153e-05, "loss": 0.044, "step": 76350 }, { "epoch": 92.11104405552203, "grad_norm": 4.669145584106445, "learning_rate": 1.999816021506736e-05, "loss": 0.0419, "step": 76360 }, { "epoch": 92.12311406155703, "grad_norm": 4.444820880889893, "learning_rate": 1.9998159973812565e-05, "loss": 0.0436, "step": 76370 }, { "epoch": 92.13518406759204, "grad_norm": 4.322792053222656, "learning_rate": 1.999815973255777e-05, "loss": 0.0457, "step": 76380 }, { "epoch": 92.14725407362704, "grad_norm": 4.383181571960449, "learning_rate": 1.9998159491302978e-05, "loss": 0.0445, "step": 76390 }, { "epoch": 92.15932407966204, "grad_norm": 4.733088970184326, "learning_rate": 1.9998159250048184e-05, "loss": 0.0428, "step": 76400 }, { "epoch": 92.17139408569705, "grad_norm": 4.668094635009766, "learning_rate": 1.999815900879339e-05, "loss": 0.0461, "step": 76410 }, { "epoch": 92.18346409173205, "grad_norm": 4.498551368713379, "learning_rate": 1.9998158767538596e-05, "loss": 0.0462, "step": 76420 }, { "epoch": 92.19553409776705, "grad_norm": 4.473941326141357, "learning_rate": 1.9998158526283803e-05, "loss": 0.0458, "step": 76430 }, { "epoch": 92.20760410380205, "grad_norm": 4.764701843261719, "learning_rate": 1.999815828502901e-05, "loss": 0.0443, "step": 76440 }, { "epoch": 92.21967410983706, "grad_norm": 4.244599342346191, "learning_rate": 1.9998158043774215e-05, "loss": 0.045, "step": 76450 }, { "epoch": 92.23174411587206, "grad_norm": 4.709653377532959, "learning_rate": 1.999815780251942e-05, "loss": 0.0493, "step": 76460 }, { "epoch": 92.24381412190706, "grad_norm": 4.755812644958496, "learning_rate": 
1.9998157561264627e-05, "loss": 0.044, "step": 76470 }, { "epoch": 92.25588412794207, "grad_norm": 4.609304428100586, "learning_rate": 1.9998157320009834e-05, "loss": 0.046, "step": 76480 }, { "epoch": 92.26795413397707, "grad_norm": 5.018311500549316, "learning_rate": 1.999815707875504e-05, "loss": 0.0466, "step": 76490 }, { "epoch": 92.28002414001207, "grad_norm": 4.4928669929504395, "learning_rate": 1.9998156837500246e-05, "loss": 0.0464, "step": 76500 }, { "epoch": 92.28002414001207, "eval_loss": 12.740290641784668, "eval_runtime": 8.1315, "eval_samples_per_second": 85.717, "eval_steps_per_second": 10.822, "step": 76500 }, { "epoch": 92.29209414604708, "grad_norm": 4.7012128829956055, "learning_rate": 1.999815659624545e-05, "loss": 0.0478, "step": 76510 }, { "epoch": 92.30416415208208, "grad_norm": 4.21353006362915, "learning_rate": 1.9998156354990655e-05, "loss": 0.047, "step": 76520 }, { "epoch": 92.31623415811708, "grad_norm": 4.758325099945068, "learning_rate": 1.999815611373586e-05, "loss": 0.0492, "step": 76530 }, { "epoch": 92.32830416415209, "grad_norm": 4.819988250732422, "learning_rate": 1.999815587248107e-05, "loss": 0.0486, "step": 76540 }, { "epoch": 92.34037417018709, "grad_norm": 5.25255012512207, "learning_rate": 1.9998155631226277e-05, "loss": 0.0473, "step": 76550 }, { "epoch": 92.3524441762221, "grad_norm": 4.618163585662842, "learning_rate": 1.9998155389971483e-05, "loss": 0.0491, "step": 76560 }, { "epoch": 92.3645141822571, "grad_norm": 4.566854953765869, "learning_rate": 1.999815514871669e-05, "loss": 0.0455, "step": 76570 }, { "epoch": 92.3765841882921, "grad_norm": 4.864655494689941, "learning_rate": 1.9998154907461896e-05, "loss": 0.0504, "step": 76580 }, { "epoch": 92.3886541943271, "grad_norm": 5.1109771728515625, "learning_rate": 1.9998154666207102e-05, "loss": 0.0481, "step": 76590 }, { "epoch": 92.4007242003621, "grad_norm": 5.192226886749268, "learning_rate": 1.9998154424952308e-05, "loss": 0.0496, "step": 76600 }, { "epoch": 
92.41279420639711, "grad_norm": 4.926996231079102, "learning_rate": 1.9998154183697514e-05, "loss": 0.0488, "step": 76610 }, { "epoch": 92.42486421243211, "grad_norm": 5.034032344818115, "learning_rate": 1.999815394244272e-05, "loss": 0.0507, "step": 76620 }, { "epoch": 92.43693421846712, "grad_norm": 4.701093673706055, "learning_rate": 1.9998153701187927e-05, "loss": 0.0494, "step": 76630 }, { "epoch": 92.44900422450212, "grad_norm": 4.40669059753418, "learning_rate": 1.9998153459933133e-05, "loss": 0.0486, "step": 76640 }, { "epoch": 92.46107423053712, "grad_norm": 4.203983306884766, "learning_rate": 1.999815321867834e-05, "loss": 0.0485, "step": 76650 }, { "epoch": 92.47314423657213, "grad_norm": 4.716643333435059, "learning_rate": 1.9998152977423545e-05, "loss": 0.0497, "step": 76660 }, { "epoch": 92.48521424260713, "grad_norm": 4.953446388244629, "learning_rate": 1.999815273616875e-05, "loss": 0.0497, "step": 76670 }, { "epoch": 92.49728424864213, "grad_norm": 4.427473068237305, "learning_rate": 1.9998152494913955e-05, "loss": 0.0471, "step": 76680 }, { "epoch": 92.50935425467712, "grad_norm": 5.016950607299805, "learning_rate": 1.999815225365916e-05, "loss": 0.0504, "step": 76690 }, { "epoch": 92.52142426071212, "grad_norm": 4.924071311950684, "learning_rate": 1.9998152012404367e-05, "loss": 0.0491, "step": 76700 }, { "epoch": 92.53349426674713, "grad_norm": 4.883175373077393, "learning_rate": 1.9998151771149573e-05, "loss": 0.0483, "step": 76710 }, { "epoch": 92.54556427278213, "grad_norm": 4.903011798858643, "learning_rate": 1.999815152989478e-05, "loss": 0.0516, "step": 76720 }, { "epoch": 92.55763427881713, "grad_norm": 5.198662757873535, "learning_rate": 1.9998151288639986e-05, "loss": 0.052, "step": 76730 }, { "epoch": 92.56970428485214, "grad_norm": 4.769882678985596, "learning_rate": 1.9998151047385192e-05, "loss": 0.0482, "step": 76740 }, { "epoch": 92.58177429088714, "grad_norm": 5.015952110290527, "learning_rate": 1.9998150806130398e-05, "loss": 
0.0514, "step": 76750 }, { "epoch": 92.59384429692214, "grad_norm": 5.380181312561035, "learning_rate": 1.9998150564875604e-05, "loss": 0.0519, "step": 76760 }, { "epoch": 92.60591430295715, "grad_norm": 5.416763782501221, "learning_rate": 1.999815032362081e-05, "loss": 0.052, "step": 76770 }, { "epoch": 92.61798430899215, "grad_norm": 4.884822845458984, "learning_rate": 1.9998150082366017e-05, "loss": 0.0529, "step": 76780 }, { "epoch": 92.63005431502715, "grad_norm": 4.973453521728516, "learning_rate": 1.9998149841111223e-05, "loss": 0.0529, "step": 76790 }, { "epoch": 92.64212432106216, "grad_norm": 4.283870220184326, "learning_rate": 1.999814959985643e-05, "loss": 0.0543, "step": 76800 }, { "epoch": 92.65419432709716, "grad_norm": 4.652189254760742, "learning_rate": 1.9998149358601635e-05, "loss": 0.053, "step": 76810 }, { "epoch": 92.66626433313216, "grad_norm": 4.92473030090332, "learning_rate": 1.999814911734684e-05, "loss": 0.0527, "step": 76820 }, { "epoch": 92.67833433916717, "grad_norm": 4.553295612335205, "learning_rate": 1.9998148876092048e-05, "loss": 0.0502, "step": 76830 }, { "epoch": 92.69040434520217, "grad_norm": 4.9464216232299805, "learning_rate": 1.9998148634837254e-05, "loss": 0.0524, "step": 76840 }, { "epoch": 92.70247435123717, "grad_norm": 5.262056350708008, "learning_rate": 1.999814839358246e-05, "loss": 0.0507, "step": 76850 }, { "epoch": 92.71454435727217, "grad_norm": 5.053805351257324, "learning_rate": 1.9998148152327666e-05, "loss": 0.0533, "step": 76860 }, { "epoch": 92.72661436330718, "grad_norm": 4.606949329376221, "learning_rate": 1.9998147911072873e-05, "loss": 0.0526, "step": 76870 }, { "epoch": 92.73868436934218, "grad_norm": 4.685070514678955, "learning_rate": 1.999814766981808e-05, "loss": 0.0504, "step": 76880 }, { "epoch": 92.75075437537718, "grad_norm": 4.993468761444092, "learning_rate": 1.9998147428563285e-05, "loss": 0.0537, "step": 76890 }, { "epoch": 92.76282438141219, "grad_norm": 4.8879618644714355, 
"learning_rate": 1.999814718730849e-05, "loss": 0.0525, "step": 76900 }, { "epoch": 92.77489438744719, "grad_norm": 4.900053977966309, "learning_rate": 1.9998146946053697e-05, "loss": 0.052, "step": 76910 }, { "epoch": 92.7869643934822, "grad_norm": 5.233870029449463, "learning_rate": 1.99981467047989e-05, "loss": 0.0524, "step": 76920 }, { "epoch": 92.7990343995172, "grad_norm": 5.478466987609863, "learning_rate": 1.9998146463544107e-05, "loss": 0.0559, "step": 76930 }, { "epoch": 92.8111044055522, "grad_norm": 4.934873104095459, "learning_rate": 1.9998146222289313e-05, "loss": 0.054, "step": 76940 }, { "epoch": 92.8231744115872, "grad_norm": 4.699859619140625, "learning_rate": 1.999814598103452e-05, "loss": 0.0529, "step": 76950 }, { "epoch": 92.8352444176222, "grad_norm": 5.002597332000732, "learning_rate": 1.9998145739779725e-05, "loss": 0.0536, "step": 76960 }, { "epoch": 92.84731442365721, "grad_norm": 5.054344654083252, "learning_rate": 1.999814549852493e-05, "loss": 0.0535, "step": 76970 }, { "epoch": 92.85938442969221, "grad_norm": 4.704006671905518, "learning_rate": 1.9998145257270138e-05, "loss": 0.0533, "step": 76980 }, { "epoch": 92.87145443572722, "grad_norm": 4.913228988647461, "learning_rate": 1.9998145016015344e-05, "loss": 0.053, "step": 76990 }, { "epoch": 92.88352444176222, "grad_norm": 4.793466567993164, "learning_rate": 1.999814477476055e-05, "loss": 0.0512, "step": 77000 }, { "epoch": 92.88352444176222, "eval_loss": 12.73913860321045, "eval_runtime": 8.1332, "eval_samples_per_second": 85.698, "eval_steps_per_second": 10.82, "step": 77000 }, { "epoch": 92.89559444779722, "grad_norm": 4.768026828765869, "learning_rate": 1.9998144533505756e-05, "loss": 0.0507, "step": 77010 }, { "epoch": 92.90766445383223, "grad_norm": 5.362266540527344, "learning_rate": 1.9998144292250962e-05, "loss": 0.0567, "step": 77020 }, { "epoch": 92.91973445986723, "grad_norm": 5.315586566925049, "learning_rate": 1.999814405099617e-05, "loss": 0.0528, "step": 77030 }, { 
"epoch": 92.93180446590223, "grad_norm": 5.0572404861450195, "learning_rate": 1.9998143809741375e-05, "loss": 0.055, "step": 77040 }, { "epoch": 92.94387447193724, "grad_norm": 5.248339653015137, "learning_rate": 1.999814356848658e-05, "loss": 0.0555, "step": 77050 }, { "epoch": 92.95594447797224, "grad_norm": 5.226670742034912, "learning_rate": 1.9998143327231787e-05, "loss": 0.0557, "step": 77060 }, { "epoch": 92.96801448400724, "grad_norm": 4.8821539878845215, "learning_rate": 1.9998143085976994e-05, "loss": 0.0547, "step": 77070 }, { "epoch": 92.98008449004224, "grad_norm": 4.980026721954346, "learning_rate": 1.9998142844722203e-05, "loss": 0.0523, "step": 77080 }, { "epoch": 92.99215449607725, "grad_norm": 5.036242961883545, "learning_rate": 1.9998142603467406e-05, "loss": 0.0552, "step": 77090 }, { "epoch": 93.0036210018105, "grad_norm": 4.717694282531738, "learning_rate": 1.9998142362212612e-05, "loss": 0.0527, "step": 77100 }, { "epoch": 93.0156910078455, "grad_norm": 3.901353597640991, "learning_rate": 1.999814212095782e-05, "loss": 0.0395, "step": 77110 }, { "epoch": 93.02776101388051, "grad_norm": 4.429618835449219, "learning_rate": 1.9998141879703025e-05, "loss": 0.0368, "step": 77120 }, { "epoch": 93.03983101991551, "grad_norm": 4.129220008850098, "learning_rate": 1.999814163844823e-05, "loss": 0.0399, "step": 77130 }, { "epoch": 93.05190102595051, "grad_norm": 4.315781116485596, "learning_rate": 1.9998141397193437e-05, "loss": 0.0421, "step": 77140 }, { "epoch": 93.06397103198552, "grad_norm": 4.622958183288574, "learning_rate": 1.9998141155938643e-05, "loss": 0.0431, "step": 77150 }, { "epoch": 93.07604103802052, "grad_norm": 4.402229309082031, "learning_rate": 1.999814091468385e-05, "loss": 0.0435, "step": 77160 }, { "epoch": 93.08811104405552, "grad_norm": 4.071615695953369, "learning_rate": 1.9998140673429056e-05, "loss": 0.0412, "step": 77170 }, { "epoch": 93.10018105009053, "grad_norm": 4.569180011749268, "learning_rate": 1.9998140432174262e-05, 
"loss": 0.0441, "step": 77180 }, { "epoch": 93.11225105612553, "grad_norm": 4.155158042907715, "learning_rate": 1.9998140190919468e-05, "loss": 0.0412, "step": 77190 }, { "epoch": 93.12432106216053, "grad_norm": 4.327630519866943, "learning_rate": 1.9998139949664674e-05, "loss": 0.0436, "step": 77200 }, { "epoch": 93.13639106819554, "grad_norm": 4.491708278656006, "learning_rate": 1.999813970840988e-05, "loss": 0.0438, "step": 77210 }, { "epoch": 93.14846107423054, "grad_norm": 4.58452033996582, "learning_rate": 1.9998139467155087e-05, "loss": 0.0452, "step": 77220 }, { "epoch": 93.16053108026554, "grad_norm": 4.304990291595459, "learning_rate": 1.9998139225900293e-05, "loss": 0.042, "step": 77230 }, { "epoch": 93.17260108630055, "grad_norm": 5.040531635284424, "learning_rate": 1.99981389846455e-05, "loss": 0.0448, "step": 77240 }, { "epoch": 93.18467109233555, "grad_norm": 3.975356340408325, "learning_rate": 1.9998138743390705e-05, "loss": 0.0443, "step": 77250 }, { "epoch": 93.19674109837055, "grad_norm": 4.746770858764648, "learning_rate": 1.999813850213591e-05, "loss": 0.0456, "step": 77260 }, { "epoch": 93.20881110440556, "grad_norm": 4.505163192749023, "learning_rate": 1.9998138260881118e-05, "loss": 0.0459, "step": 77270 }, { "epoch": 93.22088111044056, "grad_norm": 4.2240471839904785, "learning_rate": 1.9998138019626324e-05, "loss": 0.0454, "step": 77280 }, { "epoch": 93.23295111647556, "grad_norm": 4.214466571807861, "learning_rate": 1.999813777837153e-05, "loss": 0.0455, "step": 77290 }, { "epoch": 93.24502112251056, "grad_norm": 4.679049968719482, "learning_rate": 1.9998137537116737e-05, "loss": 0.0462, "step": 77300 }, { "epoch": 93.25709112854557, "grad_norm": 4.2018866539001465, "learning_rate": 1.9998137295861943e-05, "loss": 0.0457, "step": 77310 }, { "epoch": 93.26916113458057, "grad_norm": 4.930639743804932, "learning_rate": 1.999813705460715e-05, "loss": 0.0443, "step": 77320 }, { "epoch": 93.28123114061557, "grad_norm": 5.038966178894043, 
"learning_rate": 1.9998136813352355e-05, "loss": 0.0475, "step": 77330 }, { "epoch": 93.29330114665058, "grad_norm": 4.475462913513184, "learning_rate": 1.9998136572097558e-05, "loss": 0.0467, "step": 77340 }, { "epoch": 93.30537115268558, "grad_norm": 4.39858341217041, "learning_rate": 1.9998136330842764e-05, "loss": 0.0478, "step": 77350 }, { "epoch": 93.31744115872058, "grad_norm": 5.173695087432861, "learning_rate": 1.999813608958797e-05, "loss": 0.0486, "step": 77360 }, { "epoch": 93.32951116475559, "grad_norm": 4.564393997192383, "learning_rate": 1.9998135848333177e-05, "loss": 0.0494, "step": 77370 }, { "epoch": 93.34158117079059, "grad_norm": 4.607680797576904, "learning_rate": 1.9998135607078383e-05, "loss": 0.0468, "step": 77380 }, { "epoch": 93.3536511768256, "grad_norm": 4.592140197753906, "learning_rate": 1.999813536582359e-05, "loss": 0.0472, "step": 77390 }, { "epoch": 93.3657211828606, "grad_norm": 5.325812816619873, "learning_rate": 1.9998135124568795e-05, "loss": 0.0492, "step": 77400 }, { "epoch": 93.3777911888956, "grad_norm": 5.279475212097168, "learning_rate": 1.9998134883314e-05, "loss": 0.0479, "step": 77410 }, { "epoch": 93.3898611949306, "grad_norm": 4.900836944580078, "learning_rate": 1.9998134642059208e-05, "loss": 0.0471, "step": 77420 }, { "epoch": 93.4019312009656, "grad_norm": 4.9987030029296875, "learning_rate": 1.9998134400804414e-05, "loss": 0.0461, "step": 77430 }, { "epoch": 93.41400120700061, "grad_norm": 4.793157577514648, "learning_rate": 1.999813415954962e-05, "loss": 0.048, "step": 77440 }, { "epoch": 93.42607121303561, "grad_norm": 4.5769195556640625, "learning_rate": 1.9998133918294826e-05, "loss": 0.0477, "step": 77450 }, { "epoch": 93.43814121907062, "grad_norm": 4.740070343017578, "learning_rate": 1.9998133677040033e-05, "loss": 0.0487, "step": 77460 }, { "epoch": 93.45021122510562, "grad_norm": 4.237915992736816, "learning_rate": 1.999813343578524e-05, "loss": 0.046, "step": 77470 }, { "epoch": 93.46228123114062, 
"grad_norm": 4.948226451873779, "learning_rate": 1.9998133194530445e-05, "loss": 0.0491, "step": 77480 }, { "epoch": 93.47435123717563, "grad_norm": 5.0354390144348145, "learning_rate": 1.999813295327565e-05, "loss": 0.0496, "step": 77490 }, { "epoch": 93.48642124321063, "grad_norm": 4.433510780334473, "learning_rate": 1.9998132712020857e-05, "loss": 0.0499, "step": 77500 }, { "epoch": 93.48642124321063, "eval_loss": 12.744490623474121, "eval_runtime": 8.1239, "eval_samples_per_second": 85.796, "eval_steps_per_second": 10.832, "step": 77500 }, { "epoch": 93.49849124924563, "grad_norm": 4.866936683654785, "learning_rate": 1.9998132470766064e-05, "loss": 0.0515, "step": 77510 }, { "epoch": 93.51056125528062, "grad_norm": 4.680614948272705, "learning_rate": 1.999813222951127e-05, "loss": 0.0504, "step": 77520 }, { "epoch": 93.52263126131562, "grad_norm": 4.600743770599365, "learning_rate": 1.9998131988256476e-05, "loss": 0.0514, "step": 77530 }, { "epoch": 93.53470126735063, "grad_norm": 4.814119338989258, "learning_rate": 1.9998131747001682e-05, "loss": 0.05, "step": 77540 }, { "epoch": 93.54677127338563, "grad_norm": 5.067127227783203, "learning_rate": 1.999813150574689e-05, "loss": 0.05, "step": 77550 }, { "epoch": 93.55884127942063, "grad_norm": 4.5395989418029785, "learning_rate": 1.9998131264492095e-05, "loss": 0.0461, "step": 77560 }, { "epoch": 93.57091128545564, "grad_norm": 4.793154239654541, "learning_rate": 1.99981310232373e-05, "loss": 0.0476, "step": 77570 }, { "epoch": 93.58298129149064, "grad_norm": 4.946317195892334, "learning_rate": 1.9998130781982507e-05, "loss": 0.0493, "step": 77580 }, { "epoch": 93.59505129752564, "grad_norm": 4.293050765991211, "learning_rate": 1.999813054072771e-05, "loss": 0.0495, "step": 77590 }, { "epoch": 93.60712130356065, "grad_norm": 4.855801105499268, "learning_rate": 1.9998130299472916e-05, "loss": 0.0506, "step": 77600 }, { "epoch": 93.61919130959565, "grad_norm": 4.623111724853516, "learning_rate": 
1.9998130058218122e-05, "loss": 0.0514, "step": 77610 }, { "epoch": 93.63126131563065, "grad_norm": 4.947202682495117, "learning_rate": 1.9998129816963332e-05, "loss": 0.0504, "step": 77620 }, { "epoch": 93.64333132166566, "grad_norm": 5.009346961975098, "learning_rate": 1.9998129575708538e-05, "loss": 0.0507, "step": 77630 }, { "epoch": 93.65540132770066, "grad_norm": 5.242311954498291, "learning_rate": 1.9998129334453744e-05, "loss": 0.0502, "step": 77640 }, { "epoch": 93.66747133373566, "grad_norm": 5.076425075531006, "learning_rate": 1.999812909319895e-05, "loss": 0.049, "step": 77650 }, { "epoch": 93.67954133977067, "grad_norm": 5.115490436553955, "learning_rate": 1.9998128851944157e-05, "loss": 0.0516, "step": 77660 }, { "epoch": 93.69161134580567, "grad_norm": 5.551931858062744, "learning_rate": 1.9998128610689363e-05, "loss": 0.0513, "step": 77670 }, { "epoch": 93.70368135184067, "grad_norm": 4.68565034866333, "learning_rate": 1.999812836943457e-05, "loss": 0.051, "step": 77680 }, { "epoch": 93.71575135787567, "grad_norm": 4.719478607177734, "learning_rate": 1.9998128128179776e-05, "loss": 0.0514, "step": 77690 }, { "epoch": 93.72782136391068, "grad_norm": 4.796701431274414, "learning_rate": 1.9998127886924982e-05, "loss": 0.05, "step": 77700 }, { "epoch": 93.73989136994568, "grad_norm": 5.272255897521973, "learning_rate": 1.9998127645670188e-05, "loss": 0.0497, "step": 77710 }, { "epoch": 93.75196137598068, "grad_norm": 4.956488132476807, "learning_rate": 1.9998127404415394e-05, "loss": 0.0504, "step": 77720 }, { "epoch": 93.76403138201569, "grad_norm": null, "learning_rate": 1.99981271631606e-05, "loss": 0.0515, "step": 77730 }, { "epoch": 93.77610138805069, "grad_norm": 4.978899002075195, "learning_rate": 1.9998126921905807e-05, "loss": 0.0524, "step": 77740 }, { "epoch": 93.7881713940857, "grad_norm": 4.965790271759033, "learning_rate": 1.999812668065101e-05, "loss": 0.0529, "step": 77750 }, { "epoch": 93.8002414001207, "grad_norm": 
4.905977725982666, "learning_rate": 1.9998126439396216e-05, "loss": 0.05, "step": 77760 }, { "epoch": 93.8123114061557, "grad_norm": 4.943648815155029, "learning_rate": 1.9998126198141422e-05, "loss": 0.0534, "step": 77770 }, { "epoch": 93.8243814121907, "grad_norm": 4.6559858322143555, "learning_rate": 1.9998125956886628e-05, "loss": 0.0516, "step": 77780 }, { "epoch": 93.8364514182257, "grad_norm": 5.112442493438721, "learning_rate": 1.9998125715631834e-05, "loss": 0.0545, "step": 77790 }, { "epoch": 93.84852142426071, "grad_norm": 5.644599914550781, "learning_rate": 1.999812547437704e-05, "loss": 0.0545, "step": 77800 }, { "epoch": 93.86059143029571, "grad_norm": 5.210785388946533, "learning_rate": 1.9998125233122247e-05, "loss": 0.0517, "step": 77810 }, { "epoch": 93.87266143633072, "grad_norm": 4.972742557525635, "learning_rate": 1.9998124991867453e-05, "loss": 0.0546, "step": 77820 }, { "epoch": 93.88473144236572, "grad_norm": 4.713344097137451, "learning_rate": 1.999812475061266e-05, "loss": 0.0519, "step": 77830 }, { "epoch": 93.89680144840072, "grad_norm": 4.568460464477539, "learning_rate": 1.9998124509357865e-05, "loss": 0.0534, "step": 77840 }, { "epoch": 93.90887145443573, "grad_norm": 4.807438373565674, "learning_rate": 1.999812426810307e-05, "loss": 0.0543, "step": 77850 }, { "epoch": 93.92094146047073, "grad_norm": 5.740462779998779, "learning_rate": 1.9998124026848278e-05, "loss": 0.0514, "step": 77860 }, { "epoch": 93.93301146650573, "grad_norm": 4.157668113708496, "learning_rate": 1.9998123785593484e-05, "loss": 0.0518, "step": 77870 }, { "epoch": 93.94508147254074, "grad_norm": 4.472518444061279, "learning_rate": 1.999812354433869e-05, "loss": 0.0543, "step": 77880 }, { "epoch": 93.95715147857574, "grad_norm": 5.590657711029053, "learning_rate": 1.9998123303083896e-05, "loss": 0.0536, "step": 77890 }, { "epoch": 93.96922148461074, "grad_norm": 4.890132427215576, "learning_rate": 1.9998123061829103e-05, "loss": 0.0517, "step": 77900 }, { "epoch": 
93.98129149064575, "grad_norm": 5.610064506530762, "learning_rate": 1.999812282057431e-05, "loss": 0.0532, "step": 77910 }, { "epoch": 93.99336149668075, "grad_norm": 5.085530757904053, "learning_rate": 1.9998122579319515e-05, "loss": 0.0514, "step": 77920 }, { "epoch": 94.004828002414, "grad_norm": 4.2149810791015625, "learning_rate": 1.999812233806472e-05, "loss": 0.0457, "step": 77930 }, { "epoch": 94.016898008449, "grad_norm": 4.324565887451172, "learning_rate": 1.9998122096809928e-05, "loss": 0.0349, "step": 77940 }, { "epoch": 94.02896801448401, "grad_norm": 3.9912843704223633, "learning_rate": 1.9998121855555134e-05, "loss": 0.0377, "step": 77950 }, { "epoch": 94.04103802051901, "grad_norm": 4.056493282318115, "learning_rate": 1.999812161430034e-05, "loss": 0.0374, "step": 77960 }, { "epoch": 94.05310802655401, "grad_norm": 3.938458204269409, "learning_rate": 1.9998121373045546e-05, "loss": 0.0409, "step": 77970 }, { "epoch": 94.06517803258902, "grad_norm": 4.1101884841918945, "learning_rate": 1.9998121131790752e-05, "loss": 0.0378, "step": 77980 }, { "epoch": 94.07724803862402, "grad_norm": 4.681115627288818, "learning_rate": 1.999812089053596e-05, "loss": 0.0404, "step": 77990 }, { "epoch": 94.08931804465902, "grad_norm": 4.372028350830078, "learning_rate": 1.999812064928116e-05, "loss": 0.0418, "step": 78000 }, { "epoch": 94.08931804465902, "eval_loss": 12.748262405395508, "eval_runtime": 8.1379, "eval_samples_per_second": 85.648, "eval_steps_per_second": 10.814, "step": 78000 }, { "epoch": 94.10138805069403, "grad_norm": 4.315463542938232, "learning_rate": 1.9998120408026368e-05, "loss": 0.0397, "step": 78010 }, { "epoch": 94.11345805672903, "grad_norm": 4.073751926422119, "learning_rate": 1.9998120166771574e-05, "loss": 0.0419, "step": 78020 }, { "epoch": 94.12552806276403, "grad_norm": 4.034538269042969, "learning_rate": 1.999811992551678e-05, "loss": 0.0425, "step": 78030 }, { "epoch": 94.13759806879904, "grad_norm": 4.5340681076049805, 
"learning_rate": 1.9998119684261986e-05, "loss": 0.0424, "step": 78040 }, { "epoch": 94.14966807483404, "grad_norm": 4.79560661315918, "learning_rate": 1.9998119443007192e-05, "loss": 0.0424, "step": 78050 }, { "epoch": 94.16173808086904, "grad_norm": 4.33491325378418, "learning_rate": 1.99981192017524e-05, "loss": 0.0438, "step": 78060 }, { "epoch": 94.17380808690405, "grad_norm": 3.9295454025268555, "learning_rate": 1.9998118960497605e-05, "loss": 0.0436, "step": 78070 }, { "epoch": 94.18587809293905, "grad_norm": 4.641022682189941, "learning_rate": 1.999811871924281e-05, "loss": 0.0439, "step": 78080 }, { "epoch": 94.19794809897405, "grad_norm": 4.802939414978027, "learning_rate": 1.9998118477988017e-05, "loss": 0.0432, "step": 78090 }, { "epoch": 94.21001810500906, "grad_norm": 5.00897216796875, "learning_rate": 1.9998118236733224e-05, "loss": 0.0445, "step": 78100 }, { "epoch": 94.22208811104406, "grad_norm": 3.9974961280822754, "learning_rate": 1.999811799547843e-05, "loss": 0.0436, "step": 78110 }, { "epoch": 94.23415811707906, "grad_norm": 4.445639133453369, "learning_rate": 1.9998117754223636e-05, "loss": 0.0446, "step": 78120 }, { "epoch": 94.24622812311407, "grad_norm": 4.898594856262207, "learning_rate": 1.9998117512968842e-05, "loss": 0.046, "step": 78130 }, { "epoch": 94.25829812914907, "grad_norm": 4.44385290145874, "learning_rate": 1.999811727171405e-05, "loss": 0.0438, "step": 78140 }, { "epoch": 94.27036813518407, "grad_norm": 4.58472204208374, "learning_rate": 1.9998117030459255e-05, "loss": 0.0457, "step": 78150 }, { "epoch": 94.28243814121907, "grad_norm": 4.7471747398376465, "learning_rate": 1.999811678920446e-05, "loss": 0.0437, "step": 78160 }, { "epoch": 94.29450814725408, "grad_norm": 4.42863655090332, "learning_rate": 1.9998116547949667e-05, "loss": 0.0463, "step": 78170 }, { "epoch": 94.30657815328908, "grad_norm": 4.389801502227783, "learning_rate": 1.9998116306694873e-05, "loss": 0.0468, "step": 78180 }, { "epoch": 94.31864815932408, 
"grad_norm": 4.6407270431518555, "learning_rate": 1.999811606544008e-05, "loss": 0.0476, "step": 78190 }, { "epoch": 94.33071816535909, "grad_norm": 4.863547325134277, "learning_rate": 1.9998115824185286e-05, "loss": 0.0461, "step": 78200 }, { "epoch": 94.34278817139409, "grad_norm": 4.638521671295166, "learning_rate": 1.9998115582930492e-05, "loss": 0.0459, "step": 78210 }, { "epoch": 94.3548581774291, "grad_norm": 4.446322917938232, "learning_rate": 1.9998115341675698e-05, "loss": 0.0456, "step": 78220 }, { "epoch": 94.3669281834641, "grad_norm": 4.896442890167236, "learning_rate": 1.9998115100420904e-05, "loss": 0.0463, "step": 78230 }, { "epoch": 94.3789981894991, "grad_norm": 4.317086696624756, "learning_rate": 1.999811485916611e-05, "loss": 0.048, "step": 78240 }, { "epoch": 94.3910681955341, "grad_norm": 5.019114971160889, "learning_rate": 1.9998114617911317e-05, "loss": 0.0479, "step": 78250 }, { "epoch": 94.4031382015691, "grad_norm": 4.617844581604004, "learning_rate": 1.9998114376656523e-05, "loss": 0.0465, "step": 78260 }, { "epoch": 94.41520820760411, "grad_norm": 4.395875930786133, "learning_rate": 1.999811413540173e-05, "loss": 0.0489, "step": 78270 }, { "epoch": 94.42727821363911, "grad_norm": 5.191770076751709, "learning_rate": 1.9998113894146935e-05, "loss": 0.0491, "step": 78280 }, { "epoch": 94.43934821967412, "grad_norm": 5.021620750427246, "learning_rate": 1.999811365289214e-05, "loss": 0.0469, "step": 78290 }, { "epoch": 94.45141822570912, "grad_norm": 5.074501037597656, "learning_rate": 1.9998113411637348e-05, "loss": 0.0481, "step": 78300 }, { "epoch": 94.46348823174412, "grad_norm": 5.104135513305664, "learning_rate": 1.9998113170382554e-05, "loss": 0.0496, "step": 78310 }, { "epoch": 94.47555823777913, "grad_norm": 5.170146942138672, "learning_rate": 1.999811292912776e-05, "loss": 0.0474, "step": 78320 }, { "epoch": 94.48762824381413, "grad_norm": 4.855066776275635, "learning_rate": 1.9998112687872967e-05, "loss": 0.0484, "step": 78330 }, 
{ "epoch": 94.49969824984913, "grad_norm": 4.876155853271484, "learning_rate": 1.9998112446618173e-05, "loss": 0.0478, "step": 78340 }, { "epoch": 94.51176825588412, "grad_norm": 4.713514804840088, "learning_rate": 1.999811220536338e-05, "loss": 0.0491, "step": 78350 }, { "epoch": 94.52383826191912, "grad_norm": 4.540152549743652, "learning_rate": 1.9998111964108585e-05, "loss": 0.0491, "step": 78360 }, { "epoch": 94.53590826795413, "grad_norm": 5.310450077056885, "learning_rate": 1.999811172285379e-05, "loss": 0.05, "step": 78370 }, { "epoch": 94.54797827398913, "grad_norm": 4.363389015197754, "learning_rate": 1.9998111481598998e-05, "loss": 0.0502, "step": 78380 }, { "epoch": 94.56004828002413, "grad_norm": 4.903217315673828, "learning_rate": 1.9998111240344204e-05, "loss": 0.0498, "step": 78390 }, { "epoch": 94.57211828605914, "grad_norm": 5.211065292358398, "learning_rate": 1.999811099908941e-05, "loss": 0.0499, "step": 78400 }, { "epoch": 94.58418829209414, "grad_norm": 4.885470390319824, "learning_rate": 1.9998110757834613e-05, "loss": 0.0503, "step": 78410 }, { "epoch": 94.59625829812914, "grad_norm": 4.800075531005859, "learning_rate": 1.999811051657982e-05, "loss": 0.0484, "step": 78420 }, { "epoch": 94.60832830416415, "grad_norm": 5.159666538238525, "learning_rate": 1.9998110275325025e-05, "loss": 0.0499, "step": 78430 }, { "epoch": 94.62039831019915, "grad_norm": 4.782378673553467, "learning_rate": 1.999811003407023e-05, "loss": 0.0485, "step": 78440 }, { "epoch": 94.63246831623415, "grad_norm": 4.868757247924805, "learning_rate": 1.9998109792815438e-05, "loss": 0.0505, "step": 78450 }, { "epoch": 94.64453832226916, "grad_norm": 4.793765068054199, "learning_rate": 1.9998109551560644e-05, "loss": 0.0501, "step": 78460 }, { "epoch": 94.65660832830416, "grad_norm": 4.670323371887207, "learning_rate": 1.999810931030585e-05, "loss": 0.0506, "step": 78470 }, { "epoch": 94.66867833433916, "grad_norm": 5.127986431121826, "learning_rate": 1.9998109069051056e-05, 
"loss": 0.0513, "step": 78480 }, { "epoch": 94.68074834037417, "grad_norm": 4.996337413787842, "learning_rate": 1.9998108827796263e-05, "loss": 0.0493, "step": 78490 }, { "epoch": 94.69281834640917, "grad_norm": 4.438230514526367, "learning_rate": 1.999810858654147e-05, "loss": 0.0512, "step": 78500 }, { "epoch": 94.69281834640917, "eval_loss": 12.775283813476562, "eval_runtime": 8.1331, "eval_samples_per_second": 85.699, "eval_steps_per_second": 10.82, "step": 78500 }, { "epoch": 94.70488835244417, "grad_norm": 5.317470073699951, "learning_rate": 1.9998108345286675e-05, "loss": 0.0482, "step": 78510 }, { "epoch": 94.71695835847918, "grad_norm": 5.2616167068481445, "learning_rate": 1.999810810403188e-05, "loss": 0.0511, "step": 78520 }, { "epoch": 94.72902836451418, "grad_norm": 4.52682638168335, "learning_rate": 1.9998107862777087e-05, "loss": 0.051, "step": 78530 }, { "epoch": 94.74109837054918, "grad_norm": 4.9496331214904785, "learning_rate": 1.9998107621522294e-05, "loss": 0.0488, "step": 78540 }, { "epoch": 94.75316837658418, "grad_norm": 4.650717258453369, "learning_rate": 1.99981073802675e-05, "loss": 0.0526, "step": 78550 }, { "epoch": 94.76523838261919, "grad_norm": 5.205956935882568, "learning_rate": 1.9998107139012706e-05, "loss": 0.0524, "step": 78560 }, { "epoch": 94.77730838865419, "grad_norm": 5.3128790855407715, "learning_rate": 1.9998106897757912e-05, "loss": 0.0518, "step": 78570 }, { "epoch": 94.7893783946892, "grad_norm": 4.697868824005127, "learning_rate": 1.999810665650312e-05, "loss": 0.0522, "step": 78580 }, { "epoch": 94.8014484007242, "grad_norm": 5.321874141693115, "learning_rate": 1.9998106415248325e-05, "loss": 0.0526, "step": 78590 }, { "epoch": 94.8135184067592, "grad_norm": 5.20148229598999, "learning_rate": 1.999810617399353e-05, "loss": 0.0537, "step": 78600 }, { "epoch": 94.8255884127942, "grad_norm": 4.660248279571533, "learning_rate": 1.9998105932738737e-05, "loss": 0.0523, "step": 78610 }, { "epoch": 94.83765841882921, 
"grad_norm": 4.69208288192749, "learning_rate": 1.9998105691483943e-05, "loss": 0.0533, "step": 78620 }, { "epoch": 94.84972842486421, "grad_norm": 5.057765960693359, "learning_rate": 1.999810545022915e-05, "loss": 0.0516, "step": 78630 }, { "epoch": 94.86179843089921, "grad_norm": 4.397902011871338, "learning_rate": 1.9998105208974356e-05, "loss": 0.0525, "step": 78640 }, { "epoch": 94.87386843693422, "grad_norm": 4.8010358810424805, "learning_rate": 1.9998104967719562e-05, "loss": 0.051, "step": 78650 }, { "epoch": 94.88593844296922, "grad_norm": 5.030202388763428, "learning_rate": 1.9998104726464765e-05, "loss": 0.0523, "step": 78660 }, { "epoch": 94.89800844900422, "grad_norm": 5.085814476013184, "learning_rate": 1.999810448520997e-05, "loss": 0.0515, "step": 78670 }, { "epoch": 94.91007845503923, "grad_norm": 5.4052839279174805, "learning_rate": 1.9998104243955177e-05, "loss": 0.0516, "step": 78680 }, { "epoch": 94.92214846107423, "grad_norm": 4.156686305999756, "learning_rate": 1.9998104002700384e-05, "loss": 0.0509, "step": 78690 }, { "epoch": 94.93421846710923, "grad_norm": 5.030811786651611, "learning_rate": 1.9998103761445593e-05, "loss": 0.0528, "step": 78700 }, { "epoch": 94.94628847314424, "grad_norm": 5.097717761993408, "learning_rate": 1.99981035201908e-05, "loss": 0.051, "step": 78710 }, { "epoch": 94.95835847917924, "grad_norm": 4.614091873168945, "learning_rate": 1.9998103278936006e-05, "loss": 0.0536, "step": 78720 }, { "epoch": 94.97042848521424, "grad_norm": 4.8875017166137695, "learning_rate": 1.9998103037681212e-05, "loss": 0.0534, "step": 78730 }, { "epoch": 94.98249849124925, "grad_norm": 4.65311861038208, "learning_rate": 1.9998102796426418e-05, "loss": 0.0534, "step": 78740 }, { "epoch": 94.99456849728425, "grad_norm": 4.7138142585754395, "learning_rate": 1.9998102555171624e-05, "loss": 0.0543, "step": 78750 }, { "epoch": 95.0060350030175, "grad_norm": 3.892927408218384, "learning_rate": 1.999810231391683e-05, "loss": 0.0414, "step": 
78760 }, { "epoch": 95.0181050090525, "grad_norm": 4.387320518493652, "learning_rate": 1.9998102072662037e-05, "loss": 0.0355, "step": 78770 }, { "epoch": 95.03017501508751, "grad_norm": 4.197768211364746, "learning_rate": 1.9998101831407243e-05, "loss": 0.0377, "step": 78780 }, { "epoch": 95.04224502112251, "grad_norm": 3.7499654293060303, "learning_rate": 1.999810159015245e-05, "loss": 0.0394, "step": 78790 }, { "epoch": 95.05431502715751, "grad_norm": 3.9672327041625977, "learning_rate": 1.9998101348897655e-05, "loss": 0.0392, "step": 78800 }, { "epoch": 95.06638503319252, "grad_norm": 3.8810791969299316, "learning_rate": 1.999810110764286e-05, "loss": 0.0416, "step": 78810 }, { "epoch": 95.07845503922752, "grad_norm": 3.9892332553863525, "learning_rate": 1.9998100866388068e-05, "loss": 0.0386, "step": 78820 }, { "epoch": 95.09052504526252, "grad_norm": 4.6472697257995605, "learning_rate": 1.999810062513327e-05, "loss": 0.0402, "step": 78830 }, { "epoch": 95.10259505129753, "grad_norm": 4.340109825134277, "learning_rate": 1.9998100383878477e-05, "loss": 0.0395, "step": 78840 }, { "epoch": 95.11466505733253, "grad_norm": 3.943420171737671, "learning_rate": 1.9998100142623683e-05, "loss": 0.0409, "step": 78850 }, { "epoch": 95.12673506336753, "grad_norm": 4.521097660064697, "learning_rate": 1.999809990136889e-05, "loss": 0.0452, "step": 78860 }, { "epoch": 95.13880506940254, "grad_norm": 4.174147129058838, "learning_rate": 1.9998099660114095e-05, "loss": 0.0426, "step": 78870 }, { "epoch": 95.15087507543754, "grad_norm": 4.425595283508301, "learning_rate": 1.99980994188593e-05, "loss": 0.0428, "step": 78880 }, { "epoch": 95.16294508147254, "grad_norm": 4.63526725769043, "learning_rate": 1.9998099177604508e-05, "loss": 0.0445, "step": 78890 }, { "epoch": 95.17501508750755, "grad_norm": 4.849722862243652, "learning_rate": 1.9998098936349714e-05, "loss": 0.0433, "step": 78900 }, { "epoch": 95.18708509354255, "grad_norm": 4.665803909301758, "learning_rate": 
1.999809869509492e-05, "loss": 0.043, "step": 78910 }, { "epoch": 95.19915509957755, "grad_norm": 4.311531066894531, "learning_rate": 1.9998098453840126e-05, "loss": 0.0425, "step": 78920 }, { "epoch": 95.21122510561256, "grad_norm": 4.838583469390869, "learning_rate": 1.9998098212585333e-05, "loss": 0.0462, "step": 78930 }, { "epoch": 95.22329511164756, "grad_norm": 4.929396152496338, "learning_rate": 1.999809797133054e-05, "loss": 0.0444, "step": 78940 }, { "epoch": 95.23536511768256, "grad_norm": 4.314678192138672, "learning_rate": 1.9998097730075745e-05, "loss": 0.0443, "step": 78950 }, { "epoch": 95.24743512371757, "grad_norm": 4.515108585357666, "learning_rate": 1.999809748882095e-05, "loss": 0.0429, "step": 78960 }, { "epoch": 95.25950512975257, "grad_norm": 4.705352783203125, "learning_rate": 1.9998097247566158e-05, "loss": 0.0468, "step": 78970 }, { "epoch": 95.27157513578757, "grad_norm": 4.2202301025390625, "learning_rate": 1.9998097006311364e-05, "loss": 0.0465, "step": 78980 }, { "epoch": 95.28364514182257, "grad_norm": 4.254096508026123, "learning_rate": 1.999809676505657e-05, "loss": 0.0433, "step": 78990 }, { "epoch": 95.29571514785758, "grad_norm": 4.628633975982666, "learning_rate": 1.9998096523801776e-05, "loss": 0.0453, "step": 79000 }, { "epoch": 95.29571514785758, "eval_loss": 12.771047592163086, "eval_runtime": 8.1367, "eval_samples_per_second": 85.662, "eval_steps_per_second": 10.815, "step": 79000 }, { "epoch": 95.30778515389258, "grad_norm": 5.068076133728027, "learning_rate": 1.9998096282546982e-05, "loss": 0.0452, "step": 79010 }, { "epoch": 95.31985515992758, "grad_norm": 4.806339740753174, "learning_rate": 1.999809604129219e-05, "loss": 0.0472, "step": 79020 }, { "epoch": 95.33192516596259, "grad_norm": 4.930924892425537, "learning_rate": 1.9998095800037395e-05, "loss": 0.0476, "step": 79030 }, { "epoch": 95.34399517199759, "grad_norm": 4.558058738708496, "learning_rate": 1.99980955587826e-05, "loss": 0.0455, "step": 79040 }, { 
"epoch": 95.3560651780326, "grad_norm": 4.92647647857666, "learning_rate": 1.9998095317527807e-05, "loss": 0.0485, "step": 79050 }, { "epoch": 95.3681351840676, "grad_norm": 4.249919414520264, "learning_rate": 1.9998095076273013e-05, "loss": 0.0464, "step": 79060 }, { "epoch": 95.3802051901026, "grad_norm": 4.800860404968262, "learning_rate": 1.999809483501822e-05, "loss": 0.0483, "step": 79070 }, { "epoch": 95.3922751961376, "grad_norm": 5.017120361328125, "learning_rate": 1.9998094593763423e-05, "loss": 0.0477, "step": 79080 }, { "epoch": 95.4043452021726, "grad_norm": 4.823863983154297, "learning_rate": 1.999809435250863e-05, "loss": 0.0457, "step": 79090 }, { "epoch": 95.41641520820761, "grad_norm": 4.7871575355529785, "learning_rate": 1.9998094111253835e-05, "loss": 0.0455, "step": 79100 }, { "epoch": 95.42848521424261, "grad_norm": 4.467148780822754, "learning_rate": 1.999809386999904e-05, "loss": 0.0484, "step": 79110 }, { "epoch": 95.44055522027762, "grad_norm": 5.111647129058838, "learning_rate": 1.9998093628744247e-05, "loss": 0.0498, "step": 79120 }, { "epoch": 95.45262522631262, "grad_norm": 4.633890151977539, "learning_rate": 1.9998093387489454e-05, "loss": 0.049, "step": 79130 }, { "epoch": 95.46469523234762, "grad_norm": 5.149634838104248, "learning_rate": 1.999809314623466e-05, "loss": 0.0459, "step": 79140 }, { "epoch": 95.47676523838263, "grad_norm": 5.268786907196045, "learning_rate": 1.9998092904979866e-05, "loss": 0.0474, "step": 79150 }, { "epoch": 95.48883524441763, "grad_norm": 4.390719413757324, "learning_rate": 1.9998092663725072e-05, "loss": 0.0514, "step": 79160 }, { "epoch": 95.50090525045263, "grad_norm": 4.801689624786377, "learning_rate": 1.999809242247028e-05, "loss": 0.0494, "step": 79170 }, { "epoch": 95.51297525648762, "grad_norm": 4.543116092681885, "learning_rate": 1.9998092181215485e-05, "loss": 0.049, "step": 79180 }, { "epoch": 95.52504526252262, "grad_norm": 4.389158725738525, "learning_rate": 1.999809193996069e-05, "loss": 
0.0511, "step": 79190 }, { "epoch": 95.53711526855763, "grad_norm": 5.5883307456970215, "learning_rate": 1.9998091698705897e-05, "loss": 0.0514, "step": 79200 }, { "epoch": 95.54918527459263, "grad_norm": 4.744192123413086, "learning_rate": 1.9998091457451103e-05, "loss": 0.0482, "step": 79210 }, { "epoch": 95.56125528062763, "grad_norm": 4.69405460357666, "learning_rate": 1.999809121619631e-05, "loss": 0.0483, "step": 79220 }, { "epoch": 95.57332528666264, "grad_norm": 5.0629377365112305, "learning_rate": 1.9998090974941516e-05, "loss": 0.0485, "step": 79230 }, { "epoch": 95.58539529269764, "grad_norm": 4.775434970855713, "learning_rate": 1.9998090733686722e-05, "loss": 0.0489, "step": 79240 }, { "epoch": 95.59746529873264, "grad_norm": 4.58038854598999, "learning_rate": 1.9998090492431928e-05, "loss": 0.0468, "step": 79250 }, { "epoch": 95.60953530476765, "grad_norm": 4.980356693267822, "learning_rate": 1.9998090251177134e-05, "loss": 0.0489, "step": 79260 }, { "epoch": 95.62160531080265, "grad_norm": 4.718698501586914, "learning_rate": 1.999809000992234e-05, "loss": 0.0501, "step": 79270 }, { "epoch": 95.63367531683765, "grad_norm": 5.247001647949219, "learning_rate": 1.9998089768667547e-05, "loss": 0.0504, "step": 79280 }, { "epoch": 95.64574532287266, "grad_norm": 4.553993225097656, "learning_rate": 1.9998089527412753e-05, "loss": 0.0505, "step": 79290 }, { "epoch": 95.65781532890766, "grad_norm": 4.906691074371338, "learning_rate": 1.999808928615796e-05, "loss": 0.0506, "step": 79300 }, { "epoch": 95.66988533494266, "grad_norm": 5.74135160446167, "learning_rate": 1.9998089044903165e-05, "loss": 0.0508, "step": 79310 }, { "epoch": 95.68195534097767, "grad_norm": 5.048218250274658, "learning_rate": 1.999808880364837e-05, "loss": 0.0501, "step": 79320 }, { "epoch": 95.69402534701267, "grad_norm": 5.028153419494629, "learning_rate": 1.9998088562393578e-05, "loss": 0.0499, "step": 79330 }, { "epoch": 95.70609535304767, "grad_norm": 5.199427604675293, 
"learning_rate": 1.9998088321138784e-05, "loss": 0.0516, "step": 79340 }, { "epoch": 95.71816535908268, "grad_norm": 4.395163536071777, "learning_rate": 1.999808807988399e-05, "loss": 0.049, "step": 79350 }, { "epoch": 95.73023536511768, "grad_norm": 4.605162143707275, "learning_rate": 1.9998087838629197e-05, "loss": 0.0513, "step": 79360 }, { "epoch": 95.74230537115268, "grad_norm": 4.974411487579346, "learning_rate": 1.9998087597374403e-05, "loss": 0.0543, "step": 79370 }, { "epoch": 95.75437537718769, "grad_norm": 4.51439094543457, "learning_rate": 1.999808735611961e-05, "loss": 0.0529, "step": 79380 }, { "epoch": 95.76644538322269, "grad_norm": 4.224778652191162, "learning_rate": 1.9998087114864815e-05, "loss": 0.0508, "step": 79390 }, { "epoch": 95.77851538925769, "grad_norm": 4.61607551574707, "learning_rate": 1.999808687361002e-05, "loss": 0.0496, "step": 79400 }, { "epoch": 95.7905853952927, "grad_norm": 4.8419365882873535, "learning_rate": 1.9998086632355228e-05, "loss": 0.0487, "step": 79410 }, { "epoch": 95.8026554013277, "grad_norm": 4.3751139640808105, "learning_rate": 1.9998086391100434e-05, "loss": 0.0501, "step": 79420 }, { "epoch": 95.8147254073627, "grad_norm": 5.09844446182251, "learning_rate": 1.999808614984564e-05, "loss": 0.0514, "step": 79430 }, { "epoch": 95.8267954133977, "grad_norm": 5.061943054199219, "learning_rate": 1.9998085908590846e-05, "loss": 0.0522, "step": 79440 }, { "epoch": 95.83886541943271, "grad_norm": 5.0817484855651855, "learning_rate": 1.9998085667336052e-05, "loss": 0.0514, "step": 79450 }, { "epoch": 95.85093542546771, "grad_norm": 5.299846649169922, "learning_rate": 1.999808542608126e-05, "loss": 0.0511, "step": 79460 }, { "epoch": 95.86300543150271, "grad_norm": 5.040971755981445, "learning_rate": 1.9998085184826465e-05, "loss": 0.0496, "step": 79470 }, { "epoch": 95.87507543753772, "grad_norm": 4.334948539733887, "learning_rate": 1.999808494357167e-05, "loss": 0.0501, "step": 79480 }, { "epoch": 95.88714544357272, 
"grad_norm": 5.456387042999268, "learning_rate": 1.9998084702316874e-05, "loss": 0.0539, "step": 79490 }, { "epoch": 95.89921544960772, "grad_norm": 5.163417816162109, "learning_rate": 1.999808446106208e-05, "loss": 0.0528, "step": 79500 }, { "epoch": 95.89921544960772, "eval_loss": 12.784693717956543, "eval_runtime": 8.1359, "eval_samples_per_second": 85.67, "eval_steps_per_second": 10.816, "step": 79500 }, { "epoch": 95.91128545564273, "grad_norm": 4.8361496925354, "learning_rate": 1.9998084219807286e-05, "loss": 0.0509, "step": 79510 }, { "epoch": 95.92335546167773, "grad_norm": 4.97360372543335, "learning_rate": 1.9998083978552493e-05, "loss": 0.0526, "step": 79520 }, { "epoch": 95.93542546771273, "grad_norm": 4.868590354919434, "learning_rate": 1.99980837372977e-05, "loss": 0.0521, "step": 79530 }, { "epoch": 95.94749547374774, "grad_norm": 5.394593238830566, "learning_rate": 1.9998083496042905e-05, "loss": 0.0532, "step": 79540 }, { "epoch": 95.95956547978274, "grad_norm": 5.008950710296631, "learning_rate": 1.999808325478811e-05, "loss": 0.0511, "step": 79550 }, { "epoch": 95.97163548581774, "grad_norm": 4.768584728240967, "learning_rate": 1.9998083013533317e-05, "loss": 0.0504, "step": 79560 }, { "epoch": 95.98370549185275, "grad_norm": 4.976532936096191, "learning_rate": 1.9998082772278524e-05, "loss": 0.052, "step": 79570 }, { "epoch": 95.99577549788775, "grad_norm": 4.7568135261535645, "learning_rate": 1.999808253102373e-05, "loss": 0.053, "step": 79580 }, { "epoch": 96.007242003621, "grad_norm": 3.7762253284454346, "learning_rate": 1.9998082289768936e-05, "loss": 0.0421, "step": 79590 }, { "epoch": 96.019312009656, "grad_norm": 4.129497051239014, "learning_rate": 1.9998082048514142e-05, "loss": 0.0356, "step": 79600 }, { "epoch": 96.03138201569101, "grad_norm": 4.031962871551514, "learning_rate": 1.999808180725935e-05, "loss": 0.0368, "step": 79610 }, { "epoch": 96.04345202172601, "grad_norm": 4.5197577476501465, "learning_rate": 1.9998081566004555e-05, 
"loss": 0.0379, "step": 79620 }, { "epoch": 96.05552202776101, "grad_norm": 4.30324125289917, "learning_rate": 1.999808132474976e-05, "loss": 0.0395, "step": 79630 }, { "epoch": 96.06759203379602, "grad_norm": 4.6921892166137695, "learning_rate": 1.9998081083494967e-05, "loss": 0.0395, "step": 79640 }, { "epoch": 96.07966203983102, "grad_norm": 4.074190139770508, "learning_rate": 1.9998080842240173e-05, "loss": 0.0404, "step": 79650 }, { "epoch": 96.09173204586602, "grad_norm": 4.408523082733154, "learning_rate": 1.999808060098538e-05, "loss": 0.039, "step": 79660 }, { "epoch": 96.10380205190103, "grad_norm": 4.475346565246582, "learning_rate": 1.9998080359730586e-05, "loss": 0.0428, "step": 79670 }, { "epoch": 96.11587205793603, "grad_norm": 4.448452472686768, "learning_rate": 1.9998080118475792e-05, "loss": 0.0423, "step": 79680 }, { "epoch": 96.12794206397103, "grad_norm": 4.322646617889404, "learning_rate": 1.9998079877220998e-05, "loss": 0.0394, "step": 79690 }, { "epoch": 96.14001207000604, "grad_norm": 4.450972557067871, "learning_rate": 1.9998079635966204e-05, "loss": 0.0393, "step": 79700 }, { "epoch": 96.15208207604104, "grad_norm": 4.820017337799072, "learning_rate": 1.999807939471141e-05, "loss": 0.0404, "step": 79710 }, { "epoch": 96.16415208207604, "grad_norm": 4.680251121520996, "learning_rate": 1.9998079153456617e-05, "loss": 0.0452, "step": 79720 }, { "epoch": 96.17622208811105, "grad_norm": 4.590274810791016, "learning_rate": 1.9998078912201823e-05, "loss": 0.0428, "step": 79730 }, { "epoch": 96.18829209414605, "grad_norm": 5.072737693786621, "learning_rate": 1.9998078670947026e-05, "loss": 0.0451, "step": 79740 }, { "epoch": 96.20036210018105, "grad_norm": 4.3564839363098145, "learning_rate": 1.9998078429692232e-05, "loss": 0.0472, "step": 79750 }, { "epoch": 96.21243210621606, "grad_norm": 4.676358699798584, "learning_rate": 1.999807818843744e-05, "loss": 0.0445, "step": 79760 }, { "epoch": 96.22450211225106, "grad_norm": 4.9612932205200195, 
"learning_rate": 1.9998077947182645e-05, "loss": 0.0443, "step": 79770 }, { "epoch": 96.23657211828606, "grad_norm": 5.137452125549316, "learning_rate": 1.9998077705927854e-05, "loss": 0.0456, "step": 79780 }, { "epoch": 96.24864212432107, "grad_norm": 4.571587562561035, "learning_rate": 1.999807746467306e-05, "loss": 0.0424, "step": 79790 }, { "epoch": 96.26071213035607, "grad_norm": 4.866623401641846, "learning_rate": 1.9998077223418267e-05, "loss": 0.0452, "step": 79800 }, { "epoch": 96.27278213639107, "grad_norm": 4.311924457550049, "learning_rate": 1.9998076982163473e-05, "loss": 0.0441, "step": 79810 }, { "epoch": 96.28485214242608, "grad_norm": 4.172901630401611, "learning_rate": 1.999807674090868e-05, "loss": 0.0432, "step": 79820 }, { "epoch": 96.29692214846108, "grad_norm": 4.518604278564453, "learning_rate": 1.9998076499653885e-05, "loss": 0.0459, "step": 79830 }, { "epoch": 96.30899215449608, "grad_norm": 4.9614057540893555, "learning_rate": 1.999807625839909e-05, "loss": 0.046, "step": 79840 }, { "epoch": 96.32106216053108, "grad_norm": 5.109565258026123, "learning_rate": 1.9998076017144298e-05, "loss": 0.0447, "step": 79850 }, { "epoch": 96.33313216656609, "grad_norm": 4.41401481628418, "learning_rate": 1.9998075775889504e-05, "loss": 0.048, "step": 79860 }, { "epoch": 96.34520217260109, "grad_norm": 4.529788494110107, "learning_rate": 1.999807553463471e-05, "loss": 0.0473, "step": 79870 }, { "epoch": 96.3572721786361, "grad_norm": 4.474508762359619, "learning_rate": 1.9998075293379916e-05, "loss": 0.0485, "step": 79880 }, { "epoch": 96.3693421846711, "grad_norm": 4.5287251472473145, "learning_rate": 1.9998075052125123e-05, "loss": 0.0451, "step": 79890 }, { "epoch": 96.3814121907061, "grad_norm": 4.650768756866455, "learning_rate": 1.9998074810870325e-05, "loss": 0.0458, "step": 79900 }, { "epoch": 96.3934821967411, "grad_norm": 4.724643230438232, "learning_rate": 1.999807456961553e-05, "loss": 0.0485, "step": 79910 }, { "epoch": 96.40555220277611, 
"grad_norm": 4.657209873199463, "learning_rate": 1.9998074328360738e-05, "loss": 0.047, "step": 79920 }, { "epoch": 96.41762220881111, "grad_norm": 4.622216701507568, "learning_rate": 1.9998074087105944e-05, "loss": 0.0468, "step": 79930 }, { "epoch": 96.42969221484611, "grad_norm": 4.974682807922363, "learning_rate": 1.999807384585115e-05, "loss": 0.0476, "step": 79940 }, { "epoch": 96.44176222088112, "grad_norm": 4.522447109222412, "learning_rate": 1.9998073604596356e-05, "loss": 0.0466, "step": 79950 }, { "epoch": 96.45383222691612, "grad_norm": 5.093474388122559, "learning_rate": 1.9998073363341563e-05, "loss": 0.0464, "step": 79960 }, { "epoch": 96.46590223295112, "grad_norm": 4.352967262268066, "learning_rate": 1.999807312208677e-05, "loss": 0.0465, "step": 79970 }, { "epoch": 96.47797223898613, "grad_norm": 4.534321308135986, "learning_rate": 1.9998072880831975e-05, "loss": 0.0487, "step": 79980 }, { "epoch": 96.49004224502113, "grad_norm": 4.841190814971924, "learning_rate": 1.999807263957718e-05, "loss": 0.0484, "step": 79990 }, { "epoch": 96.50211225105613, "grad_norm": 4.961386203765869, "learning_rate": 1.9998072398322388e-05, "loss": 0.0483, "step": 80000 }, { "epoch": 96.50211225105613, "eval_loss": 12.804495811462402, "eval_runtime": 8.149, "eval_samples_per_second": 85.532, "eval_steps_per_second": 10.799, "step": 80000 }, { "epoch": 96.51418225709112, "grad_norm": 4.883261680603027, "learning_rate": 1.9998072157067594e-05, "loss": 0.0518, "step": 80010 }, { "epoch": 96.52625226312612, "grad_norm": 4.9651641845703125, "learning_rate": 1.99980719158128e-05, "loss": 0.0509, "step": 80020 }, { "epoch": 96.53832226916113, "grad_norm": 4.522050857543945, "learning_rate": 1.9998071674558006e-05, "loss": 0.0498, "step": 80030 }, { "epoch": 96.55039227519613, "grad_norm": 4.511635780334473, "learning_rate": 1.9998071433303212e-05, "loss": 0.0477, "step": 80040 }, { "epoch": 96.56246228123113, "grad_norm": 5.025040626525879, "learning_rate": 
1.999807119204842e-05, "loss": 0.0486, "step": 80050 }, { "epoch": 96.57453228726614, "grad_norm": 4.802235126495361, "learning_rate": 1.9998070950793625e-05, "loss": 0.0479, "step": 80060 }, { "epoch": 96.58660229330114, "grad_norm": 4.675863742828369, "learning_rate": 1.999807070953883e-05, "loss": 0.0489, "step": 80070 }, { "epoch": 96.59867229933614, "grad_norm": 4.336817264556885, "learning_rate": 1.9998070468284037e-05, "loss": 0.0463, "step": 80080 }, { "epoch": 96.61074230537115, "grad_norm": 4.647403240203857, "learning_rate": 1.9998070227029244e-05, "loss": 0.0468, "step": 80090 }, { "epoch": 96.62281231140615, "grad_norm": 4.346622943878174, "learning_rate": 1.999806998577445e-05, "loss": 0.0486, "step": 80100 }, { "epoch": 96.63488231744115, "grad_norm": 5.492422103881836, "learning_rate": 1.9998069744519656e-05, "loss": 0.0489, "step": 80110 }, { "epoch": 96.64695232347616, "grad_norm": 4.482796669006348, "learning_rate": 1.9998069503264862e-05, "loss": 0.0488, "step": 80120 }, { "epoch": 96.65902232951116, "grad_norm": 4.694137096405029, "learning_rate": 1.999806926201007e-05, "loss": 0.0488, "step": 80130 }, { "epoch": 96.67109233554616, "grad_norm": 4.7936506271362305, "learning_rate": 1.9998069020755275e-05, "loss": 0.0501, "step": 80140 }, { "epoch": 96.68316234158117, "grad_norm": 4.760578632354736, "learning_rate": 1.9998068779500477e-05, "loss": 0.0491, "step": 80150 }, { "epoch": 96.69523234761617, "grad_norm": 4.8306379318237305, "learning_rate": 1.9998068538245684e-05, "loss": 0.0501, "step": 80160 }, { "epoch": 96.70730235365117, "grad_norm": 4.969155788421631, "learning_rate": 1.999806829699089e-05, "loss": 0.049, "step": 80170 }, { "epoch": 96.71937235968618, "grad_norm": 4.138284206390381, "learning_rate": 1.9998068055736096e-05, "loss": 0.0488, "step": 80180 }, { "epoch": 96.73144236572118, "grad_norm": 4.680025577545166, "learning_rate": 1.9998067814481302e-05, "loss": 0.0466, "step": 80190 }, { "epoch": 96.74351237175618, "grad_norm": 
4.734085559844971, "learning_rate": 1.999806757322651e-05, "loss": 0.0512, "step": 80200 }, { "epoch": 96.75558237779119, "grad_norm": 4.365488529205322, "learning_rate": 1.9998067331971715e-05, "loss": 0.0518, "step": 80210 }, { "epoch": 96.76765238382619, "grad_norm": 4.79707670211792, "learning_rate": 1.999806709071692e-05, "loss": 0.0525, "step": 80220 }, { "epoch": 96.77972238986119, "grad_norm": 4.891663074493408, "learning_rate": 1.9998066849462127e-05, "loss": 0.0512, "step": 80230 }, { "epoch": 96.7917923958962, "grad_norm": 5.270427703857422, "learning_rate": 1.9998066608207333e-05, "loss": 0.0505, "step": 80240 }, { "epoch": 96.8038624019312, "grad_norm": 5.047259330749512, "learning_rate": 1.999806636695254e-05, "loss": 0.0516, "step": 80250 }, { "epoch": 96.8159324079662, "grad_norm": 4.755565166473389, "learning_rate": 1.9998066125697746e-05, "loss": 0.0493, "step": 80260 }, { "epoch": 96.8280024140012, "grad_norm": 4.594718933105469, "learning_rate": 1.9998065884442952e-05, "loss": 0.0506, "step": 80270 }, { "epoch": 96.84007242003621, "grad_norm": 5.308048725128174, "learning_rate": 1.9998065643188158e-05, "loss": 0.0543, "step": 80280 }, { "epoch": 96.85214242607121, "grad_norm": 4.586726188659668, "learning_rate": 1.9998065401933364e-05, "loss": 0.0514, "step": 80290 }, { "epoch": 96.86421243210621, "grad_norm": 5.116699695587158, "learning_rate": 1.999806516067857e-05, "loss": 0.0509, "step": 80300 }, { "epoch": 96.87628243814122, "grad_norm": 5.291916847229004, "learning_rate": 1.9998064919423777e-05, "loss": 0.0519, "step": 80310 }, { "epoch": 96.88835244417622, "grad_norm": 4.635359764099121, "learning_rate": 1.9998064678168983e-05, "loss": 0.0513, "step": 80320 }, { "epoch": 96.90042245021122, "grad_norm": 4.937749862670898, "learning_rate": 1.999806443691419e-05, "loss": 0.0529, "step": 80330 }, { "epoch": 96.91249245624623, "grad_norm": 4.900115489959717, "learning_rate": 1.9998064195659395e-05, "loss": 0.0507, "step": 80340 }, { "epoch": 
96.92456246228123, "grad_norm": 4.661641597747803, "learning_rate": 1.9998063954404602e-05, "loss": 0.0526, "step": 80350 }, { "epoch": 96.93663246831623, "grad_norm": 4.370197772979736, "learning_rate": 1.9998063713149808e-05, "loss": 0.0507, "step": 80360 }, { "epoch": 96.94870247435124, "grad_norm": 5.5611042976379395, "learning_rate": 1.9998063471895014e-05, "loss": 0.0522, "step": 80370 }, { "epoch": 96.96077248038624, "grad_norm": 5.359743118286133, "learning_rate": 1.999806323064022e-05, "loss": 0.0545, "step": 80380 }, { "epoch": 96.97284248642124, "grad_norm": 4.925756931304932, "learning_rate": 1.9998062989385427e-05, "loss": 0.0536, "step": 80390 }, { "epoch": 96.98491249245625, "grad_norm": 4.569095134735107, "learning_rate": 1.9998062748130633e-05, "loss": 0.0524, "step": 80400 }, { "epoch": 96.99698249849125, "grad_norm": 4.731053829193115, "learning_rate": 1.999806250687584e-05, "loss": 0.0508, "step": 80410 }, { "epoch": 97.0084490042245, "grad_norm": 3.895951509475708, "learning_rate": 1.9998062265621045e-05, "loss": 0.0391, "step": 80420 }, { "epoch": 97.0205190102595, "grad_norm": 4.173140525817871, "learning_rate": 1.999806202436625e-05, "loss": 0.0357, "step": 80430 }, { "epoch": 97.03258901629451, "grad_norm": 4.129063606262207, "learning_rate": 1.9998061783111458e-05, "loss": 0.0368, "step": 80440 }, { "epoch": 97.04465902232951, "grad_norm": 4.055091857910156, "learning_rate": 1.9998061541856664e-05, "loss": 0.0369, "step": 80450 }, { "epoch": 97.05672902836451, "grad_norm": 4.1674065589904785, "learning_rate": 1.999806130060187e-05, "loss": 0.037, "step": 80460 }, { "epoch": 97.06879903439952, "grad_norm": 4.354358673095703, "learning_rate": 1.9998061059347076e-05, "loss": 0.0399, "step": 80470 }, { "epoch": 97.08086904043452, "grad_norm": 4.131934642791748, "learning_rate": 1.9998060818092283e-05, "loss": 0.0393, "step": 80480 }, { "epoch": 97.09293904646952, "grad_norm": 4.3008341789245605, "learning_rate": 1.999806057683749e-05, "loss": 
0.0404, "step": 80490 }, { "epoch": 97.10500905250453, "grad_norm": 4.609408855438232, "learning_rate": 1.9998060335582695e-05, "loss": 0.038, "step": 80500 }, { "epoch": 97.10500905250453, "eval_loss": 12.792318344116211, "eval_runtime": 8.1355, "eval_samples_per_second": 85.674, "eval_steps_per_second": 10.817, "step": 80500 }, { "epoch": 97.11707905853953, "grad_norm": 4.28140926361084, "learning_rate": 1.99980600943279e-05, "loss": 0.0413, "step": 80510 }, { "epoch": 97.12914906457453, "grad_norm": 3.9028210639953613, "learning_rate": 1.9998059853073107e-05, "loss": 0.0417, "step": 80520 }, { "epoch": 97.14121907060954, "grad_norm": 4.1701507568359375, "learning_rate": 1.9998059611818314e-05, "loss": 0.0408, "step": 80530 }, { "epoch": 97.15328907664454, "grad_norm": 4.309243202209473, "learning_rate": 1.999805937056352e-05, "loss": 0.042, "step": 80540 }, { "epoch": 97.16535908267954, "grad_norm": 4.291857719421387, "learning_rate": 1.9998059129308726e-05, "loss": 0.0441, "step": 80550 }, { "epoch": 97.17742908871455, "grad_norm": 4.2071967124938965, "learning_rate": 1.9998058888053932e-05, "loss": 0.0428, "step": 80560 }, { "epoch": 97.18949909474955, "grad_norm": 4.347775936126709, "learning_rate": 1.9998058646799135e-05, "loss": 0.0417, "step": 80570 }, { "epoch": 97.20156910078455, "grad_norm": 4.315399169921875, "learning_rate": 1.999805840554434e-05, "loss": 0.0421, "step": 80580 }, { "epoch": 97.21363910681956, "grad_norm": 4.6162495613098145, "learning_rate": 1.9998058164289547e-05, "loss": 0.0411, "step": 80590 }, { "epoch": 97.22570911285456, "grad_norm": 3.9849047660827637, "learning_rate": 1.9998057923034754e-05, "loss": 0.0448, "step": 80600 }, { "epoch": 97.23777911888956, "grad_norm": 4.719464302062988, "learning_rate": 1.999805768177996e-05, "loss": 0.0433, "step": 80610 }, { "epoch": 97.24984912492457, "grad_norm": 4.616150856018066, "learning_rate": 1.9998057440525166e-05, "loss": 0.0436, "step": 80620 }, { "epoch": 97.26191913095957, 
"grad_norm": 4.187213897705078, "learning_rate": 1.9998057199270372e-05, "loss": 0.0452, "step": 80630 }, { "epoch": 97.27398913699457, "grad_norm": 4.823293209075928, "learning_rate": 1.999805695801558e-05, "loss": 0.0447, "step": 80640 }, { "epoch": 97.28605914302958, "grad_norm": 3.9916939735412598, "learning_rate": 1.9998056716760785e-05, "loss": 0.0449, "step": 80650 }, { "epoch": 97.29812914906458, "grad_norm": 4.291493892669678, "learning_rate": 1.999805647550599e-05, "loss": 0.0429, "step": 80660 }, { "epoch": 97.31019915509958, "grad_norm": 4.851772308349609, "learning_rate": 1.9998056234251197e-05, "loss": 0.0452, "step": 80670 }, { "epoch": 97.32226916113459, "grad_norm": 4.232090950012207, "learning_rate": 1.9998055992996403e-05, "loss": 0.044, "step": 80680 }, { "epoch": 97.33433916716959, "grad_norm": 5.0723557472229, "learning_rate": 1.999805575174161e-05, "loss": 0.0462, "step": 80690 }, { "epoch": 97.34640917320459, "grad_norm": 4.895280361175537, "learning_rate": 1.9998055510486816e-05, "loss": 0.0466, "step": 80700 }, { "epoch": 97.3584791792396, "grad_norm": 4.290903568267822, "learning_rate": 1.9998055269232022e-05, "loss": 0.0442, "step": 80710 }, { "epoch": 97.3705491852746, "grad_norm": 5.159426689147949, "learning_rate": 1.9998055027977228e-05, "loss": 0.0472, "step": 80720 }, { "epoch": 97.3826191913096, "grad_norm": 4.832973480224609, "learning_rate": 1.9998054786722435e-05, "loss": 0.0464, "step": 80730 }, { "epoch": 97.3946891973446, "grad_norm": 4.388495922088623, "learning_rate": 1.999805454546764e-05, "loss": 0.046, "step": 80740 }, { "epoch": 97.40675920337961, "grad_norm": 4.783690929412842, "learning_rate": 1.9998054304212847e-05, "loss": 0.0485, "step": 80750 }, { "epoch": 97.41882920941461, "grad_norm": 4.759093761444092, "learning_rate": 1.9998054062958053e-05, "loss": 0.0428, "step": 80760 }, { "epoch": 97.43089921544961, "grad_norm": 4.965703964233398, "learning_rate": 1.999805382170326e-05, "loss": 0.0457, "step": 80770 }, { 
"epoch": 97.44296922148462, "grad_norm": 4.824136734008789, "learning_rate": 1.9998053580448466e-05, "loss": 0.0467, "step": 80780 }, { "epoch": 97.45503922751962, "grad_norm": 4.485579013824463, "learning_rate": 1.9998053339193672e-05, "loss": 0.0469, "step": 80790 }, { "epoch": 97.46710923355462, "grad_norm": 4.8448591232299805, "learning_rate": 1.9998053097938878e-05, "loss": 0.0461, "step": 80800 }, { "epoch": 97.47917923958963, "grad_norm": 4.6768574714660645, "learning_rate": 1.9998052856684084e-05, "loss": 0.0466, "step": 80810 }, { "epoch": 97.49124924562463, "grad_norm": 4.683547496795654, "learning_rate": 1.9998052615429287e-05, "loss": 0.0465, "step": 80820 }, { "epoch": 97.50331925165963, "grad_norm": 4.737412929534912, "learning_rate": 1.9998052374174493e-05, "loss": 0.0473, "step": 80830 }, { "epoch": 97.51538925769462, "grad_norm": 4.793842792510986, "learning_rate": 1.99980521329197e-05, "loss": 0.0469, "step": 80840 }, { "epoch": 97.52745926372963, "grad_norm": 4.953111171722412, "learning_rate": 1.9998051891664906e-05, "loss": 0.049, "step": 80850 }, { "epoch": 97.53952926976463, "grad_norm": 4.631315231323242, "learning_rate": 1.9998051650410115e-05, "loss": 0.0452, "step": 80860 }, { "epoch": 97.55159927579963, "grad_norm": 4.643808841705322, "learning_rate": 1.999805140915532e-05, "loss": 0.0473, "step": 80870 }, { "epoch": 97.56366928183463, "grad_norm": 4.61457633972168, "learning_rate": 1.9998051167900528e-05, "loss": 0.0464, "step": 80880 }, { "epoch": 97.57573928786964, "grad_norm": 4.820600986480713, "learning_rate": 1.9998050926645734e-05, "loss": 0.0491, "step": 80890 }, { "epoch": 97.58780929390464, "grad_norm": 4.222254276275635, "learning_rate": 1.999805068539094e-05, "loss": 0.0464, "step": 80900 }, { "epoch": 97.59987929993964, "grad_norm": 4.522668838500977, "learning_rate": 1.9998050444136146e-05, "loss": 0.0493, "step": 80910 }, { "epoch": 97.61194930597465, "grad_norm": 4.853472709655762, "learning_rate": 
1.9998050202881353e-05, "loss": 0.0478, "step": 80920 }, { "epoch": 97.62401931200965, "grad_norm": 4.693339824676514, "learning_rate": 1.999804996162656e-05, "loss": 0.0487, "step": 80930 }, { "epoch": 97.63608931804465, "grad_norm": 5.19867467880249, "learning_rate": 1.9998049720371765e-05, "loss": 0.0492, "step": 80940 }, { "epoch": 97.64815932407966, "grad_norm": 4.7985124588012695, "learning_rate": 1.999804947911697e-05, "loss": 0.0475, "step": 80950 }, { "epoch": 97.66022933011466, "grad_norm": 4.315157890319824, "learning_rate": 1.9998049237862177e-05, "loss": 0.0475, "step": 80960 }, { "epoch": 97.67229933614966, "grad_norm": 4.42549467086792, "learning_rate": 1.9998048996607384e-05, "loss": 0.0479, "step": 80970 }, { "epoch": 97.68436934218467, "grad_norm": 4.890717506408691, "learning_rate": 1.9998048755352587e-05, "loss": 0.0478, "step": 80980 }, { "epoch": 97.69643934821967, "grad_norm": 5.28685998916626, "learning_rate": 1.9998048514097793e-05, "loss": 0.0489, "step": 80990 }, { "epoch": 97.70850935425467, "grad_norm": 5.183189392089844, "learning_rate": 1.9998048272843e-05, "loss": 0.0504, "step": 81000 }, { "epoch": 97.70850935425467, "eval_loss": 12.817726135253906, "eval_runtime": 8.1509, "eval_samples_per_second": 85.512, "eval_steps_per_second": 10.796, "step": 81000 }, { "epoch": 97.72057936028968, "grad_norm": 4.3537917137146, "learning_rate": 1.9998048031588205e-05, "loss": 0.0512, "step": 81010 }, { "epoch": 97.73264936632468, "grad_norm": 4.632035732269287, "learning_rate": 1.999804779033341e-05, "loss": 0.0481, "step": 81020 }, { "epoch": 97.74471937235968, "grad_norm": 5.122387886047363, "learning_rate": 1.9998047549078618e-05, "loss": 0.0475, "step": 81030 }, { "epoch": 97.75678937839469, "grad_norm": 4.886849880218506, "learning_rate": 1.9998047307823824e-05, "loss": 0.048, "step": 81040 }, { "epoch": 97.76885938442969, "grad_norm": 4.336805820465088, "learning_rate": 1.999804706656903e-05, "loss": 0.0504, "step": 81050 }, { "epoch": 
97.78092939046469, "grad_norm": 4.8310699462890625, "learning_rate": 1.9998046825314236e-05, "loss": 0.052, "step": 81060 }, { "epoch": 97.7929993964997, "grad_norm": 4.456433296203613, "learning_rate": 1.9998046584059442e-05, "loss": 0.0504, "step": 81070 }, { "epoch": 97.8050694025347, "grad_norm": 4.574093818664551, "learning_rate": 1.999804634280465e-05, "loss": 0.0483, "step": 81080 }, { "epoch": 97.8171394085697, "grad_norm": 5.194366931915283, "learning_rate": 1.9998046101549855e-05, "loss": 0.0479, "step": 81090 }, { "epoch": 97.8292094146047, "grad_norm": 4.779168128967285, "learning_rate": 1.999804586029506e-05, "loss": 0.0497, "step": 81100 }, { "epoch": 97.84127942063971, "grad_norm": 5.8201003074646, "learning_rate": 1.9998045619040267e-05, "loss": 0.0511, "step": 81110 }, { "epoch": 97.85334942667471, "grad_norm": 5.0678791999816895, "learning_rate": 1.9998045377785474e-05, "loss": 0.0537, "step": 81120 }, { "epoch": 97.86541943270971, "grad_norm": 5.070559501647949, "learning_rate": 1.999804513653068e-05, "loss": 0.0524, "step": 81130 }, { "epoch": 97.87748943874472, "grad_norm": 5.091709136962891, "learning_rate": 1.9998044895275886e-05, "loss": 0.0529, "step": 81140 }, { "epoch": 97.88955944477972, "grad_norm": 5.1331963539123535, "learning_rate": 1.9998044654021092e-05, "loss": 0.0503, "step": 81150 }, { "epoch": 97.90162945081472, "grad_norm": 5.452621936798096, "learning_rate": 1.99980444127663e-05, "loss": 0.0524, "step": 81160 }, { "epoch": 97.91369945684973, "grad_norm": 4.69914436340332, "learning_rate": 1.9998044171511505e-05, "loss": 0.0499, "step": 81170 }, { "epoch": 97.92576946288473, "grad_norm": 4.583319664001465, "learning_rate": 1.999804393025671e-05, "loss": 0.0527, "step": 81180 }, { "epoch": 97.93783946891973, "grad_norm": 5.102647304534912, "learning_rate": 1.9998043689001917e-05, "loss": 0.0525, "step": 81190 }, { "epoch": 97.94990947495474, "grad_norm": 4.79567813873291, "learning_rate": 1.9998043447747123e-05, "loss": 0.0529, 
"step": 81200 }, { "epoch": 97.96197948098974, "grad_norm": 4.6424102783203125, "learning_rate": 1.999804320649233e-05, "loss": 0.0504, "step": 81210 }, { "epoch": 97.97404948702474, "grad_norm": 4.459578037261963, "learning_rate": 1.9998042965237536e-05, "loss": 0.0504, "step": 81220 }, { "epoch": 97.98611949305975, "grad_norm": 5.155092716217041, "learning_rate": 1.999804272398274e-05, "loss": 0.0525, "step": 81230 }, { "epoch": 97.99818949909475, "grad_norm": 4.579289436340332, "learning_rate": 1.9998042482727945e-05, "loss": 0.0525, "step": 81240 }, { "epoch": 98.009656004828, "grad_norm": 3.5158302783966064, "learning_rate": 1.999804224147315e-05, "loss": 0.0388, "step": 81250 }, { "epoch": 98.021726010863, "grad_norm": 3.982022285461426, "learning_rate": 1.9998042000218357e-05, "loss": 0.0356, "step": 81260 }, { "epoch": 98.03379601689801, "grad_norm": 4.499565124511719, "learning_rate": 1.9998041758963563e-05, "loss": 0.037, "step": 81270 }, { "epoch": 98.04586602293301, "grad_norm": 4.33507776260376, "learning_rate": 1.999804151770877e-05, "loss": 0.0389, "step": 81280 }, { "epoch": 98.05793602896802, "grad_norm": 3.786144971847534, "learning_rate": 1.9998041276453976e-05, "loss": 0.0388, "step": 81290 }, { "epoch": 98.07000603500302, "grad_norm": 4.12038516998291, "learning_rate": 1.9998041035199182e-05, "loss": 0.0398, "step": 81300 }, { "epoch": 98.08207604103802, "grad_norm": 4.431696891784668, "learning_rate": 1.9998040793944388e-05, "loss": 0.0389, "step": 81310 }, { "epoch": 98.09414604707302, "grad_norm": 4.090619087219238, "learning_rate": 1.9998040552689594e-05, "loss": 0.0419, "step": 81320 }, { "epoch": 98.10621605310803, "grad_norm": 4.027188777923584, "learning_rate": 1.99980403114348e-05, "loss": 0.0399, "step": 81330 }, { "epoch": 98.11828605914303, "grad_norm": 4.334508419036865, "learning_rate": 1.9998040070180007e-05, "loss": 0.038, "step": 81340 }, { "epoch": 98.13035606517803, "grad_norm": 4.497455596923828, "learning_rate": 
1.9998039828925213e-05, "loss": 0.0432, "step": 81350 }, { "epoch": 98.14242607121304, "grad_norm": 4.044776916503906, "learning_rate": 1.999803958767042e-05, "loss": 0.0406, "step": 81360 }, { "epoch": 98.15449607724804, "grad_norm": 4.5164361000061035, "learning_rate": 1.9998039346415626e-05, "loss": 0.042, "step": 81370 }, { "epoch": 98.16656608328304, "grad_norm": 4.067342758178711, "learning_rate": 1.9998039105160832e-05, "loss": 0.0418, "step": 81380 }, { "epoch": 98.17863608931805, "grad_norm": 4.295238971710205, "learning_rate": 1.9998038863906038e-05, "loss": 0.0425, "step": 81390 }, { "epoch": 98.19070609535305, "grad_norm": 4.124503135681152, "learning_rate": 1.9998038622651244e-05, "loss": 0.0416, "step": 81400 }, { "epoch": 98.20277610138805, "grad_norm": 4.963180065155029, "learning_rate": 1.999803838139645e-05, "loss": 0.0427, "step": 81410 }, { "epoch": 98.21484610742306, "grad_norm": 4.208033561706543, "learning_rate": 1.9998038140141657e-05, "loss": 0.0422, "step": 81420 }, { "epoch": 98.22691611345806, "grad_norm": 4.941347122192383, "learning_rate": 1.9998037898886863e-05, "loss": 0.0445, "step": 81430 }, { "epoch": 98.23898611949306, "grad_norm": 4.310028553009033, "learning_rate": 1.999803765763207e-05, "loss": 0.0437, "step": 81440 }, { "epoch": 98.25105612552807, "grad_norm": 4.583357334136963, "learning_rate": 1.9998037416377275e-05, "loss": 0.0454, "step": 81450 }, { "epoch": 98.26312613156307, "grad_norm": 4.21547269821167, "learning_rate": 1.999803717512248e-05, "loss": 0.0466, "step": 81460 }, { "epoch": 98.27519613759807, "grad_norm": 4.192857265472412, "learning_rate": 1.9998036933867688e-05, "loss": 0.0432, "step": 81470 }, { "epoch": 98.28726614363308, "grad_norm": 4.493104457855225, "learning_rate": 1.9998036692612894e-05, "loss": 0.0457, "step": 81480 }, { "epoch": 98.29933614966808, "grad_norm": 4.769906997680664, "learning_rate": 1.99980364513581e-05, "loss": 0.0451, "step": 81490 }, { "epoch": 98.31140615570308, "grad_norm": 
4.336683750152588, "learning_rate": 1.9998036210103306e-05, "loss": 0.043, "step": 81500 }, { "epoch": 98.31140615570308, "eval_loss": 12.82152271270752, "eval_runtime": 8.13, "eval_samples_per_second": 85.731, "eval_steps_per_second": 10.824, "step": 81500 }, { "epoch": 98.32347616173809, "grad_norm": 4.646142959594727, "learning_rate": 1.9998035968848513e-05, "loss": 0.0456, "step": 81510 }, { "epoch": 98.33554616777309, "grad_norm": 4.808854103088379, "learning_rate": 1.999803572759372e-05, "loss": 0.0463, "step": 81520 }, { "epoch": 98.34761617380809, "grad_norm": 4.742020130157471, "learning_rate": 1.9998035486338925e-05, "loss": 0.0459, "step": 81530 }, { "epoch": 98.3596861798431, "grad_norm": 5.007101535797119, "learning_rate": 1.999803524508413e-05, "loss": 0.0466, "step": 81540 }, { "epoch": 98.3717561858781, "grad_norm": 4.728388786315918, "learning_rate": 1.9998035003829337e-05, "loss": 0.0448, "step": 81550 }, { "epoch": 98.3838261919131, "grad_norm": 4.12952184677124, "learning_rate": 1.9998034762574544e-05, "loss": 0.0461, "step": 81560 }, { "epoch": 98.3958961979481, "grad_norm": 4.764995098114014, "learning_rate": 1.999803452131975e-05, "loss": 0.0451, "step": 81570 }, { "epoch": 98.40796620398311, "grad_norm": 4.932131767272949, "learning_rate": 1.9998034280064956e-05, "loss": 0.0466, "step": 81580 }, { "epoch": 98.42003621001811, "grad_norm": 4.775599002838135, "learning_rate": 1.9998034038810162e-05, "loss": 0.0456, "step": 81590 }, { "epoch": 98.43210621605311, "grad_norm": 4.602090835571289, "learning_rate": 1.999803379755537e-05, "loss": 0.0464, "step": 81600 }, { "epoch": 98.44417622208812, "grad_norm": 4.195338249206543, "learning_rate": 1.9998033556300575e-05, "loss": 0.0459, "step": 81610 }, { "epoch": 98.45624622812312, "grad_norm": 5.235936641693115, "learning_rate": 1.999803331504578e-05, "loss": 0.0452, "step": 81620 }, { "epoch": 98.46831623415812, "grad_norm": 4.086232662200928, "learning_rate": 1.9998033073790987e-05, "loss": 
0.0471, "step": 81630 }, { "epoch": 98.48038624019313, "grad_norm": 4.759445667266846, "learning_rate": 1.9998032832536193e-05, "loss": 0.0481, "step": 81640 }, { "epoch": 98.49245624622813, "grad_norm": 4.630607604980469, "learning_rate": 1.9998032591281396e-05, "loss": 0.0452, "step": 81650 }, { "epoch": 98.50452625226312, "grad_norm": 4.84041166305542, "learning_rate": 1.9998032350026602e-05, "loss": 0.0475, "step": 81660 }, { "epoch": 98.51659625829812, "grad_norm": 4.763901710510254, "learning_rate": 1.999803210877181e-05, "loss": 0.0475, "step": 81670 }, { "epoch": 98.52866626433313, "grad_norm": 4.196941375732422, "learning_rate": 1.9998031867517015e-05, "loss": 0.047, "step": 81680 }, { "epoch": 98.54073627036813, "grad_norm": 4.196876525878906, "learning_rate": 1.999803162626222e-05, "loss": 0.0476, "step": 81690 }, { "epoch": 98.55280627640313, "grad_norm": 5.298069477081299, "learning_rate": 1.9998031385007427e-05, "loss": 0.0466, "step": 81700 }, { "epoch": 98.56487628243814, "grad_norm": 4.826600074768066, "learning_rate": 1.9998031143752633e-05, "loss": 0.0469, "step": 81710 }, { "epoch": 98.57694628847314, "grad_norm": 5.220310211181641, "learning_rate": 1.999803090249784e-05, "loss": 0.0508, "step": 81720 }, { "epoch": 98.58901629450814, "grad_norm": 4.447760105133057, "learning_rate": 1.9998030661243046e-05, "loss": 0.0486, "step": 81730 }, { "epoch": 98.60108630054314, "grad_norm": 4.329344749450684, "learning_rate": 1.9998030419988252e-05, "loss": 0.0484, "step": 81740 }, { "epoch": 98.61315630657815, "grad_norm": 4.918455123901367, "learning_rate": 1.999803017873346e-05, "loss": 0.0469, "step": 81750 }, { "epoch": 98.62522631261315, "grad_norm": 4.799095153808594, "learning_rate": 1.9998029937478665e-05, "loss": 0.047, "step": 81760 }, { "epoch": 98.63729631864815, "grad_norm": 5.014058589935303, "learning_rate": 1.999802969622387e-05, "loss": 0.0482, "step": 81770 }, { "epoch": 98.64936632468316, "grad_norm": 5.311548233032227, "learning_rate": 
1.9998029454969077e-05, "loss": 0.0464, "step": 81780 }, { "epoch": 98.66143633071816, "grad_norm": 4.456405162811279, "learning_rate": 1.9998029213714283e-05, "loss": 0.0509, "step": 81790 }, { "epoch": 98.67350633675316, "grad_norm": 4.37872314453125, "learning_rate": 1.999802897245949e-05, "loss": 0.0462, "step": 81800 }, { "epoch": 98.68557634278817, "grad_norm": 5.123855113983154, "learning_rate": 1.9998028731204696e-05, "loss": 0.0503, "step": 81810 }, { "epoch": 98.69764634882317, "grad_norm": 4.465582847595215, "learning_rate": 1.9998028489949902e-05, "loss": 0.0491, "step": 81820 }, { "epoch": 98.70971635485817, "grad_norm": 4.438023090362549, "learning_rate": 1.9998028248695108e-05, "loss": 0.0477, "step": 81830 }, { "epoch": 98.72178636089318, "grad_norm": 4.708314418792725, "learning_rate": 1.9998028007440314e-05, "loss": 0.0498, "step": 81840 }, { "epoch": 98.73385636692818, "grad_norm": 4.908749103546143, "learning_rate": 1.999802776618552e-05, "loss": 0.0481, "step": 81850 }, { "epoch": 98.74592637296318, "grad_norm": 4.74984884262085, "learning_rate": 1.9998027524930727e-05, "loss": 0.0505, "step": 81860 }, { "epoch": 98.75799637899819, "grad_norm": 4.917177200317383, "learning_rate": 1.9998027283675933e-05, "loss": 0.0515, "step": 81870 }, { "epoch": 98.77006638503319, "grad_norm": 5.0143961906433105, "learning_rate": 1.999802704242114e-05, "loss": 0.0511, "step": 81880 }, { "epoch": 98.78213639106819, "grad_norm": 4.827152252197266, "learning_rate": 1.9998026801166345e-05, "loss": 0.0496, "step": 81890 }, { "epoch": 98.7942063971032, "grad_norm": 5.064960956573486, "learning_rate": 1.9998026559911548e-05, "loss": 0.05, "step": 81900 }, { "epoch": 98.8062764031382, "grad_norm": 4.641687870025635, "learning_rate": 1.9998026318656754e-05, "loss": 0.0496, "step": 81910 }, { "epoch": 98.8183464091732, "grad_norm": 5.020212173461914, "learning_rate": 1.999802607740196e-05, "loss": 0.0533, "step": 81920 }, { "epoch": 98.8304164152082, "grad_norm": 
4.667603015899658, "learning_rate": 1.9998025836147167e-05, "loss": 0.0506, "step": 81930 }, { "epoch": 98.84248642124321, "grad_norm": 4.3798604011535645, "learning_rate": 1.9998025594892376e-05, "loss": 0.0504, "step": 81940 }, { "epoch": 98.85455642727821, "grad_norm": 4.361633777618408, "learning_rate": 1.9998025353637583e-05, "loss": 0.05, "step": 81950 }, { "epoch": 98.86662643331321, "grad_norm": 4.367061614990234, "learning_rate": 1.999802511238279e-05, "loss": 0.0499, "step": 81960 }, { "epoch": 98.87869643934822, "grad_norm": 4.999725341796875, "learning_rate": 1.9998024871127995e-05, "loss": 0.0493, "step": 81970 }, { "epoch": 98.89076644538322, "grad_norm": 4.7614312171936035, "learning_rate": 1.99980246298732e-05, "loss": 0.0485, "step": 81980 }, { "epoch": 98.90283645141822, "grad_norm": 5.072967529296875, "learning_rate": 1.9998024388618407e-05, "loss": 0.0482, "step": 81990 }, { "epoch": 98.91490645745323, "grad_norm": 4.539009094238281, "learning_rate": 1.9998024147363614e-05, "loss": 0.0508, "step": 82000 }, { "epoch": 98.91490645745323, "eval_loss": 12.825911521911621, "eval_runtime": 8.1308, "eval_samples_per_second": 85.724, "eval_steps_per_second": 10.823, "step": 82000 }, { "epoch": 98.92697646348823, "grad_norm": 4.647853851318359, "learning_rate": 1.999802390610882e-05, "loss": 0.0509, "step": 82010 }, { "epoch": 98.93904646952323, "grad_norm": 4.890029430389404, "learning_rate": 1.9998023664854026e-05, "loss": 0.052, "step": 82020 }, { "epoch": 98.95111647555824, "grad_norm": 4.991296291351318, "learning_rate": 1.9998023423599232e-05, "loss": 0.0486, "step": 82030 }, { "epoch": 98.96318648159324, "grad_norm": 5.040450096130371, "learning_rate": 1.999802318234444e-05, "loss": 0.0502, "step": 82040 }, { "epoch": 98.97525648762824, "grad_norm": 4.7786712646484375, "learning_rate": 1.9998022941089645e-05, "loss": 0.0504, "step": 82050 }, { "epoch": 98.98732649366325, "grad_norm": 4.536142826080322, "learning_rate": 1.9998022699834848e-05, 
"loss": 0.0507, "step": 82060 }, { "epoch": 98.99939649969825, "grad_norm": 5.01600456237793, "learning_rate": 1.9998022458580054e-05, "loss": 0.0487, "step": 82070 }, { "epoch": 99.0108630054315, "grad_norm": 3.841468334197998, "learning_rate": 1.999802221732526e-05, "loss": 0.0363, "step": 82080 }, { "epoch": 99.0229330114665, "grad_norm": 3.947122812271118, "learning_rate": 1.9998021976070466e-05, "loss": 0.0365, "step": 82090 }, { "epoch": 99.03500301750151, "grad_norm": 3.716384172439575, "learning_rate": 1.9998021734815672e-05, "loss": 0.0375, "step": 82100 }, { "epoch": 99.04707302353651, "grad_norm": 4.377647399902344, "learning_rate": 1.999802149356088e-05, "loss": 0.0381, "step": 82110 }, { "epoch": 99.05914302957152, "grad_norm": 4.308010578155518, "learning_rate": 1.9998021252306085e-05, "loss": 0.037, "step": 82120 }, { "epoch": 99.07121303560652, "grad_norm": 4.154442310333252, "learning_rate": 1.999802101105129e-05, "loss": 0.0369, "step": 82130 }, { "epoch": 99.08328304164152, "grad_norm": 4.1776041984558105, "learning_rate": 1.9998020769796497e-05, "loss": 0.0388, "step": 82140 }, { "epoch": 99.09535304767653, "grad_norm": 4.2109785079956055, "learning_rate": 1.9998020528541704e-05, "loss": 0.0396, "step": 82150 }, { "epoch": 99.10742305371153, "grad_norm": 4.158817768096924, "learning_rate": 1.999802028728691e-05, "loss": 0.0409, "step": 82160 }, { "epoch": 99.11949305974653, "grad_norm": 4.493099689483643, "learning_rate": 1.9998020046032116e-05, "loss": 0.0386, "step": 82170 }, { "epoch": 99.13156306578153, "grad_norm": 3.829439640045166, "learning_rate": 1.9998019804777322e-05, "loss": 0.0388, "step": 82180 }, { "epoch": 99.14363307181654, "grad_norm": 3.997570037841797, "learning_rate": 1.999801956352253e-05, "loss": 0.0402, "step": 82190 }, { "epoch": 99.15570307785154, "grad_norm": 3.9310245513916016, "learning_rate": 1.9998019322267735e-05, "loss": 0.0418, "step": 82200 }, { "epoch": 99.16777308388654, "grad_norm": 4.155944347381592, 
"learning_rate": 1.999801908101294e-05, "loss": 0.041, "step": 82210 }, { "epoch": 99.17984308992155, "grad_norm": 4.034672737121582, "learning_rate": 1.9998018839758147e-05, "loss": 0.0398, "step": 82220 }, { "epoch": 99.19191309595655, "grad_norm": 4.675760746002197, "learning_rate": 1.9998018598503353e-05, "loss": 0.0396, "step": 82230 }, { "epoch": 99.20398310199155, "grad_norm": 4.223177909851074, "learning_rate": 1.999801835724856e-05, "loss": 0.0414, "step": 82240 }, { "epoch": 99.21605310802656, "grad_norm": 4.074449062347412, "learning_rate": 1.9998018115993766e-05, "loss": 0.0438, "step": 82250 }, { "epoch": 99.22812311406156, "grad_norm": 4.951837062835693, "learning_rate": 1.9998017874738972e-05, "loss": 0.0438, "step": 82260 }, { "epoch": 99.24019312009656, "grad_norm": 4.7443342208862305, "learning_rate": 1.9998017633484178e-05, "loss": 0.0431, "step": 82270 }, { "epoch": 99.25226312613157, "grad_norm": 4.6556396484375, "learning_rate": 1.9998017392229384e-05, "loss": 0.0425, "step": 82280 }, { "epoch": 99.26433313216657, "grad_norm": 4.7805047035217285, "learning_rate": 1.999801715097459e-05, "loss": 0.0417, "step": 82290 }, { "epoch": 99.27640313820157, "grad_norm": 4.417880535125732, "learning_rate": 1.9998016909719797e-05, "loss": 0.0428, "step": 82300 }, { "epoch": 99.28847314423658, "grad_norm": 4.450247287750244, "learning_rate": 1.9998016668465e-05, "loss": 0.0423, "step": 82310 }, { "epoch": 99.30054315027158, "grad_norm": 4.50764799118042, "learning_rate": 1.9998016427210206e-05, "loss": 0.0456, "step": 82320 }, { "epoch": 99.31261315630658, "grad_norm": 4.282229423522949, "learning_rate": 1.9998016185955412e-05, "loss": 0.0448, "step": 82330 }, { "epoch": 99.32468316234159, "grad_norm": 4.326704025268555, "learning_rate": 1.9998015944700618e-05, "loss": 0.0462, "step": 82340 }, { "epoch": 99.33675316837659, "grad_norm": 4.948800563812256, "learning_rate": 1.9998015703445824e-05, "loss": 0.0443, "step": 82350 }, { "epoch": 99.34882317441159, 
"grad_norm": 5.117778778076172, "learning_rate": 1.999801546219103e-05, "loss": 0.044, "step": 82360 }, { "epoch": 99.3608931804466, "grad_norm": 4.386805534362793, "learning_rate": 1.9998015220936237e-05, "loss": 0.0453, "step": 82370 }, { "epoch": 99.3729631864816, "grad_norm": 4.4643778800964355, "learning_rate": 1.9998014979681443e-05, "loss": 0.0447, "step": 82380 }, { "epoch": 99.3850331925166, "grad_norm": 4.464785099029541, "learning_rate": 1.999801473842665e-05, "loss": 0.0436, "step": 82390 }, { "epoch": 99.3971031985516, "grad_norm": 4.4385666847229, "learning_rate": 1.9998014497171856e-05, "loss": 0.047, "step": 82400 }, { "epoch": 99.40917320458661, "grad_norm": 4.304634094238281, "learning_rate": 1.9998014255917062e-05, "loss": 0.0438, "step": 82410 }, { "epoch": 99.42124321062161, "grad_norm": 4.240158557891846, "learning_rate": 1.9998014014662268e-05, "loss": 0.045, "step": 82420 }, { "epoch": 99.43331321665661, "grad_norm": 4.486727237701416, "learning_rate": 1.9998013773407474e-05, "loss": 0.0438, "step": 82430 }, { "epoch": 99.44538322269162, "grad_norm": 4.895900726318359, "learning_rate": 1.999801353215268e-05, "loss": 0.0458, "step": 82440 }, { "epoch": 99.45745322872662, "grad_norm": 4.5582990646362305, "learning_rate": 1.9998013290897887e-05, "loss": 0.0447, "step": 82450 }, { "epoch": 99.46952323476162, "grad_norm": 4.522866725921631, "learning_rate": 1.9998013049643093e-05, "loss": 0.0453, "step": 82460 }, { "epoch": 99.48159324079663, "grad_norm": 4.935849189758301, "learning_rate": 1.99980128083883e-05, "loss": 0.0483, "step": 82470 }, { "epoch": 99.49366324683163, "grad_norm": 4.21859073638916, "learning_rate": 1.9998012567133505e-05, "loss": 0.0472, "step": 82480 }, { "epoch": 99.50573325286662, "grad_norm": 4.765159606933594, "learning_rate": 1.999801232587871e-05, "loss": 0.0471, "step": 82490 }, { "epoch": 99.51780325890162, "grad_norm": 4.309103012084961, "learning_rate": 1.9998012084623918e-05, "loss": 0.049, "step": 82500 }, { 
"epoch": 99.51780325890162, "eval_loss": 12.826650619506836, "eval_runtime": 8.132, "eval_samples_per_second": 85.711, "eval_steps_per_second": 10.821, "step": 82500 }, { "epoch": 99.52987326493663, "grad_norm": 5.052701950073242, "learning_rate": 1.9998011843369124e-05, "loss": 0.0464, "step": 82510 }, { "epoch": 99.54194327097163, "grad_norm": 4.969023704528809, "learning_rate": 1.999801160211433e-05, "loss": 0.0468, "step": 82520 }, { "epoch": 99.55401327700663, "grad_norm": 4.627518653869629, "learning_rate": 1.9998011360859536e-05, "loss": 0.0488, "step": 82530 }, { "epoch": 99.56608328304164, "grad_norm": 4.793467998504639, "learning_rate": 1.9998011119604743e-05, "loss": 0.0475, "step": 82540 }, { "epoch": 99.57815328907664, "grad_norm": 4.640239238739014, "learning_rate": 1.999801087834995e-05, "loss": 0.0487, "step": 82550 }, { "epoch": 99.59022329511164, "grad_norm": 5.005627155303955, "learning_rate": 1.9998010637095155e-05, "loss": 0.0475, "step": 82560 }, { "epoch": 99.60229330114664, "grad_norm": 4.830024242401123, "learning_rate": 1.999801039584036e-05, "loss": 0.0484, "step": 82570 }, { "epoch": 99.61436330718165, "grad_norm": 4.58385705947876, "learning_rate": 1.9998010154585567e-05, "loss": 0.0472, "step": 82580 }, { "epoch": 99.62643331321665, "grad_norm": 5.2689313888549805, "learning_rate": 1.9998009913330774e-05, "loss": 0.0492, "step": 82590 }, { "epoch": 99.63850331925165, "grad_norm": 4.783352851867676, "learning_rate": 1.999800967207598e-05, "loss": 0.0465, "step": 82600 }, { "epoch": 99.65057332528666, "grad_norm": 4.602685451507568, "learning_rate": 1.9998009430821186e-05, "loss": 0.049, "step": 82610 }, { "epoch": 99.66264333132166, "grad_norm": 4.577899932861328, "learning_rate": 1.9998009189566392e-05, "loss": 0.0489, "step": 82620 }, { "epoch": 99.67471333735666, "grad_norm": 4.716891288757324, "learning_rate": 1.99980089483116e-05, "loss": 0.0479, "step": 82630 }, { "epoch": 99.68678334339167, "grad_norm": 4.9341840744018555, 
"learning_rate": 1.9998008707056805e-05, "loss": 0.0486, "step": 82640 }, { "epoch": 99.69885334942667, "grad_norm": 4.793625831604004, "learning_rate": 1.999800846580201e-05, "loss": 0.0485, "step": 82650 }, { "epoch": 99.71092335546167, "grad_norm": 4.81575870513916, "learning_rate": 1.9998008224547217e-05, "loss": 0.0475, "step": 82660 }, { "epoch": 99.72299336149668, "grad_norm": 5.094271183013916, "learning_rate": 1.9998007983292423e-05, "loss": 0.0509, "step": 82670 }, { "epoch": 99.73506336753168, "grad_norm": 4.900885581970215, "learning_rate": 1.999800774203763e-05, "loss": 0.0509, "step": 82680 }, { "epoch": 99.74713337356668, "grad_norm": 4.501541614532471, "learning_rate": 1.9998007500782836e-05, "loss": 0.0514, "step": 82690 }, { "epoch": 99.75920337960169, "grad_norm": 4.9451470375061035, "learning_rate": 1.9998007259528042e-05, "loss": 0.0501, "step": 82700 }, { "epoch": 99.77127338563669, "grad_norm": 5.0011796951293945, "learning_rate": 1.9998007018273248e-05, "loss": 0.0511, "step": 82710 }, { "epoch": 99.78334339167169, "grad_norm": 5.415574550628662, "learning_rate": 1.999800677701845e-05, "loss": 0.0502, "step": 82720 }, { "epoch": 99.7954133977067, "grad_norm": 4.533692836761475, "learning_rate": 1.9998006535763657e-05, "loss": 0.0496, "step": 82730 }, { "epoch": 99.8074834037417, "grad_norm": 4.588137626647949, "learning_rate": 1.9998006294508863e-05, "loss": 0.0507, "step": 82740 }, { "epoch": 99.8195534097767, "grad_norm": 5.088333606719971, "learning_rate": 1.999800605325407e-05, "loss": 0.051, "step": 82750 }, { "epoch": 99.8316234158117, "grad_norm": 4.704165935516357, "learning_rate": 1.9998005811999276e-05, "loss": 0.0507, "step": 82760 }, { "epoch": 99.84369342184671, "grad_norm": 4.702328205108643, "learning_rate": 1.9998005570744482e-05, "loss": 0.0492, "step": 82770 }, { "epoch": 99.85576342788171, "grad_norm": 3.8159611225128174, "learning_rate": 1.999800532948969e-05, "loss": 0.0488, "step": 82780 }, { "epoch": 99.86783343391672, 
"grad_norm": 4.475333213806152, "learning_rate": 1.9998005088234895e-05, "loss": 0.0476, "step": 82790 }, { "epoch": 99.87990343995172, "grad_norm": 4.923098087310791, "learning_rate": 1.99980048469801e-05, "loss": 0.0491, "step": 82800 }, { "epoch": 99.89197344598672, "grad_norm": 4.706675052642822, "learning_rate": 1.9998004605725307e-05, "loss": 0.05, "step": 82810 }, { "epoch": 99.90404345202172, "grad_norm": 4.5251784324646, "learning_rate": 1.9998004364470513e-05, "loss": 0.0511, "step": 82820 }, { "epoch": 99.91611345805673, "grad_norm": 4.6126556396484375, "learning_rate": 1.999800412321572e-05, "loss": 0.0494, "step": 82830 }, { "epoch": 99.92818346409173, "grad_norm": 4.620737552642822, "learning_rate": 1.9998003881960926e-05, "loss": 0.0506, "step": 82840 }, { "epoch": 99.94025347012673, "grad_norm": 4.670417308807373, "learning_rate": 1.9998003640706132e-05, "loss": 0.0508, "step": 82850 }, { "epoch": 99.95232347616174, "grad_norm": 4.688383102416992, "learning_rate": 1.9998003399451338e-05, "loss": 0.0494, "step": 82860 }, { "epoch": 99.96439348219674, "grad_norm": 5.1427106857299805, "learning_rate": 1.9998003158196544e-05, "loss": 0.0496, "step": 82870 }, { "epoch": 99.97646348823174, "grad_norm": 4.752368450164795, "learning_rate": 1.999800291694175e-05, "loss": 0.0504, "step": 82880 }, { "epoch": 99.98853349426675, "grad_norm": 4.449303150177002, "learning_rate": 1.9998002675686957e-05, "loss": 0.0495, "step": 82890 }, { "epoch": 100.0, "grad_norm": 8.160000801086426, "learning_rate": 1.9998002434432163e-05, "loss": 0.0485, "step": 82900 }, { "epoch": 100.012070006035, "grad_norm": 3.920535087585449, "learning_rate": 1.999800219317737e-05, "loss": 0.0324, "step": 82910 }, { "epoch": 100.02414001207, "grad_norm": 4.122061729431152, "learning_rate": 1.9998001951922575e-05, "loss": 0.0346, "step": 82920 }, { "epoch": 100.03621001810501, "grad_norm": 5.087485313415527, "learning_rate": 1.999800171066778e-05, "loss": 0.0397, "step": 82930 }, { "epoch": 
100.04828002414001, "grad_norm": 3.9551923274993896, "learning_rate": 1.9998001469412988e-05, "loss": 0.0363, "step": 82940 }, { "epoch": 100.06035003017502, "grad_norm": 4.097694396972656, "learning_rate": 1.9998001228158194e-05, "loss": 0.0375, "step": 82950 }, { "epoch": 100.07242003621002, "grad_norm": 4.143831253051758, "learning_rate": 1.99980009869034e-05, "loss": 0.0395, "step": 82960 }, { "epoch": 100.08449004224502, "grad_norm": 4.360711097717285, "learning_rate": 1.9998000745648603e-05, "loss": 0.038, "step": 82970 }, { "epoch": 100.09656004828003, "grad_norm": 3.6895508766174316, "learning_rate": 1.999800050439381e-05, "loss": 0.039, "step": 82980 }, { "epoch": 100.10863005431503, "grad_norm": 3.9024720191955566, "learning_rate": 1.9998000263139015e-05, "loss": 0.04, "step": 82990 }, { "epoch": 100.12070006035003, "grad_norm": 4.390427589416504, "learning_rate": 1.999800002188422e-05, "loss": 0.0394, "step": 83000 }, { "epoch": 100.12070006035003, "eval_loss": 12.84935474395752, "eval_runtime": 8.38, "eval_samples_per_second": 83.174, "eval_steps_per_second": 10.501, "step": 83000 }, { "epoch": 100.13277006638504, "grad_norm": 4.135577201843262, "learning_rate": 1.9997999780629428e-05, "loss": 0.0402, "step": 83010 }, { "epoch": 100.14484007242004, "grad_norm": 3.8786628246307373, "learning_rate": 1.9997999539374638e-05, "loss": 0.0397, "step": 83020 }, { "epoch": 100.15691007845504, "grad_norm": 4.269514083862305, "learning_rate": 1.9997999298119844e-05, "loss": 0.0403, "step": 83030 }, { "epoch": 100.16898008449004, "grad_norm": 3.9967360496520996, "learning_rate": 1.999799905686505e-05, "loss": 0.0404, "step": 83040 }, { "epoch": 100.18105009052505, "grad_norm": 4.227572441101074, "learning_rate": 1.9997998815610256e-05, "loss": 0.042, "step": 83050 }, { "epoch": 100.19312009656005, "grad_norm": 4.716184616088867, "learning_rate": 1.9997998574355462e-05, "loss": 0.042, "step": 83060 }, { "epoch": 100.20519010259505, "grad_norm": 4.221296310424805, 
"learning_rate": 1.999799833310067e-05, "loss": 0.0422, "step": 83070 }, { "epoch": 100.21726010863006, "grad_norm": 3.9601926803588867, "learning_rate": 1.9997998091845875e-05, "loss": 0.0418, "step": 83080 }, { "epoch": 100.22933011466506, "grad_norm": 4.427385330200195, "learning_rate": 1.999799785059108e-05, "loss": 0.0406, "step": 83090 }, { "epoch": 100.24140012070006, "grad_norm": 4.306036472320557, "learning_rate": 1.9997997609336287e-05, "loss": 0.0419, "step": 83100 }, { "epoch": 100.25347012673507, "grad_norm": 4.0118408203125, "learning_rate": 1.9997997368081493e-05, "loss": 0.0419, "step": 83110 }, { "epoch": 100.26554013277007, "grad_norm": 4.6136369705200195, "learning_rate": 1.99979971268267e-05, "loss": 0.0435, "step": 83120 }, { "epoch": 100.27761013880507, "grad_norm": 4.293776035308838, "learning_rate": 1.9997996885571906e-05, "loss": 0.0439, "step": 83130 }, { "epoch": 100.28968014484008, "grad_norm": 4.787362575531006, "learning_rate": 1.999799664431711e-05, "loss": 0.0419, "step": 83140 }, { "epoch": 100.30175015087508, "grad_norm": 4.053968906402588, "learning_rate": 1.9997996403062315e-05, "loss": 0.0432, "step": 83150 }, { "epoch": 100.31382015691008, "grad_norm": 4.565529823303223, "learning_rate": 1.999799616180752e-05, "loss": 0.0431, "step": 83160 }, { "epoch": 100.32589016294509, "grad_norm": 4.466250896453857, "learning_rate": 1.9997995920552727e-05, "loss": 0.0444, "step": 83170 }, { "epoch": 100.33796016898009, "grad_norm": 4.4785027503967285, "learning_rate": 1.9997995679297934e-05, "loss": 0.0445, "step": 83180 }, { "epoch": 100.35003017501509, "grad_norm": 4.1396613121032715, "learning_rate": 1.999799543804314e-05, "loss": 0.0433, "step": 83190 }, { "epoch": 100.3621001810501, "grad_norm": 4.287105560302734, "learning_rate": 1.9997995196788346e-05, "loss": 0.0436, "step": 83200 }, { "epoch": 100.3741701870851, "grad_norm": 4.750364780426025, "learning_rate": 1.9997994955533552e-05, "loss": 0.0447, "step": 83210 }, { "epoch": 
100.3862401931201, "grad_norm": 4.1655592918396, "learning_rate": 1.999799471427876e-05, "loss": 0.0429, "step": 83220 }, { "epoch": 100.3983101991551, "grad_norm": 4.993688583374023, "learning_rate": 1.9997994473023965e-05, "loss": 0.0439, "step": 83230 }, { "epoch": 100.41038020519011, "grad_norm": 4.7843098640441895, "learning_rate": 1.999799423176917e-05, "loss": 0.0455, "step": 83240 }, { "epoch": 100.42245021122511, "grad_norm": 5.187985897064209, "learning_rate": 1.9997993990514377e-05, "loss": 0.0469, "step": 83250 }, { "epoch": 100.43452021726011, "grad_norm": 4.676303863525391, "learning_rate": 1.9997993749259583e-05, "loss": 0.0475, "step": 83260 }, { "epoch": 100.44659022329512, "grad_norm": 4.227489471435547, "learning_rate": 1.999799350800479e-05, "loss": 0.0468, "step": 83270 }, { "epoch": 100.45866022933012, "grad_norm": 4.366391658782959, "learning_rate": 1.9997993266749996e-05, "loss": 0.0462, "step": 83280 }, { "epoch": 100.47073023536512, "grad_norm": 4.443951606750488, "learning_rate": 1.9997993025495202e-05, "loss": 0.0442, "step": 83290 }, { "epoch": 100.48280024140013, "grad_norm": 4.911629676818848, "learning_rate": 1.9997992784240408e-05, "loss": 0.0469, "step": 83300 }, { "epoch": 100.49487024743513, "grad_norm": 4.572961330413818, "learning_rate": 1.9997992542985614e-05, "loss": 0.0461, "step": 83310 }, { "epoch": 100.50694025347012, "grad_norm": 5.056857109069824, "learning_rate": 1.999799230173082e-05, "loss": 0.0456, "step": 83320 }, { "epoch": 100.51901025950512, "grad_norm": 4.481513977050781, "learning_rate": 1.9997992060476027e-05, "loss": 0.046, "step": 83330 }, { "epoch": 100.53108026554013, "grad_norm": 4.508208751678467, "learning_rate": 1.9997991819221233e-05, "loss": 0.0468, "step": 83340 }, { "epoch": 100.54315027157513, "grad_norm": 5.034066200256348, "learning_rate": 1.999799157796644e-05, "loss": 0.0457, "step": 83350 }, { "epoch": 100.55522027761013, "grad_norm": 4.6750807762146, "learning_rate": 1.9997991336711645e-05, 
"loss": 0.048, "step": 83360 }, { "epoch": 100.56729028364514, "grad_norm": 4.754091262817383, "learning_rate": 1.999799109545685e-05, "loss": 0.0463, "step": 83370 }, { "epoch": 100.57936028968014, "grad_norm": 5.174444675445557, "learning_rate": 1.9997990854202058e-05, "loss": 0.0457, "step": 83380 }, { "epoch": 100.59143029571514, "grad_norm": 4.1471357345581055, "learning_rate": 1.999799061294726e-05, "loss": 0.0459, "step": 83390 }, { "epoch": 100.60350030175015, "grad_norm": 4.382879734039307, "learning_rate": 1.9997990371692467e-05, "loss": 0.0469, "step": 83400 }, { "epoch": 100.61557030778515, "grad_norm": 4.924803256988525, "learning_rate": 1.9997990130437673e-05, "loss": 0.0466, "step": 83410 }, { "epoch": 100.62764031382015, "grad_norm": 4.4261474609375, "learning_rate": 1.999798988918288e-05, "loss": 0.0463, "step": 83420 }, { "epoch": 100.63971031985515, "grad_norm": 4.385590076446533, "learning_rate": 1.9997989647928086e-05, "loss": 0.0449, "step": 83430 }, { "epoch": 100.65178032589016, "grad_norm": 4.796271800994873, "learning_rate": 1.9997989406673292e-05, "loss": 0.0467, "step": 83440 }, { "epoch": 100.66385033192516, "grad_norm": 5.282628059387207, "learning_rate": 1.9997989165418498e-05, "loss": 0.0489, "step": 83450 }, { "epoch": 100.67592033796016, "grad_norm": 4.462851047515869, "learning_rate": 1.9997988924163704e-05, "loss": 0.0463, "step": 83460 }, { "epoch": 100.68799034399517, "grad_norm": 4.535895347595215, "learning_rate": 1.999798868290891e-05, "loss": 0.0475, "step": 83470 }, { "epoch": 100.70006035003017, "grad_norm": 4.674198627471924, "learning_rate": 1.9997988441654117e-05, "loss": 0.0476, "step": 83480 }, { "epoch": 100.71213035606517, "grad_norm": 4.654218673706055, "learning_rate": 1.9997988200399323e-05, "loss": 0.0474, "step": 83490 }, { "epoch": 100.72420036210018, "grad_norm": 4.449554920196533, "learning_rate": 1.999798795914453e-05, "loss": 0.0478, "step": 83500 }, { "epoch": 100.72420036210018, "eval_loss": 
12.877900123596191, "eval_runtime": 8.1337, "eval_samples_per_second": 85.692, "eval_steps_per_second": 10.819, "step": 83500 }, { "epoch": 100.73627036813518, "grad_norm": 4.547190189361572, "learning_rate": 1.9997987717889735e-05, "loss": 0.0478, "step": 83510 }, { "epoch": 100.74834037417018, "grad_norm": 5.098082542419434, "learning_rate": 1.999798747663494e-05, "loss": 0.0485, "step": 83520 }, { "epoch": 100.76041038020519, "grad_norm": 5.070422172546387, "learning_rate": 1.9997987235380148e-05, "loss": 0.0475, "step": 83530 }, { "epoch": 100.77248038624019, "grad_norm": 4.956583023071289, "learning_rate": 1.9997986994125354e-05, "loss": 0.0486, "step": 83540 }, { "epoch": 100.7845503922752, "grad_norm": 4.889718532562256, "learning_rate": 1.999798675287056e-05, "loss": 0.0491, "step": 83550 }, { "epoch": 100.7966203983102, "grad_norm": 4.79326868057251, "learning_rate": 1.9997986511615766e-05, "loss": 0.0513, "step": 83560 }, { "epoch": 100.8086904043452, "grad_norm": 4.859813213348389, "learning_rate": 1.9997986270360973e-05, "loss": 0.0477, "step": 83570 }, { "epoch": 100.8207604103802, "grad_norm": 4.923388481140137, "learning_rate": 1.999798602910618e-05, "loss": 0.0462, "step": 83580 }, { "epoch": 100.8328304164152, "grad_norm": 5.088634967803955, "learning_rate": 1.9997985787851385e-05, "loss": 0.0485, "step": 83590 }, { "epoch": 100.84490042245021, "grad_norm": 5.367968559265137, "learning_rate": 1.999798554659659e-05, "loss": 0.0509, "step": 83600 }, { "epoch": 100.85697042848521, "grad_norm": 4.959665775299072, "learning_rate": 1.9997985305341797e-05, "loss": 0.053, "step": 83610 }, { "epoch": 100.86904043452022, "grad_norm": 4.384909629821777, "learning_rate": 1.9997985064087004e-05, "loss": 0.0477, "step": 83620 }, { "epoch": 100.88111044055522, "grad_norm": 4.959144592285156, "learning_rate": 1.999798482283221e-05, "loss": 0.0509, "step": 83630 }, { "epoch": 100.89318044659022, "grad_norm": 4.636568069458008, "learning_rate": 
1.9997984581577416e-05, "loss": 0.0516, "step": 83640 }, { "epoch": 100.90525045262522, "grad_norm": 4.884067058563232, "learning_rate": 1.9997984340322622e-05, "loss": 0.0527, "step": 83650 }, { "epoch": 100.91732045866023, "grad_norm": 4.223732948303223, "learning_rate": 1.999798409906783e-05, "loss": 0.051, "step": 83660 }, { "epoch": 100.92939046469523, "grad_norm": 4.858600616455078, "learning_rate": 1.9997983857813035e-05, "loss": 0.0483, "step": 83670 }, { "epoch": 100.94146047073023, "grad_norm": 4.657817840576172, "learning_rate": 1.999798361655824e-05, "loss": 0.0493, "step": 83680 }, { "epoch": 100.95353047676524, "grad_norm": 5.000977039337158, "learning_rate": 1.9997983375303447e-05, "loss": 0.0478, "step": 83690 }, { "epoch": 100.96560048280024, "grad_norm": 4.986941814422607, "learning_rate": 1.9997983134048653e-05, "loss": 0.0532, "step": 83700 }, { "epoch": 100.97767048883524, "grad_norm": 4.827368259429932, "learning_rate": 1.999798289279386e-05, "loss": 0.0509, "step": 83710 }, { "epoch": 100.98974049487025, "grad_norm": 4.836951732635498, "learning_rate": 1.9997982651539066e-05, "loss": 0.0497, "step": 83720 }, { "epoch": 101.0012070006035, "grad_norm": 4.153903007507324, "learning_rate": 1.9997982410284272e-05, "loss": 0.0488, "step": 83730 }, { "epoch": 101.0132770066385, "grad_norm": 4.2643141746521, "learning_rate": 1.9997982169029478e-05, "loss": 0.0355, "step": 83740 }, { "epoch": 101.0253470126735, "grad_norm": 3.9032135009765625, "learning_rate": 1.9997981927774684e-05, "loss": 0.0348, "step": 83750 }, { "epoch": 101.03741701870851, "grad_norm": 4.293811798095703, "learning_rate": 1.999798168651989e-05, "loss": 0.0359, "step": 83760 }, { "epoch": 101.04948702474351, "grad_norm": 4.617110252380371, "learning_rate": 1.9997981445265097e-05, "loss": 0.0384, "step": 83770 }, { "epoch": 101.06155703077852, "grad_norm": 3.388460636138916, "learning_rate": 1.9997981204010303e-05, "loss": 0.0355, "step": 83780 }, { "epoch": 101.07362703681352, 
"grad_norm": 4.198868274688721, "learning_rate": 1.999798096275551e-05, "loss": 0.0393, "step": 83790 }, { "epoch": 101.08569704284852, "grad_norm": 4.1780242919921875, "learning_rate": 1.9997980721500712e-05, "loss": 0.0378, "step": 83800 }, { "epoch": 101.09776704888353, "grad_norm": 4.397241115570068, "learning_rate": 1.999798048024592e-05, "loss": 0.0397, "step": 83810 }, { "epoch": 101.10983705491853, "grad_norm": 4.683605670928955, "learning_rate": 1.9997980238991125e-05, "loss": 0.0392, "step": 83820 }, { "epoch": 101.12190706095353, "grad_norm": 4.322150230407715, "learning_rate": 1.999797999773633e-05, "loss": 0.0391, "step": 83830 }, { "epoch": 101.13397706698854, "grad_norm": 4.828642845153809, "learning_rate": 1.9997979756481537e-05, "loss": 0.0413, "step": 83840 }, { "epoch": 101.14604707302354, "grad_norm": 4.123122692108154, "learning_rate": 1.9997979515226743e-05, "loss": 0.0402, "step": 83850 }, { "epoch": 101.15811707905854, "grad_norm": 4.361289024353027, "learning_rate": 1.999797927397195e-05, "loss": 0.0399, "step": 83860 }, { "epoch": 101.17018708509354, "grad_norm": 4.125746726989746, "learning_rate": 1.9997979032717156e-05, "loss": 0.0409, "step": 83870 }, { "epoch": 101.18225709112855, "grad_norm": 4.365398406982422, "learning_rate": 1.9997978791462362e-05, "loss": 0.0428, "step": 83880 }, { "epoch": 101.19432709716355, "grad_norm": 3.9291200637817383, "learning_rate": 1.9997978550207568e-05, "loss": 0.0407, "step": 83890 }, { "epoch": 101.20639710319855, "grad_norm": 4.65709924697876, "learning_rate": 1.9997978308952774e-05, "loss": 0.0446, "step": 83900 }, { "epoch": 101.21846710923356, "grad_norm": 4.017755031585693, "learning_rate": 1.999797806769798e-05, "loss": 0.0423, "step": 83910 }, { "epoch": 101.23053711526856, "grad_norm": 4.319009304046631, "learning_rate": 1.9997977826443187e-05, "loss": 0.0414, "step": 83920 }, { "epoch": 101.24260712130356, "grad_norm": 4.4143967628479, "learning_rate": 1.9997977585188393e-05, "loss": 
0.0424, "step": 83930 }, { "epoch": 101.25467712733857, "grad_norm": 4.433815002441406, "learning_rate": 1.99979773439336e-05, "loss": 0.0437, "step": 83940 }, { "epoch": 101.26674713337357, "grad_norm": 4.234099864959717, "learning_rate": 1.9997977102678805e-05, "loss": 0.0425, "step": 83950 }, { "epoch": 101.27881713940857, "grad_norm": 4.175690174102783, "learning_rate": 1.999797686142401e-05, "loss": 0.0413, "step": 83960 }, { "epoch": 101.29088714544358, "grad_norm": 4.387997627258301, "learning_rate": 1.9997976620169218e-05, "loss": 0.0425, "step": 83970 }, { "epoch": 101.30295715147858, "grad_norm": 4.174242973327637, "learning_rate": 1.9997976378914424e-05, "loss": 0.0428, "step": 83980 }, { "epoch": 101.31502715751358, "grad_norm": 4.038208484649658, "learning_rate": 1.999797613765963e-05, "loss": 0.0434, "step": 83990 }, { "epoch": 101.32709716354859, "grad_norm": 4.931241989135742, "learning_rate": 1.9997975896404836e-05, "loss": 0.0442, "step": 84000 }, { "epoch": 101.32709716354859, "eval_loss": 12.883684158325195, "eval_runtime": 8.1387, "eval_samples_per_second": 85.641, "eval_steps_per_second": 10.813, "step": 84000 }, { "epoch": 101.33916716958359, "grad_norm": 4.326490879058838, "learning_rate": 1.9997975655150043e-05, "loss": 0.0441, "step": 84010 }, { "epoch": 101.35123717561859, "grad_norm": 4.296458721160889, "learning_rate": 1.999797541389525e-05, "loss": 0.047, "step": 84020 }, { "epoch": 101.3633071816536, "grad_norm": 3.8015658855438232, "learning_rate": 1.9997975172640455e-05, "loss": 0.044, "step": 84030 }, { "epoch": 101.3753771876886, "grad_norm": 4.860767364501953, "learning_rate": 1.999797493138566e-05, "loss": 0.0431, "step": 84040 }, { "epoch": 101.3874471937236, "grad_norm": 4.360193252563477, "learning_rate": 1.9997974690130864e-05, "loss": 0.0454, "step": 84050 }, { "epoch": 101.3995171997586, "grad_norm": 4.862827777862549, "learning_rate": 1.999797444887607e-05, "loss": 0.0443, "step": 84060 }, { "epoch": 101.41158720579361, 
"grad_norm": 4.508647441864014, "learning_rate": 1.9997974207621277e-05, "loss": 0.0449, "step": 84070 }, { "epoch": 101.42365721182861, "grad_norm": 4.481754779815674, "learning_rate": 1.9997973966366483e-05, "loss": 0.045, "step": 84080 }, { "epoch": 101.43572721786362, "grad_norm": 4.532022953033447, "learning_rate": 1.999797372511169e-05, "loss": 0.0467, "step": 84090 }, { "epoch": 101.44779722389862, "grad_norm": 4.7682414054870605, "learning_rate": 1.99979734838569e-05, "loss": 0.0465, "step": 84100 }, { "epoch": 101.45986722993362, "grad_norm": 4.366065979003906, "learning_rate": 1.9997973242602105e-05, "loss": 0.0446, "step": 84110 }, { "epoch": 101.47193723596862, "grad_norm": 4.315855026245117, "learning_rate": 1.999797300134731e-05, "loss": 0.0446, "step": 84120 }, { "epoch": 101.48400724200363, "grad_norm": 4.232047080993652, "learning_rate": 1.9997972760092517e-05, "loss": 0.0434, "step": 84130 }, { "epoch": 101.49607724803863, "grad_norm": 4.30462646484375, "learning_rate": 1.9997972518837723e-05, "loss": 0.048, "step": 84140 }, { "epoch": 101.50814725407362, "grad_norm": 4.6040472984313965, "learning_rate": 1.999797227758293e-05, "loss": 0.0447, "step": 84150 }, { "epoch": 101.52021726010862, "grad_norm": 4.312043190002441, "learning_rate": 1.9997972036328136e-05, "loss": 0.0444, "step": 84160 }, { "epoch": 101.53228726614363, "grad_norm": 4.467227935791016, "learning_rate": 1.9997971795073342e-05, "loss": 0.0466, "step": 84170 }, { "epoch": 101.54435727217863, "grad_norm": 4.571192741394043, "learning_rate": 1.999797155381855e-05, "loss": 0.0449, "step": 84180 }, { "epoch": 101.55642727821363, "grad_norm": 4.289122581481934, "learning_rate": 1.9997971312563755e-05, "loss": 0.0468, "step": 84190 }, { "epoch": 101.56849728424864, "grad_norm": 4.5097832679748535, "learning_rate": 1.999797107130896e-05, "loss": 0.0451, "step": 84200 }, { "epoch": 101.58056729028364, "grad_norm": 4.83054780960083, "learning_rate": 1.9997970830054167e-05, "loss": 0.0467, 
"step": 84210 }, { "epoch": 101.59263729631864, "grad_norm": 4.73270320892334, "learning_rate": 1.999797058879937e-05, "loss": 0.0473, "step": 84220 }, { "epoch": 101.60470730235365, "grad_norm": 4.780562877655029, "learning_rate": 1.9997970347544576e-05, "loss": 0.047, "step": 84230 }, { "epoch": 101.61677730838865, "grad_norm": 4.587071418762207, "learning_rate": 1.9997970106289782e-05, "loss": 0.0447, "step": 84240 }, { "epoch": 101.62884731442365, "grad_norm": 4.334619998931885, "learning_rate": 1.999796986503499e-05, "loss": 0.0475, "step": 84250 }, { "epoch": 101.64091732045866, "grad_norm": 4.605723857879639, "learning_rate": 1.9997969623780195e-05, "loss": 0.0465, "step": 84260 }, { "epoch": 101.65298732649366, "grad_norm": 4.789915084838867, "learning_rate": 1.99979693825254e-05, "loss": 0.0469, "step": 84270 }, { "epoch": 101.66505733252866, "grad_norm": 4.371413707733154, "learning_rate": 1.9997969141270607e-05, "loss": 0.0474, "step": 84280 }, { "epoch": 101.67712733856366, "grad_norm": 4.7439398765563965, "learning_rate": 1.9997968900015813e-05, "loss": 0.0496, "step": 84290 }, { "epoch": 101.68919734459867, "grad_norm": 4.687723636627197, "learning_rate": 1.999796865876102e-05, "loss": 0.0487, "step": 84300 }, { "epoch": 101.70126735063367, "grad_norm": 4.57931661605835, "learning_rate": 1.9997968417506226e-05, "loss": 0.0499, "step": 84310 }, { "epoch": 101.71333735666867, "grad_norm": 4.727833271026611, "learning_rate": 1.9997968176251432e-05, "loss": 0.0464, "step": 84320 }, { "epoch": 101.72540736270368, "grad_norm": 4.618788719177246, "learning_rate": 1.9997967934996638e-05, "loss": 0.0474, "step": 84330 }, { "epoch": 101.73747736873868, "grad_norm": 4.722240447998047, "learning_rate": 1.9997967693741844e-05, "loss": 0.0483, "step": 84340 }, { "epoch": 101.74954737477368, "grad_norm": 4.903902530670166, "learning_rate": 1.999796745248705e-05, "loss": 0.049, "step": 84350 }, { "epoch": 101.76161738080869, "grad_norm": 4.100432872772217, 
"learning_rate": 1.9997967211232257e-05, "loss": 0.0483, "step": 84360 }, { "epoch": 101.77368738684369, "grad_norm": 3.9820005893707275, "learning_rate": 1.9997966969977463e-05, "loss": 0.0498, "step": 84370 }, { "epoch": 101.7857573928787, "grad_norm": 5.001831531524658, "learning_rate": 1.999796672872267e-05, "loss": 0.0482, "step": 84380 }, { "epoch": 101.7978273989137, "grad_norm": 4.599897861480713, "learning_rate": 1.9997966487467875e-05, "loss": 0.0475, "step": 84390 }, { "epoch": 101.8098974049487, "grad_norm": 4.166306495666504, "learning_rate": 1.999796624621308e-05, "loss": 0.0462, "step": 84400 }, { "epoch": 101.8219674109837, "grad_norm": 4.923336505889893, "learning_rate": 1.9997966004958288e-05, "loss": 0.0484, "step": 84410 }, { "epoch": 101.8340374170187, "grad_norm": 4.660617828369141, "learning_rate": 1.9997965763703494e-05, "loss": 0.0486, "step": 84420 }, { "epoch": 101.84610742305371, "grad_norm": 5.0417094230651855, "learning_rate": 1.99979655224487e-05, "loss": 0.0497, "step": 84430 }, { "epoch": 101.85817742908871, "grad_norm": 4.848217010498047, "learning_rate": 1.9997965281193907e-05, "loss": 0.0495, "step": 84440 }, { "epoch": 101.87024743512372, "grad_norm": 4.627111911773682, "learning_rate": 1.9997965039939113e-05, "loss": 0.0484, "step": 84450 }, { "epoch": 101.88231744115872, "grad_norm": 4.354727745056152, "learning_rate": 1.999796479868432e-05, "loss": 0.0464, "step": 84460 }, { "epoch": 101.89438744719372, "grad_norm": 4.568583965301514, "learning_rate": 1.9997964557429522e-05, "loss": 0.0472, "step": 84470 }, { "epoch": 101.90645745322873, "grad_norm": 4.649338722229004, "learning_rate": 1.9997964316174728e-05, "loss": 0.0476, "step": 84480 }, { "epoch": 101.91852745926373, "grad_norm": 4.694737434387207, "learning_rate": 1.9997964074919934e-05, "loss": 0.0502, "step": 84490 }, { "epoch": 101.93059746529873, "grad_norm": 4.242213726043701, "learning_rate": 1.999796383366514e-05, "loss": 0.0481, "step": 84500 }, { "epoch": 
101.93059746529873, "eval_loss": 12.880363464355469, "eval_runtime": 8.1732, "eval_samples_per_second": 85.279, "eval_steps_per_second": 10.767, "step": 84500 }, { "epoch": 101.94266747133373, "grad_norm": 4.727208137512207, "learning_rate": 1.9997963592410347e-05, "loss": 0.0486, "step": 84510 }, { "epoch": 101.95473747736874, "grad_norm": 4.466462135314941, "learning_rate": 1.9997963351155553e-05, "loss": 0.0507, "step": 84520 }, { "epoch": 101.96680748340374, "grad_norm": 4.809169292449951, "learning_rate": 1.999796310990076e-05, "loss": 0.048, "step": 84530 }, { "epoch": 101.97887748943874, "grad_norm": 4.376986503601074, "learning_rate": 1.9997962868645965e-05, "loss": 0.0479, "step": 84540 }, { "epoch": 101.99094749547375, "grad_norm": 4.873586177825928, "learning_rate": 1.999796262739117e-05, "loss": 0.0513, "step": 84550 }, { "epoch": 102.002414001207, "grad_norm": 4.29256534576416, "learning_rate": 1.9997962386136378e-05, "loss": 0.0462, "step": 84560 }, { "epoch": 102.014484007242, "grad_norm": 4.096035003662109, "learning_rate": 1.9997962144881584e-05, "loss": 0.0332, "step": 84570 }, { "epoch": 102.026554013277, "grad_norm": 4.026106834411621, "learning_rate": 1.999796190362679e-05, "loss": 0.0361, "step": 84580 }, { "epoch": 102.03862401931201, "grad_norm": 4.20090913772583, "learning_rate": 1.9997961662371996e-05, "loss": 0.0353, "step": 84590 }, { "epoch": 102.05069402534701, "grad_norm": 4.026282787322998, "learning_rate": 1.9997961421117203e-05, "loss": 0.036, "step": 84600 }, { "epoch": 102.06276403138202, "grad_norm": 3.924811601638794, "learning_rate": 1.999796117986241e-05, "loss": 0.0359, "step": 84610 }, { "epoch": 102.07483403741702, "grad_norm": 4.2500691413879395, "learning_rate": 1.9997960938607615e-05, "loss": 0.0376, "step": 84620 }, { "epoch": 102.08690404345202, "grad_norm": 4.2561187744140625, "learning_rate": 1.999796069735282e-05, "loss": 0.0375, "step": 84630 }, { "epoch": 102.09897404948703, "grad_norm": 4.301304340362549, 
"learning_rate": 1.9997960456098027e-05, "loss": 0.0389, "step": 84640 }, { "epoch": 102.11104405552203, "grad_norm": 3.947740077972412, "learning_rate": 1.9997960214843234e-05, "loss": 0.0375, "step": 84650 }, { "epoch": 102.12311406155703, "grad_norm": 4.332725524902344, "learning_rate": 1.999795997358844e-05, "loss": 0.0391, "step": 84660 }, { "epoch": 102.13518406759204, "grad_norm": 4.714250564575195, "learning_rate": 1.9997959732333646e-05, "loss": 0.0413, "step": 84670 }, { "epoch": 102.14725407362704, "grad_norm": 4.180228233337402, "learning_rate": 1.9997959491078852e-05, "loss": 0.0426, "step": 84680 }, { "epoch": 102.15932407966204, "grad_norm": 4.488434791564941, "learning_rate": 1.999795924982406e-05, "loss": 0.041, "step": 84690 }, { "epoch": 102.17139408569705, "grad_norm": 3.9680726528167725, "learning_rate": 1.9997959008569265e-05, "loss": 0.0397, "step": 84700 }, { "epoch": 102.18346409173205, "grad_norm": 4.267804145812988, "learning_rate": 1.999795876731447e-05, "loss": 0.0413, "step": 84710 }, { "epoch": 102.19553409776705, "grad_norm": 4.1917548179626465, "learning_rate": 1.9997958526059677e-05, "loss": 0.0405, "step": 84720 }, { "epoch": 102.20760410380205, "grad_norm": 4.21985387802124, "learning_rate": 1.9997958284804883e-05, "loss": 0.0405, "step": 84730 }, { "epoch": 102.21967410983706, "grad_norm": 4.087615013122559, "learning_rate": 1.999795804355009e-05, "loss": 0.0402, "step": 84740 }, { "epoch": 102.23174411587206, "grad_norm": 4.541911602020264, "learning_rate": 1.9997957802295296e-05, "loss": 0.0437, "step": 84750 }, { "epoch": 102.24381412190706, "grad_norm": 4.727056503295898, "learning_rate": 1.9997957561040502e-05, "loss": 0.0437, "step": 84760 }, { "epoch": 102.25588412794207, "grad_norm": 4.21544075012207, "learning_rate": 1.9997957319785708e-05, "loss": 0.0418, "step": 84770 }, { "epoch": 102.26795413397707, "grad_norm": 4.2461442947387695, "learning_rate": 1.9997957078530914e-05, "loss": 0.0415, "step": 84780 }, { "epoch": 
102.28002414001207, "grad_norm": 4.5537567138671875, "learning_rate": 1.999795683727612e-05, "loss": 0.0436, "step": 84790 }, { "epoch": 102.29209414604708, "grad_norm": 3.9765758514404297, "learning_rate": 1.9997956596021327e-05, "loss": 0.0423, "step": 84800 }, { "epoch": 102.30416415208208, "grad_norm": 4.375091552734375, "learning_rate": 1.9997956354766533e-05, "loss": 0.0434, "step": 84810 }, { "epoch": 102.31623415811708, "grad_norm": 4.464938163757324, "learning_rate": 1.999795611351174e-05, "loss": 0.0428, "step": 84820 }, { "epoch": 102.32830416415209, "grad_norm": 4.825338363647461, "learning_rate": 1.9997955872256946e-05, "loss": 0.0439, "step": 84830 }, { "epoch": 102.34037417018709, "grad_norm": 4.051720142364502, "learning_rate": 1.9997955631002152e-05, "loss": 0.0419, "step": 84840 }, { "epoch": 102.3524441762221, "grad_norm": 4.781991958618164, "learning_rate": 1.9997955389747358e-05, "loss": 0.0416, "step": 84850 }, { "epoch": 102.3645141822571, "grad_norm": 4.7401275634765625, "learning_rate": 1.9997955148492564e-05, "loss": 0.041, "step": 84860 }, { "epoch": 102.3765841882921, "grad_norm": 5.218163967132568, "learning_rate": 1.999795490723777e-05, "loss": 0.0444, "step": 84870 }, { "epoch": 102.3886541943271, "grad_norm": 4.20598030090332, "learning_rate": 1.9997954665982973e-05, "loss": 0.0449, "step": 84880 }, { "epoch": 102.4007242003621, "grad_norm": 4.452273845672607, "learning_rate": 1.999795442472818e-05, "loss": 0.0444, "step": 84890 }, { "epoch": 102.41279420639711, "grad_norm": 5.002067565917969, "learning_rate": 1.9997954183473386e-05, "loss": 0.0437, "step": 84900 }, { "epoch": 102.42486421243211, "grad_norm": 4.579145431518555, "learning_rate": 1.9997953942218592e-05, "loss": 0.0439, "step": 84910 }, { "epoch": 102.43693421846712, "grad_norm": 4.924716949462891, "learning_rate": 1.9997953700963798e-05, "loss": 0.0457, "step": 84920 }, { "epoch": 102.44900422450212, "grad_norm": 4.508492946624756, "learning_rate": 
1.9997953459709004e-05, "loss": 0.044, "step": 84930 }, { "epoch": 102.46107423053712, "grad_norm": 4.816112518310547, "learning_rate": 1.999795321845421e-05, "loss": 0.0464, "step": 84940 }, { "epoch": 102.47314423657213, "grad_norm": 4.519419193267822, "learning_rate": 1.9997952977199417e-05, "loss": 0.0445, "step": 84950 }, { "epoch": 102.48521424260713, "grad_norm": 4.516646862030029, "learning_rate": 1.9997952735944623e-05, "loss": 0.0441, "step": 84960 }, { "epoch": 102.49728424864213, "grad_norm": 4.634546279907227, "learning_rate": 1.999795249468983e-05, "loss": 0.0444, "step": 84970 }, { "epoch": 102.50935425467712, "grad_norm": 4.839666843414307, "learning_rate": 1.9997952253435035e-05, "loss": 0.0444, "step": 84980 }, { "epoch": 102.52142426071212, "grad_norm": 4.818542957305908, "learning_rate": 1.999795201218024e-05, "loss": 0.0454, "step": 84990 }, { "epoch": 102.53349426674713, "grad_norm": 5.037624835968018, "learning_rate": 1.9997951770925448e-05, "loss": 0.0448, "step": 85000 }, { "epoch": 102.53349426674713, "eval_loss": 12.882283210754395, "eval_runtime": 8.132, "eval_samples_per_second": 85.71, "eval_steps_per_second": 10.821, "step": 85000 }, { "epoch": 102.54556427278213, "grad_norm": 4.1324052810668945, "learning_rate": 1.9997951529670654e-05, "loss": 0.0457, "step": 85010 }, { "epoch": 102.55763427881713, "grad_norm": 4.237008094787598, "learning_rate": 1.999795128841586e-05, "loss": 0.0484, "step": 85020 }, { "epoch": 102.56970428485214, "grad_norm": 4.4525980949401855, "learning_rate": 1.9997951047161066e-05, "loss": 0.044, "step": 85030 }, { "epoch": 102.58177429088714, "grad_norm": 4.840774059295654, "learning_rate": 1.9997950805906273e-05, "loss": 0.0442, "step": 85040 }, { "epoch": 102.59384429692214, "grad_norm": 4.5111165046691895, "learning_rate": 1.999795056465148e-05, "loss": 0.0485, "step": 85050 }, { "epoch": 102.60591430295715, "grad_norm": 4.563551902770996, "learning_rate": 1.9997950323396685e-05, "loss": 0.0465, "step": 
85060 }, { "epoch": 102.61798430899215, "grad_norm": 5.01859188079834, "learning_rate": 1.999795008214189e-05, "loss": 0.0484, "step": 85070 }, { "epoch": 102.63005431502715, "grad_norm": 4.692659854888916, "learning_rate": 1.9997949840887098e-05, "loss": 0.0482, "step": 85080 }, { "epoch": 102.64212432106216, "grad_norm": 4.898504257202148, "learning_rate": 1.9997949599632304e-05, "loss": 0.0464, "step": 85090 }, { "epoch": 102.65419432709716, "grad_norm": 4.107354640960693, "learning_rate": 1.999794935837751e-05, "loss": 0.0463, "step": 85100 }, { "epoch": 102.66626433313216, "grad_norm": 4.375610828399658, "learning_rate": 1.9997949117122716e-05, "loss": 0.0477, "step": 85110 }, { "epoch": 102.67833433916717, "grad_norm": 4.722507953643799, "learning_rate": 1.9997948875867922e-05, "loss": 0.0456, "step": 85120 }, { "epoch": 102.69040434520217, "grad_norm": 4.801541328430176, "learning_rate": 1.9997948634613125e-05, "loss": 0.0486, "step": 85130 }, { "epoch": 102.70247435123717, "grad_norm": 4.358601093292236, "learning_rate": 1.999794839335833e-05, "loss": 0.0469, "step": 85140 }, { "epoch": 102.71454435727217, "grad_norm": 4.309055805206299, "learning_rate": 1.9997948152103538e-05, "loss": 0.0462, "step": 85150 }, { "epoch": 102.72661436330718, "grad_norm": 4.897287368774414, "learning_rate": 1.9997947910848744e-05, "loss": 0.0464, "step": 85160 }, { "epoch": 102.73868436934218, "grad_norm": 4.854236125946045, "learning_rate": 1.999794766959395e-05, "loss": 0.0476, "step": 85170 }, { "epoch": 102.75075437537718, "grad_norm": 4.875826835632324, "learning_rate": 1.9997947428339156e-05, "loss": 0.0495, "step": 85180 }, { "epoch": 102.76282438141219, "grad_norm": 4.240857124328613, "learning_rate": 1.9997947187084366e-05, "loss": 0.0485, "step": 85190 }, { "epoch": 102.77489438744719, "grad_norm": 4.366481304168701, "learning_rate": 1.9997946945829572e-05, "loss": 0.0475, "step": 85200 }, { "epoch": 102.7869643934822, "grad_norm": 4.284653663635254, 
"learning_rate": 1.999794670457478e-05, "loss": 0.0505, "step": 85210 }, { "epoch": 102.7990343995172, "grad_norm": 4.475992202758789, "learning_rate": 1.9997946463319985e-05, "loss": 0.0467, "step": 85220 }, { "epoch": 102.8111044055522, "grad_norm": 4.829367637634277, "learning_rate": 1.999794622206519e-05, "loss": 0.0466, "step": 85230 }, { "epoch": 102.8231744115872, "grad_norm": 4.347209453582764, "learning_rate": 1.9997945980810397e-05, "loss": 0.0461, "step": 85240 }, { "epoch": 102.8352444176222, "grad_norm": 4.589097023010254, "learning_rate": 1.9997945739555603e-05, "loss": 0.0466, "step": 85250 }, { "epoch": 102.84731442365721, "grad_norm": 4.8352766036987305, "learning_rate": 1.999794549830081e-05, "loss": 0.0488, "step": 85260 }, { "epoch": 102.85938442969221, "grad_norm": 4.567698955535889, "learning_rate": 1.9997945257046016e-05, "loss": 0.0476, "step": 85270 }, { "epoch": 102.87145443572722, "grad_norm": 4.937960624694824, "learning_rate": 1.9997945015791222e-05, "loss": 0.0485, "step": 85280 }, { "epoch": 102.88352444176222, "grad_norm": 3.972806692123413, "learning_rate": 1.9997944774536425e-05, "loss": 0.0468, "step": 85290 }, { "epoch": 102.89559444779722, "grad_norm": 4.499880790710449, "learning_rate": 1.999794453328163e-05, "loss": 0.049, "step": 85300 }, { "epoch": 102.90766445383223, "grad_norm": 4.882552146911621, "learning_rate": 1.9997944292026837e-05, "loss": 0.0494, "step": 85310 }, { "epoch": 102.91973445986723, "grad_norm": 4.907533645629883, "learning_rate": 1.9997944050772043e-05, "loss": 0.0492, "step": 85320 }, { "epoch": 102.93180446590223, "grad_norm": 4.488125801086426, "learning_rate": 1.999794380951725e-05, "loss": 0.049, "step": 85330 }, { "epoch": 102.94387447193724, "grad_norm": 5.062844753265381, "learning_rate": 1.9997943568262456e-05, "loss": 0.0485, "step": 85340 }, { "epoch": 102.95594447797224, "grad_norm": 4.653478622436523, "learning_rate": 1.9997943327007662e-05, "loss": 0.0475, "step": 85350 }, { "epoch": 
102.96801448400724, "grad_norm": 4.686293601989746, "learning_rate": 1.9997943085752868e-05, "loss": 0.0488, "step": 85360 }, { "epoch": 102.98008449004224, "grad_norm": 5.053554058074951, "learning_rate": 1.9997942844498074e-05, "loss": 0.0505, "step": 85370 }, { "epoch": 102.99215449607725, "grad_norm": 4.941781520843506, "learning_rate": 1.999794260324328e-05, "loss": 0.0479, "step": 85380 }, { "epoch": 103.0036210018105, "grad_norm": 3.805009365081787, "learning_rate": 1.9997942361988487e-05, "loss": 0.043, "step": 85390 }, { "epoch": 103.0156910078455, "grad_norm": 4.1741509437561035, "learning_rate": 1.9997942120733693e-05, "loss": 0.0367, "step": 85400 }, { "epoch": 103.02776101388051, "grad_norm": 3.7853524684906006, "learning_rate": 1.99979418794789e-05, "loss": 0.0375, "step": 85410 }, { "epoch": 103.03983101991551, "grad_norm": 4.2426934242248535, "learning_rate": 1.9997941638224105e-05, "loss": 0.0359, "step": 85420 }, { "epoch": 103.05190102595051, "grad_norm": 3.994798183441162, "learning_rate": 1.9997941396969312e-05, "loss": 0.035, "step": 85430 }, { "epoch": 103.06397103198552, "grad_norm": 3.458245277404785, "learning_rate": 1.9997941155714518e-05, "loss": 0.036, "step": 85440 }, { "epoch": 103.07604103802052, "grad_norm": 4.468960762023926, "learning_rate": 1.9997940914459724e-05, "loss": 0.0383, "step": 85450 }, { "epoch": 103.08811104405552, "grad_norm": 3.4453301429748535, "learning_rate": 1.999794067320493e-05, "loss": 0.0367, "step": 85460 }, { "epoch": 103.10018105009053, "grad_norm": 4.314015865325928, "learning_rate": 1.9997940431950137e-05, "loss": 0.0393, "step": 85470 }, { "epoch": 103.11225105612553, "grad_norm": 3.8216118812561035, "learning_rate": 1.9997940190695343e-05, "loss": 0.0399, "step": 85480 }, { "epoch": 103.12432106216053, "grad_norm": 4.374327182769775, "learning_rate": 1.999793994944055e-05, "loss": 0.0412, "step": 85490 }, { "epoch": 103.13639106819554, "grad_norm": 4.170340538024902, "learning_rate": 
1.9997939708185755e-05, "loss": 0.0395, "step": 85500 }, { "epoch": 103.13639106819554, "eval_loss": 12.86703872680664, "eval_runtime": 8.1244, "eval_samples_per_second": 85.791, "eval_steps_per_second": 10.832, "step": 85500 }, { "epoch": 103.14846107423054, "grad_norm": 4.208583354949951, "learning_rate": 1.999793946693096e-05, "loss": 0.0384, "step": 85510 }, { "epoch": 103.16053108026554, "grad_norm": 4.142484188079834, "learning_rate": 1.9997939225676168e-05, "loss": 0.0413, "step": 85520 }, { "epoch": 103.17260108630055, "grad_norm": 4.277204990386963, "learning_rate": 1.9997938984421374e-05, "loss": 0.0405, "step": 85530 }, { "epoch": 103.18467109233555, "grad_norm": 4.3769330978393555, "learning_rate": 1.9997938743166577e-05, "loss": 0.0396, "step": 85540 }, { "epoch": 103.19674109837055, "grad_norm": 4.2090983390808105, "learning_rate": 1.9997938501911783e-05, "loss": 0.04, "step": 85550 }, { "epoch": 103.20881110440556, "grad_norm": 4.32595157623291, "learning_rate": 1.999793826065699e-05, "loss": 0.0412, "step": 85560 }, { "epoch": 103.22088111044056, "grad_norm": 4.499838829040527, "learning_rate": 1.9997938019402195e-05, "loss": 0.0436, "step": 85570 }, { "epoch": 103.23295111647556, "grad_norm": 4.094456195831299, "learning_rate": 1.99979377781474e-05, "loss": 0.0409, "step": 85580 }, { "epoch": 103.24502112251056, "grad_norm": 4.248379230499268, "learning_rate": 1.9997937536892608e-05, "loss": 0.0432, "step": 85590 }, { "epoch": 103.25709112854557, "grad_norm": 4.833425045013428, "learning_rate": 1.9997937295637814e-05, "loss": 0.0439, "step": 85600 }, { "epoch": 103.26916113458057, "grad_norm": 4.772522449493408, "learning_rate": 1.999793705438302e-05, "loss": 0.0425, "step": 85610 }, { "epoch": 103.28123114061557, "grad_norm": 4.942383766174316, "learning_rate": 1.9997936813128226e-05, "loss": 0.042, "step": 85620 }, { "epoch": 103.29330114665058, "grad_norm": 4.540747165679932, "learning_rate": 1.9997936571873433e-05, "loss": 0.0418, "step": 85630 
}, { "epoch": 103.30537115268558, "grad_norm": 4.235035419464111, "learning_rate": 1.999793633061864e-05, "loss": 0.0417, "step": 85640 }, { "epoch": 103.31744115872058, "grad_norm": 3.857327461242676, "learning_rate": 1.9997936089363845e-05, "loss": 0.0456, "step": 85650 }, { "epoch": 103.32951116475559, "grad_norm": 4.582422256469727, "learning_rate": 1.999793584810905e-05, "loss": 0.0433, "step": 85660 }, { "epoch": 103.34158117079059, "grad_norm": 4.265072822570801, "learning_rate": 1.9997935606854257e-05, "loss": 0.0431, "step": 85670 }, { "epoch": 103.3536511768256, "grad_norm": 4.49191951751709, "learning_rate": 1.9997935365599464e-05, "loss": 0.0432, "step": 85680 }, { "epoch": 103.3657211828606, "grad_norm": 4.645270347595215, "learning_rate": 1.999793512434467e-05, "loss": 0.0443, "step": 85690 }, { "epoch": 103.3777911888956, "grad_norm": 4.240724086761475, "learning_rate": 1.9997934883089876e-05, "loss": 0.0442, "step": 85700 }, { "epoch": 103.3898611949306, "grad_norm": 4.297418594360352, "learning_rate": 1.9997934641835082e-05, "loss": 0.0427, "step": 85710 }, { "epoch": 103.4019312009656, "grad_norm": 4.592187404632568, "learning_rate": 1.999793440058029e-05, "loss": 0.0418, "step": 85720 }, { "epoch": 103.41400120700061, "grad_norm": 4.540578365325928, "learning_rate": 1.9997934159325495e-05, "loss": 0.0445, "step": 85730 }, { "epoch": 103.42607121303561, "grad_norm": 4.253195285797119, "learning_rate": 1.99979339180707e-05, "loss": 0.046, "step": 85740 }, { "epoch": 103.43814121907062, "grad_norm": 4.186211585998535, "learning_rate": 1.9997933676815907e-05, "loss": 0.0429, "step": 85750 }, { "epoch": 103.45021122510562, "grad_norm": 4.184093475341797, "learning_rate": 1.9997933435561113e-05, "loss": 0.0443, "step": 85760 }, { "epoch": 103.46228123114062, "grad_norm": 4.204268455505371, "learning_rate": 1.999793319430632e-05, "loss": 0.0434, "step": 85770 }, { "epoch": 103.47435123717563, "grad_norm": 4.619688987731934, "learning_rate": 
1.9997932953051526e-05, "loss": 0.0409, "step": 85780 }, { "epoch": 103.48642124321063, "grad_norm": 4.595060348510742, "learning_rate": 1.9997932711796732e-05, "loss": 0.043, "step": 85790 }, { "epoch": 103.49849124924563, "grad_norm": 4.689918518066406, "learning_rate": 1.9997932470541938e-05, "loss": 0.0431, "step": 85800 }, { "epoch": 103.51056125528062, "grad_norm": 4.713631629943848, "learning_rate": 1.9997932229287145e-05, "loss": 0.0481, "step": 85810 }, { "epoch": 103.52263126131562, "grad_norm": 4.397254467010498, "learning_rate": 1.999793198803235e-05, "loss": 0.0458, "step": 85820 }, { "epoch": 103.53470126735063, "grad_norm": Infinity, "learning_rate": 1.9997931746777557e-05, "loss": 0.0439, "step": 85830 }, { "epoch": 103.54677127338563, "grad_norm": 4.203981399536133, "learning_rate": 1.9997931505522763e-05, "loss": 0.0467, "step": 85840 }, { "epoch": 103.55884127942063, "grad_norm": 4.275352478027344, "learning_rate": 1.999793126426797e-05, "loss": 0.0436, "step": 85850 }, { "epoch": 103.57091128545564, "grad_norm": 4.9550604820251465, "learning_rate": 1.9997931023013176e-05, "loss": 0.0481, "step": 85860 }, { "epoch": 103.58298129149064, "grad_norm": 4.146185874938965, "learning_rate": 1.9997930781758382e-05, "loss": 0.0456, "step": 85870 }, { "epoch": 103.59505129752564, "grad_norm": 4.096699237823486, "learning_rate": 1.9997930540503588e-05, "loss": 0.0454, "step": 85880 }, { "epoch": 103.60712130356065, "grad_norm": 4.631570339202881, "learning_rate": 1.9997930299248794e-05, "loss": 0.0437, "step": 85890 }, { "epoch": 103.61919130959565, "grad_norm": 4.618978500366211, "learning_rate": 1.9997930057994e-05, "loss": 0.0479, "step": 85900 }, { "epoch": 103.63126131563065, "grad_norm": 5.0660529136657715, "learning_rate": 1.9997929816739207e-05, "loss": 0.0447, "step": 85910 }, { "epoch": 103.64333132166566, "grad_norm": 4.772634506225586, "learning_rate": 1.9997929575484413e-05, "loss": 0.0472, "step": 85920 }, { "epoch": 103.65540132770066, 
"grad_norm": 4.34462833404541, "learning_rate": 1.999792933422962e-05, "loss": 0.0472, "step": 85930 }, { "epoch": 103.66747133373566, "grad_norm": 4.308986186981201, "learning_rate": 1.9997929092974825e-05, "loss": 0.0469, "step": 85940 }, { "epoch": 103.67954133977067, "grad_norm": 4.27815580368042, "learning_rate": 1.999792885172003e-05, "loss": 0.0471, "step": 85950 }, { "epoch": 103.69161134580567, "grad_norm": 4.357987880706787, "learning_rate": 1.9997928610465234e-05, "loss": 0.047, "step": 85960 }, { "epoch": 103.70368135184067, "grad_norm": 4.107696533203125, "learning_rate": 1.999792836921044e-05, "loss": 0.0442, "step": 85970 }, { "epoch": 103.71575135787567, "grad_norm": 5.312450408935547, "learning_rate": 1.9997928127955647e-05, "loss": 0.0475, "step": 85980 }, { "epoch": 103.72782136391068, "grad_norm": 4.829864025115967, "learning_rate": 1.9997927886700853e-05, "loss": 0.047, "step": 85990 }, { "epoch": 103.73989136994568, "grad_norm": 4.733739852905273, "learning_rate": 1.999792764544606e-05, "loss": 0.0472, "step": 86000 }, { "epoch": 103.73989136994568, "eval_loss": 12.914456367492676, "eval_runtime": 8.129, "eval_samples_per_second": 85.742, "eval_steps_per_second": 10.825, "step": 86000 }, { "epoch": 103.75196137598068, "grad_norm": 4.406552791595459, "learning_rate": 1.9997927404191265e-05, "loss": 0.0489, "step": 86010 }, { "epoch": 103.76403138201569, "grad_norm": 5.090130805969238, "learning_rate": 1.999792716293647e-05, "loss": 0.047, "step": 86020 }, { "epoch": 103.77610138805069, "grad_norm": 4.610297203063965, "learning_rate": 1.9997926921681678e-05, "loss": 0.0462, "step": 86030 }, { "epoch": 103.7881713940857, "grad_norm": 4.547537326812744, "learning_rate": 1.9997926680426884e-05, "loss": 0.0463, "step": 86040 }, { "epoch": 103.8002414001207, "grad_norm": 5.002312183380127, "learning_rate": 1.999792643917209e-05, "loss": 0.0474, "step": 86050 }, { "epoch": 103.8123114061557, "grad_norm": 4.852406024932861, "learning_rate": 
1.9997926197917296e-05, "loss": 0.0453, "step": 86060 }, { "epoch": 103.8243814121907, "grad_norm": 4.76910924911499, "learning_rate": 1.9997925956662503e-05, "loss": 0.0485, "step": 86070 }, { "epoch": 103.8364514182257, "grad_norm": 4.779724597930908, "learning_rate": 1.999792571540771e-05, "loss": 0.0521, "step": 86080 }, { "epoch": 103.84852142426071, "grad_norm": 4.732664585113525, "learning_rate": 1.9997925474152915e-05, "loss": 0.0489, "step": 86090 }, { "epoch": 103.86059143029571, "grad_norm": 4.192737579345703, "learning_rate": 1.999792523289812e-05, "loss": 0.0474, "step": 86100 }, { "epoch": 103.87266143633072, "grad_norm": 4.44954252243042, "learning_rate": 1.9997924991643328e-05, "loss": 0.0483, "step": 86110 }, { "epoch": 103.88473144236572, "grad_norm": 4.372793197631836, "learning_rate": 1.9997924750388534e-05, "loss": 0.0484, "step": 86120 }, { "epoch": 103.89680144840072, "grad_norm": 4.176791191101074, "learning_rate": 1.999792450913374e-05, "loss": 0.0491, "step": 86130 }, { "epoch": 103.90887145443573, "grad_norm": 4.733299732208252, "learning_rate": 1.9997924267878946e-05, "loss": 0.0455, "step": 86140 }, { "epoch": 103.92094146047073, "grad_norm": 4.430912494659424, "learning_rate": 1.9997924026624152e-05, "loss": 0.0468, "step": 86150 }, { "epoch": 103.93301146650573, "grad_norm": 4.998560905456543, "learning_rate": 1.999792378536936e-05, "loss": 0.0484, "step": 86160 }, { "epoch": 103.94508147254074, "grad_norm": 4.710292816162109, "learning_rate": 1.9997923544114565e-05, "loss": 0.0516, "step": 86170 }, { "epoch": 103.95715147857574, "grad_norm": 4.388606548309326, "learning_rate": 1.999792330285977e-05, "loss": 0.0472, "step": 86180 }, { "epoch": 103.96922148461074, "grad_norm": 4.651697158813477, "learning_rate": 1.9997923061604977e-05, "loss": 0.0495, "step": 86190 }, { "epoch": 103.98129149064575, "grad_norm": 4.487128257751465, "learning_rate": 1.9997922820350184e-05, "loss": 0.0471, "step": 86200 }, { "epoch": 103.99336149668075, 
"grad_norm": 4.797818660736084, "learning_rate": 1.9997922579095386e-05, "loss": 0.0467, "step": 86210 }, { "epoch": 104.004828002414, "grad_norm": 4.107724666595459, "learning_rate": 1.9997922337840593e-05, "loss": 0.0399, "step": 86220 }, { "epoch": 104.016898008449, "grad_norm": 3.6780197620391846, "learning_rate": 1.99979220965858e-05, "loss": 0.0348, "step": 86230 }, { "epoch": 104.02896801448401, "grad_norm": 3.9906129837036133, "learning_rate": 1.9997921855331005e-05, "loss": 0.0333, "step": 86240 }, { "epoch": 104.04103802051901, "grad_norm": 3.9644782543182373, "learning_rate": 1.999792161407621e-05, "loss": 0.035, "step": 86250 }, { "epoch": 104.05310802655401, "grad_norm": 3.9194836616516113, "learning_rate": 1.9997921372821417e-05, "loss": 0.0362, "step": 86260 }, { "epoch": 104.06517803258902, "grad_norm": 4.067085266113281, "learning_rate": 1.9997921131566627e-05, "loss": 0.0361, "step": 86270 }, { "epoch": 104.07724803862402, "grad_norm": 3.965517282485962, "learning_rate": 1.9997920890311833e-05, "loss": 0.0392, "step": 86280 }, { "epoch": 104.08931804465902, "grad_norm": 3.983367681503296, "learning_rate": 1.999792064905704e-05, "loss": 0.0382, "step": 86290 }, { "epoch": 104.10138805069403, "grad_norm": 4.164802074432373, "learning_rate": 1.9997920407802246e-05, "loss": 0.038, "step": 86300 }, { "epoch": 104.11345805672903, "grad_norm": 4.259883880615234, "learning_rate": 1.9997920166547452e-05, "loss": 0.0406, "step": 86310 }, { "epoch": 104.12552806276403, "grad_norm": 4.250123023986816, "learning_rate": 1.9997919925292658e-05, "loss": 0.0402, "step": 86320 }, { "epoch": 104.13759806879904, "grad_norm": 4.848031997680664, "learning_rate": 1.9997919684037864e-05, "loss": 0.0407, "step": 86330 }, { "epoch": 104.14966807483404, "grad_norm": 4.095828533172607, "learning_rate": 1.999791944278307e-05, "loss": 0.0394, "step": 86340 }, { "epoch": 104.16173808086904, "grad_norm": 3.7971456050872803, "learning_rate": 1.9997919201528277e-05, "loss": 
0.0381, "step": 86350 }, { "epoch": 104.17380808690405, "grad_norm": 4.353437423706055, "learning_rate": 1.9997918960273483e-05, "loss": 0.0424, "step": 86360 }, { "epoch": 104.18587809293905, "grad_norm": 4.410680770874023, "learning_rate": 1.9997918719018686e-05, "loss": 0.0401, "step": 86370 }, { "epoch": 104.19794809897405, "grad_norm": 4.896165370941162, "learning_rate": 1.9997918477763892e-05, "loss": 0.042, "step": 86380 }, { "epoch": 104.21001810500906, "grad_norm": 4.292260646820068, "learning_rate": 1.9997918236509098e-05, "loss": 0.0409, "step": 86390 }, { "epoch": 104.22208811104406, "grad_norm": 4.357766151428223, "learning_rate": 1.9997917995254304e-05, "loss": 0.0414, "step": 86400 }, { "epoch": 104.23415811707906, "grad_norm": 4.524019718170166, "learning_rate": 1.999791775399951e-05, "loss": 0.0416, "step": 86410 }, { "epoch": 104.24622812311407, "grad_norm": 4.387375831604004, "learning_rate": 1.9997917512744717e-05, "loss": 0.0408, "step": 86420 }, { "epoch": 104.25829812914907, "grad_norm": 4.288451671600342, "learning_rate": 1.9997917271489923e-05, "loss": 0.0421, "step": 86430 }, { "epoch": 104.27036813518407, "grad_norm": 4.630490779876709, "learning_rate": 1.999791703023513e-05, "loss": 0.0411, "step": 86440 }, { "epoch": 104.28243814121907, "grad_norm": 4.184970378875732, "learning_rate": 1.9997916788980336e-05, "loss": 0.0398, "step": 86450 }, { "epoch": 104.29450814725408, "grad_norm": 3.9620883464813232, "learning_rate": 1.9997916547725542e-05, "loss": 0.0403, "step": 86460 }, { "epoch": 104.30657815328908, "grad_norm": 4.187701225280762, "learning_rate": 1.9997916306470748e-05, "loss": 0.0406, "step": 86470 }, { "epoch": 104.31864815932408, "grad_norm": 4.15714693069458, "learning_rate": 1.9997916065215954e-05, "loss": 0.0434, "step": 86480 }, { "epoch": 104.33071816535909, "grad_norm": 4.294384002685547, "learning_rate": 1.999791582396116e-05, "loss": 0.0426, "step": 86490 }, { "epoch": 104.34278817139409, "grad_norm": 
4.150770664215088, "learning_rate": 1.9997915582706367e-05, "loss": 0.0415, "step": 86500 }, { "epoch": 104.34278817139409, "eval_loss": 12.919672012329102, "eval_runtime": 8.1291, "eval_samples_per_second": 85.741, "eval_steps_per_second": 10.825, "step": 86500 }, { "epoch": 104.3548581774291, "grad_norm": 4.570920944213867, "learning_rate": 1.9997915341451573e-05, "loss": 0.0441, "step": 86510 }, { "epoch": 104.3669281834641, "grad_norm": 4.6230788230896, "learning_rate": 1.999791510019678e-05, "loss": 0.0459, "step": 86520 }, { "epoch": 104.3789981894991, "grad_norm": 4.382681369781494, "learning_rate": 1.9997914858941985e-05, "loss": 0.0428, "step": 86530 }, { "epoch": 104.3910681955341, "grad_norm": 4.557504653930664, "learning_rate": 1.999791461768719e-05, "loss": 0.0446, "step": 86540 }, { "epoch": 104.4031382015691, "grad_norm": 4.420532703399658, "learning_rate": 1.9997914376432398e-05, "loss": 0.0413, "step": 86550 }, { "epoch": 104.41520820760411, "grad_norm": 4.064143180847168, "learning_rate": 1.9997914135177604e-05, "loss": 0.0416, "step": 86560 }, { "epoch": 104.42727821363911, "grad_norm": 4.404715538024902, "learning_rate": 1.999791389392281e-05, "loss": 0.0451, "step": 86570 }, { "epoch": 104.43934821967412, "grad_norm": 4.519281387329102, "learning_rate": 1.9997913652668016e-05, "loss": 0.0432, "step": 86580 }, { "epoch": 104.45141822570912, "grad_norm": 4.361261367797852, "learning_rate": 1.9997913411413223e-05, "loss": 0.0449, "step": 86590 }, { "epoch": 104.46348823174412, "grad_norm": 4.50127649307251, "learning_rate": 1.999791317015843e-05, "loss": 0.0451, "step": 86600 }, { "epoch": 104.47555823777913, "grad_norm": 4.5223493576049805, "learning_rate": 1.9997912928903635e-05, "loss": 0.0436, "step": 86610 }, { "epoch": 104.48762824381413, "grad_norm": 4.60680627822876, "learning_rate": 1.9997912687648838e-05, "loss": 0.0433, "step": 86620 }, { "epoch": 104.49969824984913, "grad_norm": 4.261621475219727, "learning_rate": 
1.9997912446394044e-05, "loss": 0.0445, "step": 86630 }, { "epoch": 104.51176825588412, "grad_norm": 4.280026912689209, "learning_rate": 1.999791220513925e-05, "loss": 0.0449, "step": 86640 }, { "epoch": 104.52383826191912, "grad_norm": 4.690097808837891, "learning_rate": 1.9997911963884456e-05, "loss": 0.0442, "step": 86650 }, { "epoch": 104.53590826795413, "grad_norm": 3.824739694595337, "learning_rate": 1.9997911722629663e-05, "loss": 0.0453, "step": 86660 }, { "epoch": 104.54797827398913, "grad_norm": 4.468076229095459, "learning_rate": 1.999791148137487e-05, "loss": 0.046, "step": 86670 }, { "epoch": 104.56004828002413, "grad_norm": 4.482842445373535, "learning_rate": 1.9997911240120075e-05, "loss": 0.0455, "step": 86680 }, { "epoch": 104.57211828605914, "grad_norm": 4.504620552062988, "learning_rate": 1.999791099886528e-05, "loss": 0.0441, "step": 86690 }, { "epoch": 104.58418829209414, "grad_norm": 4.687952518463135, "learning_rate": 1.9997910757610488e-05, "loss": 0.0433, "step": 86700 }, { "epoch": 104.59625829812914, "grad_norm": 4.406147480010986, "learning_rate": 1.9997910516355694e-05, "loss": 0.0444, "step": 86710 }, { "epoch": 104.60832830416415, "grad_norm": 4.1668596267700195, "learning_rate": 1.99979102751009e-05, "loss": 0.0456, "step": 86720 }, { "epoch": 104.62039831019915, "grad_norm": 4.7675676345825195, "learning_rate": 1.9997910033846106e-05, "loss": 0.0446, "step": 86730 }, { "epoch": 104.63246831623415, "grad_norm": 4.809981822967529, "learning_rate": 1.9997909792591312e-05, "loss": 0.0451, "step": 86740 }, { "epoch": 104.64453832226916, "grad_norm": 4.8199872970581055, "learning_rate": 1.999790955133652e-05, "loss": 0.0465, "step": 86750 }, { "epoch": 104.65660832830416, "grad_norm": 4.068480014801025, "learning_rate": 1.9997909310081725e-05, "loss": 0.0442, "step": 86760 }, { "epoch": 104.66867833433916, "grad_norm": 4.840579032897949, "learning_rate": 1.999790906882693e-05, "loss": 0.0453, "step": 86770 }, { "epoch": 
104.68074834037417, "grad_norm": 4.4218854904174805, "learning_rate": 1.9997908827572137e-05, "loss": 0.0447, "step": 86780 }, { "epoch": 104.69281834640917, "grad_norm": 4.9697065353393555, "learning_rate": 1.9997908586317343e-05, "loss": 0.0477, "step": 86790 }, { "epoch": 104.70488835244417, "grad_norm": 4.73408842086792, "learning_rate": 1.999790834506255e-05, "loss": 0.0473, "step": 86800 }, { "epoch": 104.71695835847918, "grad_norm": 4.596570014953613, "learning_rate": 1.9997908103807756e-05, "loss": 0.047, "step": 86810 }, { "epoch": 104.72902836451418, "grad_norm": 4.466053009033203, "learning_rate": 1.9997907862552962e-05, "loss": 0.0465, "step": 86820 }, { "epoch": 104.74109837054918, "grad_norm": 4.250204086303711, "learning_rate": 1.999790762129817e-05, "loss": 0.0439, "step": 86830 }, { "epoch": 104.75316837658418, "grad_norm": 4.7789387702941895, "learning_rate": 1.9997907380043375e-05, "loss": 0.0438, "step": 86840 }, { "epoch": 104.76523838261919, "grad_norm": 4.145794868469238, "learning_rate": 1.999790713878858e-05, "loss": 0.0459, "step": 86850 }, { "epoch": 104.77730838865419, "grad_norm": 4.969925880432129, "learning_rate": 1.9997906897533787e-05, "loss": 0.0482, "step": 86860 }, { "epoch": 104.7893783946892, "grad_norm": 4.321192264556885, "learning_rate": 1.9997906656278993e-05, "loss": 0.048, "step": 86870 }, { "epoch": 104.8014484007242, "grad_norm": 4.528863906860352, "learning_rate": 1.99979064150242e-05, "loss": 0.0474, "step": 86880 }, { "epoch": 104.8135184067592, "grad_norm": 4.586606025695801, "learning_rate": 1.9997906173769406e-05, "loss": 0.0464, "step": 86890 }, { "epoch": 104.8255884127942, "grad_norm": 3.9665448665618896, "learning_rate": 1.9997905932514612e-05, "loss": 0.0449, "step": 86900 }, { "epoch": 104.83765841882921, "grad_norm": 4.813577175140381, "learning_rate": 1.9997905691259818e-05, "loss": 0.0473, "step": 86910 }, { "epoch": 104.84972842486421, "grad_norm": 4.179074287414551, "learning_rate": 
1.9997905450005024e-05, "loss": 0.0475, "step": 86920 }, { "epoch": 104.86179843089921, "grad_norm": 4.8597564697265625, "learning_rate": 1.999790520875023e-05, "loss": 0.0455, "step": 86930 }, { "epoch": 104.87386843693422, "grad_norm": 4.8175883293151855, "learning_rate": 1.9997904967495437e-05, "loss": 0.0484, "step": 86940 }, { "epoch": 104.88593844296922, "grad_norm": 4.240504264831543, "learning_rate": 1.9997904726240643e-05, "loss": 0.047, "step": 86950 }, { "epoch": 104.89800844900422, "grad_norm": 4.989290714263916, "learning_rate": 1.999790448498585e-05, "loss": 0.0461, "step": 86960 }, { "epoch": 104.91007845503923, "grad_norm": 5.179777145385742, "learning_rate": 1.9997904243731055e-05, "loss": 0.05, "step": 86970 }, { "epoch": 104.92214846107423, "grad_norm": 4.837258338928223, "learning_rate": 1.999790400247626e-05, "loss": 0.0492, "step": 86980 }, { "epoch": 104.93421846710923, "grad_norm": 4.797628402709961, "learning_rate": 1.9997903761221468e-05, "loss": 0.0456, "step": 86990 }, { "epoch": 104.94628847314424, "grad_norm": 4.2213263511657715, "learning_rate": 1.9997903519966674e-05, "loss": 0.0477, "step": 87000 }, { "epoch": 104.94628847314424, "eval_loss": 12.920232772827148, "eval_runtime": 8.1248, "eval_samples_per_second": 85.787, "eval_steps_per_second": 10.831, "step": 87000 }, { "epoch": 104.95835847917924, "grad_norm": 4.608355522155762, "learning_rate": 1.999790327871188e-05, "loss": 0.0475, "step": 87010 }, { "epoch": 104.97042848521424, "grad_norm": 4.950706481933594, "learning_rate": 1.9997903037457086e-05, "loss": 0.0487, "step": 87020 }, { "epoch": 104.98249849124925, "grad_norm": 4.197350978851318, "learning_rate": 1.999790279620229e-05, "loss": 0.0494, "step": 87030 }, { "epoch": 104.99456849728425, "grad_norm": 4.991064071655273, "learning_rate": 1.9997902554947495e-05, "loss": 0.0483, "step": 87040 }, { "epoch": 105.0060350030175, "grad_norm": 3.922339677810669, "learning_rate": 1.99979023136927e-05, "loss": 0.0366, "step": 87050 
}, { "epoch": 105.0181050090525, "grad_norm": 4.1496076583862305, "learning_rate": 1.9997902072437908e-05, "loss": 0.0343, "step": 87060 }, { "epoch": 105.03017501508751, "grad_norm": 4.0756659507751465, "learning_rate": 1.9997901831183114e-05, "loss": 0.0372, "step": 87070 }, { "epoch": 105.04224502112251, "grad_norm": 3.8565924167633057, "learning_rate": 1.999790158992832e-05, "loss": 0.0338, "step": 87080 }, { "epoch": 105.05431502715751, "grad_norm": 4.355337142944336, "learning_rate": 1.9997901348673527e-05, "loss": 0.0345, "step": 87090 }, { "epoch": 105.06638503319252, "grad_norm": 3.898850440979004, "learning_rate": 1.9997901107418733e-05, "loss": 0.0356, "step": 87100 }, { "epoch": 105.07845503922752, "grad_norm": 4.2150373458862305, "learning_rate": 1.999790086616394e-05, "loss": 0.0368, "step": 87110 }, { "epoch": 105.09052504526252, "grad_norm": 4.3076934814453125, "learning_rate": 1.9997900624909145e-05, "loss": 0.0378, "step": 87120 }, { "epoch": 105.10259505129753, "grad_norm": 4.405864238739014, "learning_rate": 1.999790038365435e-05, "loss": 0.0369, "step": 87130 }, { "epoch": 105.11466505733253, "grad_norm": 3.6759872436523438, "learning_rate": 1.9997900142399558e-05, "loss": 0.0393, "step": 87140 }, { "epoch": 105.12673506336753, "grad_norm": 4.250824928283691, "learning_rate": 1.9997899901144764e-05, "loss": 0.0384, "step": 87150 }, { "epoch": 105.13880506940254, "grad_norm": 3.8869924545288086, "learning_rate": 1.999789965988997e-05, "loss": 0.038, "step": 87160 }, { "epoch": 105.15087507543754, "grad_norm": 3.7809247970581055, "learning_rate": 1.9997899418635176e-05, "loss": 0.039, "step": 87170 }, { "epoch": 105.16294508147254, "grad_norm": 4.391888618469238, "learning_rate": 1.9997899177380382e-05, "loss": 0.039, "step": 87180 }, { "epoch": 105.17501508750755, "grad_norm": 4.493074893951416, "learning_rate": 1.999789893612559e-05, "loss": 0.0388, "step": 87190 }, { "epoch": 105.18708509354255, "grad_norm": 4.702561378479004, "learning_rate": 
1.9997898694870795e-05, "loss": 0.0397, "step": 87200 }, { "epoch": 105.19915509957755, "grad_norm": 4.075718879699707, "learning_rate": 1.9997898453616e-05, "loss": 0.0424, "step": 87210 }, { "epoch": 105.21122510561256, "grad_norm": 4.415874004364014, "learning_rate": 1.9997898212361207e-05, "loss": 0.0388, "step": 87220 }, { "epoch": 105.22329511164756, "grad_norm": 4.6335930824279785, "learning_rate": 1.9997897971106414e-05, "loss": 0.0404, "step": 87230 }, { "epoch": 105.23536511768256, "grad_norm": 4.267516136169434, "learning_rate": 1.999789772985162e-05, "loss": 0.0416, "step": 87240 }, { "epoch": 105.24743512371757, "grad_norm": 4.2526350021362305, "learning_rate": 1.9997897488596826e-05, "loss": 0.0411, "step": 87250 }, { "epoch": 105.25950512975257, "grad_norm": 4.251952648162842, "learning_rate": 1.9997897247342032e-05, "loss": 0.041, "step": 87260 }, { "epoch": 105.27157513578757, "grad_norm": 4.391141414642334, "learning_rate": 1.999789700608724e-05, "loss": 0.0437, "step": 87270 }, { "epoch": 105.28364514182257, "grad_norm": 4.1984944343566895, "learning_rate": 1.999789676483244e-05, "loss": 0.0428, "step": 87280 }, { "epoch": 105.29571514785758, "grad_norm": 3.8432247638702393, "learning_rate": 1.9997896523577647e-05, "loss": 0.0414, "step": 87290 }, { "epoch": 105.30778515389258, "grad_norm": 4.194119453430176, "learning_rate": 1.9997896282322854e-05, "loss": 0.0423, "step": 87300 }, { "epoch": 105.31985515992758, "grad_norm": 4.546967029571533, "learning_rate": 1.999789604106806e-05, "loss": 0.044, "step": 87310 }, { "epoch": 105.33192516596259, "grad_norm": 4.760119438171387, "learning_rate": 1.9997895799813266e-05, "loss": 0.0412, "step": 87320 }, { "epoch": 105.34399517199759, "grad_norm": 4.2833099365234375, "learning_rate": 1.9997895558558472e-05, "loss": 0.0442, "step": 87330 }, { "epoch": 105.3560651780326, "grad_norm": 4.444070339202881, "learning_rate": 1.999789531730368e-05, "loss": 0.0448, "step": 87340 }, { "epoch": 105.3681351840676, 
"grad_norm": 4.352147102355957, "learning_rate": 1.9997895076048888e-05, "loss": 0.0444, "step": 87350 }, { "epoch": 105.3802051901026, "grad_norm": 4.179216384887695, "learning_rate": 1.9997894834794094e-05, "loss": 0.0425, "step": 87360 }, { "epoch": 105.3922751961376, "grad_norm": 4.413384914398193, "learning_rate": 1.99978945935393e-05, "loss": 0.0416, "step": 87370 }, { "epoch": 105.4043452021726, "grad_norm": 4.067813396453857, "learning_rate": 1.9997894352284507e-05, "loss": 0.0431, "step": 87380 }, { "epoch": 105.41641520820761, "grad_norm": 4.37706184387207, "learning_rate": 1.9997894111029713e-05, "loss": 0.041, "step": 87390 }, { "epoch": 105.42848521424261, "grad_norm": 3.8439552783966064, "learning_rate": 1.999789386977492e-05, "loss": 0.0418, "step": 87400 }, { "epoch": 105.44055522027762, "grad_norm": 4.276782035827637, "learning_rate": 1.9997893628520125e-05, "loss": 0.0433, "step": 87410 }, { "epoch": 105.45262522631262, "grad_norm": 4.448055744171143, "learning_rate": 1.999789338726533e-05, "loss": 0.0421, "step": 87420 }, { "epoch": 105.46469523234762, "grad_norm": 4.312745571136475, "learning_rate": 1.9997893146010538e-05, "loss": 0.043, "step": 87430 }, { "epoch": 105.47676523838263, "grad_norm": 4.43338680267334, "learning_rate": 1.9997892904755744e-05, "loss": 0.0441, "step": 87440 }, { "epoch": 105.48883524441763, "grad_norm": 4.643151760101318, "learning_rate": 1.9997892663500947e-05, "loss": 0.0462, "step": 87450 }, { "epoch": 105.50090525045263, "grad_norm": 4.515411376953125, "learning_rate": 1.9997892422246153e-05, "loss": 0.0473, "step": 87460 }, { "epoch": 105.51297525648762, "grad_norm": 4.622702598571777, "learning_rate": 1.999789218099136e-05, "loss": 0.044, "step": 87470 }, { "epoch": 105.52504526252262, "grad_norm": 3.996015787124634, "learning_rate": 1.9997891939736566e-05, "loss": 0.0424, "step": 87480 }, { "epoch": 105.53711526855763, "grad_norm": 4.985274314880371, "learning_rate": 1.9997891698481772e-05, "loss": 0.0437, 
"step": 87490 }, { "epoch": 105.54918527459263, "grad_norm": 4.197848796844482, "learning_rate": 1.9997891457226978e-05, "loss": 0.0455, "step": 87500 }, { "epoch": 105.54918527459263, "eval_loss": 12.928866386413574, "eval_runtime": 8.1307, "eval_samples_per_second": 85.725, "eval_steps_per_second": 10.823, "step": 87500 }, { "epoch": 105.56125528062763, "grad_norm": 4.044055461883545, "learning_rate": 1.9997891215972184e-05, "loss": 0.0442, "step": 87510 }, { "epoch": 105.57332528666264, "grad_norm": 4.0853495597839355, "learning_rate": 1.999789097471739e-05, "loss": 0.0458, "step": 87520 }, { "epoch": 105.58539529269764, "grad_norm": 4.6887335777282715, "learning_rate": 1.9997890733462597e-05, "loss": 0.0429, "step": 87530 }, { "epoch": 105.59746529873264, "grad_norm": 4.346151828765869, "learning_rate": 1.9997890492207803e-05, "loss": 0.0433, "step": 87540 }, { "epoch": 105.60953530476765, "grad_norm": 4.316634654998779, "learning_rate": 1.999789025095301e-05, "loss": 0.0464, "step": 87550 }, { "epoch": 105.62160531080265, "grad_norm": 4.696919918060303, "learning_rate": 1.9997890009698215e-05, "loss": 0.0455, "step": 87560 }, { "epoch": 105.63367531683765, "grad_norm": 4.957278728485107, "learning_rate": 1.999788976844342e-05, "loss": 0.0466, "step": 87570 }, { "epoch": 105.64574532287266, "grad_norm": 4.782529354095459, "learning_rate": 1.9997889527188628e-05, "loss": 0.0431, "step": 87580 }, { "epoch": 105.65781532890766, "grad_norm": 3.897632360458374, "learning_rate": 1.9997889285933834e-05, "loss": 0.0434, "step": 87590 }, { "epoch": 105.66988533494266, "grad_norm": 4.675207138061523, "learning_rate": 1.999788904467904e-05, "loss": 0.045, "step": 87600 }, { "epoch": 105.68195534097767, "grad_norm": 4.57510232925415, "learning_rate": 1.9997888803424246e-05, "loss": 0.0456, "step": 87610 }, { "epoch": 105.69402534701267, "grad_norm": 4.067044258117676, "learning_rate": 1.9997888562169453e-05, "loss": 0.0445, "step": 87620 }, { "epoch": 105.70609535304767, 
"grad_norm": 4.659963607788086, "learning_rate": 1.999788832091466e-05, "loss": 0.045, "step": 87630 }, { "epoch": 105.71816535908268, "grad_norm": 4.65509033203125, "learning_rate": 1.9997888079659865e-05, "loss": 0.0445, "step": 87640 }, { "epoch": 105.73023536511768, "grad_norm": 4.588881492614746, "learning_rate": 1.999788783840507e-05, "loss": 0.0434, "step": 87650 }, { "epoch": 105.74230537115268, "grad_norm": 4.35097599029541, "learning_rate": 1.9997887597150277e-05, "loss": 0.0459, "step": 87660 }, { "epoch": 105.75437537718769, "grad_norm": 4.353887557983398, "learning_rate": 1.9997887355895484e-05, "loss": 0.046, "step": 87670 }, { "epoch": 105.76644538322269, "grad_norm": 4.251129150390625, "learning_rate": 1.999788711464069e-05, "loss": 0.0448, "step": 87680 }, { "epoch": 105.77851538925769, "grad_norm": 5.034412384033203, "learning_rate": 1.9997886873385896e-05, "loss": 0.0472, "step": 87690 }, { "epoch": 105.7905853952927, "grad_norm": 4.362419128417969, "learning_rate": 1.99978866321311e-05, "loss": 0.0473, "step": 87700 }, { "epoch": 105.8026554013277, "grad_norm": 4.338891983032227, "learning_rate": 1.9997886390876305e-05, "loss": 0.0467, "step": 87710 }, { "epoch": 105.8147254073627, "grad_norm": 4.337541580200195, "learning_rate": 1.999788614962151e-05, "loss": 0.0465, "step": 87720 }, { "epoch": 105.8267954133977, "grad_norm": 4.652832984924316, "learning_rate": 1.9997885908366718e-05, "loss": 0.0473, "step": 87730 }, { "epoch": 105.83886541943271, "grad_norm": 4.736522197723389, "learning_rate": 1.9997885667111924e-05, "loss": 0.045, "step": 87740 }, { "epoch": 105.85093542546771, "grad_norm": 4.536698818206787, "learning_rate": 1.999788542585713e-05, "loss": 0.0475, "step": 87750 }, { "epoch": 105.86300543150271, "grad_norm": 4.330263614654541, "learning_rate": 1.9997885184602336e-05, "loss": 0.045, "step": 87760 }, { "epoch": 105.87507543753772, "grad_norm": 4.335445404052734, "learning_rate": 1.9997884943347542e-05, "loss": 0.0463, "step": 
87770 }, { "epoch": 105.88714544357272, "grad_norm": 4.60622501373291, "learning_rate": 1.999788470209275e-05, "loss": 0.0458, "step": 87780 }, { "epoch": 105.89921544960772, "grad_norm": 4.782384872436523, "learning_rate": 1.9997884460837955e-05, "loss": 0.0491, "step": 87790 }, { "epoch": 105.91128545564273, "grad_norm": 4.526014804840088, "learning_rate": 1.999788421958316e-05, "loss": 0.051, "step": 87800 }, { "epoch": 105.92335546167773, "grad_norm": 4.9416422843933105, "learning_rate": 1.9997883978328367e-05, "loss": 0.048, "step": 87810 }, { "epoch": 105.93542546771273, "grad_norm": 4.398306846618652, "learning_rate": 1.9997883737073573e-05, "loss": 0.0458, "step": 87820 }, { "epoch": 105.94749547374774, "grad_norm": 4.586962699890137, "learning_rate": 1.999788349581878e-05, "loss": 0.047, "step": 87830 }, { "epoch": 105.95956547978274, "grad_norm": 4.827605247497559, "learning_rate": 1.9997883254563986e-05, "loss": 0.0464, "step": 87840 }, { "epoch": 105.97163548581774, "grad_norm": 4.936075210571289, "learning_rate": 1.9997883013309192e-05, "loss": 0.0473, "step": 87850 }, { "epoch": 105.98370549185275, "grad_norm": 4.640016078948975, "learning_rate": 1.99978827720544e-05, "loss": 0.0492, "step": 87860 }, { "epoch": 105.99577549788775, "grad_norm": 4.312194347381592, "learning_rate": 1.9997882530799605e-05, "loss": 0.0471, "step": 87870 }, { "epoch": 106.007242003621, "grad_norm": 3.4543840885162354, "learning_rate": 1.999788228954481e-05, "loss": 0.0342, "step": 87880 }, { "epoch": 106.019312009656, "grad_norm": 3.4603610038757324, "learning_rate": 1.9997882048290017e-05, "loss": 0.0307, "step": 87890 }, { "epoch": 106.03138201569101, "grad_norm": 3.682720899581909, "learning_rate": 1.9997881807035223e-05, "loss": 0.0331, "step": 87900 }, { "epoch": 106.04345202172601, "grad_norm": 3.941765069961548, "learning_rate": 1.999788156578043e-05, "loss": 0.0353, "step": 87910 }, { "epoch": 106.05552202776101, "grad_norm": 3.73152494430542, "learning_rate": 
1.9997881324525636e-05, "loss": 0.0341, "step": 87920 }, { "epoch": 106.06759203379602, "grad_norm": 4.131659507751465, "learning_rate": 1.9997881083270842e-05, "loss": 0.0364, "step": 87930 }, { "epoch": 106.07966203983102, "grad_norm": 3.6165473461151123, "learning_rate": 1.9997880842016048e-05, "loss": 0.0365, "step": 87940 }, { "epoch": 106.09173204586602, "grad_norm": 3.76814866065979, "learning_rate": 1.9997880600761254e-05, "loss": 0.0383, "step": 87950 }, { "epoch": 106.10380205190103, "grad_norm": 3.9289603233337402, "learning_rate": 1.999788035950646e-05, "loss": 0.0392, "step": 87960 }, { "epoch": 106.11587205793603, "grad_norm": 4.4853386878967285, "learning_rate": 1.9997880118251667e-05, "loss": 0.0358, "step": 87970 }, { "epoch": 106.12794206397103, "grad_norm": 4.044495582580566, "learning_rate": 1.9997879876996873e-05, "loss": 0.0394, "step": 87980 }, { "epoch": 106.14001207000604, "grad_norm": 4.319424152374268, "learning_rate": 1.999787963574208e-05, "loss": 0.0423, "step": 87990 }, { "epoch": 106.15208207604104, "grad_norm": 3.6317262649536133, "learning_rate": 1.9997879394487285e-05, "loss": 0.0403, "step": 88000 }, { "epoch": 106.15208207604104, "eval_loss": 12.930998802185059, "eval_runtime": 8.1382, "eval_samples_per_second": 85.645, "eval_steps_per_second": 10.813, "step": 88000 }, { "epoch": 106.16415208207604, "grad_norm": 4.4718708992004395, "learning_rate": 1.999787915323249e-05, "loss": 0.0405, "step": 88010 }, { "epoch": 106.17622208811105, "grad_norm": 3.9985947608947754, "learning_rate": 1.9997878911977698e-05, "loss": 0.0371, "step": 88020 }, { "epoch": 106.18829209414605, "grad_norm": 4.231529712677002, "learning_rate": 1.9997878670722904e-05, "loss": 0.0402, "step": 88030 }, { "epoch": 106.20036210018105, "grad_norm": 4.363927841186523, "learning_rate": 1.999787842946811e-05, "loss": 0.0419, "step": 88040 }, { "epoch": 106.21243210621606, "grad_norm": 4.438586711883545, "learning_rate": 1.9997878188213316e-05, "loss": 0.0407, 
"step": 88050 }, { "epoch": 106.22450211225106, "grad_norm": 4.097319602966309, "learning_rate": 1.9997877946958523e-05, "loss": 0.0398, "step": 88060 }, { "epoch": 106.23657211828606, "grad_norm": 4.155527114868164, "learning_rate": 1.999787770570373e-05, "loss": 0.0421, "step": 88070 }, { "epoch": 106.24864212432107, "grad_norm": 4.166582107543945, "learning_rate": 1.9997877464448935e-05, "loss": 0.0411, "step": 88080 }, { "epoch": 106.26071213035607, "grad_norm": 4.409484386444092, "learning_rate": 1.999787722319414e-05, "loss": 0.0398, "step": 88090 }, { "epoch": 106.27278213639107, "grad_norm": 3.9998741149902344, "learning_rate": 1.9997876981939348e-05, "loss": 0.0401, "step": 88100 }, { "epoch": 106.28485214242608, "grad_norm": 4.435238838195801, "learning_rate": 1.999787674068455e-05, "loss": 0.0426, "step": 88110 }, { "epoch": 106.29692214846108, "grad_norm": 4.562404632568359, "learning_rate": 1.9997876499429757e-05, "loss": 0.0417, "step": 88120 }, { "epoch": 106.30899215449608, "grad_norm": 3.8915207386016846, "learning_rate": 1.9997876258174963e-05, "loss": 0.041, "step": 88130 }, { "epoch": 106.32106216053108, "grad_norm": 4.025484085083008, "learning_rate": 1.999787601692017e-05, "loss": 0.0371, "step": 88140 }, { "epoch": 106.33313216656609, "grad_norm": 4.360507488250732, "learning_rate": 1.9997875775665375e-05, "loss": 0.0394, "step": 88150 }, { "epoch": 106.34520217260109, "grad_norm": 3.9263851642608643, "learning_rate": 1.999787553441058e-05, "loss": 0.0429, "step": 88160 }, { "epoch": 106.3572721786361, "grad_norm": 4.0221452713012695, "learning_rate": 1.9997875293155788e-05, "loss": 0.0413, "step": 88170 }, { "epoch": 106.3693421846711, "grad_norm": 4.474569797515869, "learning_rate": 1.9997875051900994e-05, "loss": 0.0402, "step": 88180 }, { "epoch": 106.3814121907061, "grad_norm": 4.252552032470703, "learning_rate": 1.99978748106462e-05, "loss": 0.0413, "step": 88190 }, { "epoch": 106.3934821967411, "grad_norm": 4.2813544273376465, 
"learning_rate": 1.9997874569391406e-05, "loss": 0.0417, "step": 88200 }, { "epoch": 106.40555220277611, "grad_norm": 4.4662394523620605, "learning_rate": 1.9997874328136612e-05, "loss": 0.0413, "step": 88210 }, { "epoch": 106.41762220881111, "grad_norm": 4.383070945739746, "learning_rate": 1.999787408688182e-05, "loss": 0.0433, "step": 88220 }, { "epoch": 106.42969221484611, "grad_norm": 4.148296356201172, "learning_rate": 1.9997873845627025e-05, "loss": 0.0419, "step": 88230 }, { "epoch": 106.44176222088112, "grad_norm": 4.460903644561768, "learning_rate": 1.999787360437223e-05, "loss": 0.0426, "step": 88240 }, { "epoch": 106.45383222691612, "grad_norm": 4.98909330368042, "learning_rate": 1.9997873363117437e-05, "loss": 0.0439, "step": 88250 }, { "epoch": 106.46590223295112, "grad_norm": 4.269433975219727, "learning_rate": 1.9997873121862644e-05, "loss": 0.0433, "step": 88260 }, { "epoch": 106.47797223898613, "grad_norm": 4.627402305603027, "learning_rate": 1.999787288060785e-05, "loss": 0.0435, "step": 88270 }, { "epoch": 106.49004224502113, "grad_norm": 4.156811237335205, "learning_rate": 1.9997872639353056e-05, "loss": 0.0434, "step": 88280 }, { "epoch": 106.50211225105613, "grad_norm": 4.46370267868042, "learning_rate": 1.9997872398098262e-05, "loss": 0.0429, "step": 88290 }, { "epoch": 106.51418225709112, "grad_norm": 4.51674747467041, "learning_rate": 1.999787215684347e-05, "loss": 0.0442, "step": 88300 }, { "epoch": 106.52625226312612, "grad_norm": 4.362595558166504, "learning_rate": 1.9997871915588675e-05, "loss": 0.0427, "step": 88310 }, { "epoch": 106.53832226916113, "grad_norm": 4.628920078277588, "learning_rate": 1.999787167433388e-05, "loss": 0.0433, "step": 88320 }, { "epoch": 106.55039227519613, "grad_norm": 4.656781196594238, "learning_rate": 1.9997871433079087e-05, "loss": 0.0431, "step": 88330 }, { "epoch": 106.56246228123113, "grad_norm": 4.503579616546631, "learning_rate": 1.9997871191824293e-05, "loss": 0.0453, "step": 88340 }, { "epoch": 
106.57453228726614, "grad_norm": 4.996925354003906, "learning_rate": 1.99978709505695e-05, "loss": 0.0435, "step": 88350 }, { "epoch": 106.58660229330114, "grad_norm": 4.497054100036621, "learning_rate": 1.9997870709314702e-05, "loss": 0.0452, "step": 88360 }, { "epoch": 106.59867229933614, "grad_norm": 4.149116516113281, "learning_rate": 1.999787046805991e-05, "loss": 0.0458, "step": 88370 }, { "epoch": 106.61074230537115, "grad_norm": 4.497748374938965, "learning_rate": 1.9997870226805115e-05, "loss": 0.046, "step": 88380 }, { "epoch": 106.62281231140615, "grad_norm": 4.483869552612305, "learning_rate": 1.999786998555032e-05, "loss": 0.0453, "step": 88390 }, { "epoch": 106.63488231744115, "grad_norm": 4.51910400390625, "learning_rate": 1.9997869744295527e-05, "loss": 0.0462, "step": 88400 }, { "epoch": 106.64695232347616, "grad_norm": 4.8539347648620605, "learning_rate": 1.9997869503040733e-05, "loss": 0.0457, "step": 88410 }, { "epoch": 106.65902232951116, "grad_norm": 4.577709197998047, "learning_rate": 1.999786926178594e-05, "loss": 0.0452, "step": 88420 }, { "epoch": 106.67109233554616, "grad_norm": 4.3441033363342285, "learning_rate": 1.999786902053115e-05, "loss": 0.0447, "step": 88430 }, { "epoch": 106.68316234158117, "grad_norm": 4.663232326507568, "learning_rate": 1.9997868779276355e-05, "loss": 0.0438, "step": 88440 }, { "epoch": 106.69523234761617, "grad_norm": 4.274661064147949, "learning_rate": 1.999786853802156e-05, "loss": 0.0452, "step": 88450 }, { "epoch": 106.70730235365117, "grad_norm": 4.963343620300293, "learning_rate": 1.9997868296766768e-05, "loss": 0.0467, "step": 88460 }, { "epoch": 106.71937235968618, "grad_norm": 4.56907844543457, "learning_rate": 1.9997868055511974e-05, "loss": 0.0446, "step": 88470 }, { "epoch": 106.73144236572118, "grad_norm": 4.437971115112305, "learning_rate": 1.999786781425718e-05, "loss": 0.0457, "step": 88480 }, { "epoch": 106.74351237175618, "grad_norm": 4.28140926361084, "learning_rate": 
1.9997867573002387e-05, "loss": 0.0465, "step": 88490 }, { "epoch": 106.75558237779119, "grad_norm": 4.437565326690674, "learning_rate": 1.9997867331747593e-05, "loss": 0.046, "step": 88500 }, { "epoch": 106.75558237779119, "eval_loss": 12.9603853225708, "eval_runtime": 8.1316, "eval_samples_per_second": 85.715, "eval_steps_per_second": 10.822, "step": 88500 }, { "epoch": 106.76765238382619, "grad_norm": 4.76531982421875, "learning_rate": 1.99978670904928e-05, "loss": 0.0469, "step": 88510 }, { "epoch": 106.77972238986119, "grad_norm": 4.870534896850586, "learning_rate": 1.9997866849238005e-05, "loss": 0.0474, "step": 88520 }, { "epoch": 106.7917923958962, "grad_norm": 4.9446234703063965, "learning_rate": 1.9997866607983208e-05, "loss": 0.0444, "step": 88530 }, { "epoch": 106.8038624019312, "grad_norm": 4.626737117767334, "learning_rate": 1.9997866366728414e-05, "loss": 0.048, "step": 88540 }, { "epoch": 106.8159324079662, "grad_norm": 4.687595844268799, "learning_rate": 1.999786612547362e-05, "loss": 0.048, "step": 88550 }, { "epoch": 106.8280024140012, "grad_norm": 4.673398494720459, "learning_rate": 1.9997865884218827e-05, "loss": 0.0448, "step": 88560 }, { "epoch": 106.84007242003621, "grad_norm": 4.936649799346924, "learning_rate": 1.9997865642964033e-05, "loss": 0.0465, "step": 88570 }, { "epoch": 106.85214242607121, "grad_norm": 4.592889308929443, "learning_rate": 1.999786540170924e-05, "loss": 0.0469, "step": 88580 }, { "epoch": 106.86421243210621, "grad_norm": 4.317801475524902, "learning_rate": 1.9997865160454445e-05, "loss": 0.0466, "step": 88590 }, { "epoch": 106.87628243814122, "grad_norm": 4.5152482986450195, "learning_rate": 1.999786491919965e-05, "loss": 0.0437, "step": 88600 }, { "epoch": 106.88835244417622, "grad_norm": 4.449270725250244, "learning_rate": 1.9997864677944858e-05, "loss": 0.0454, "step": 88610 }, { "epoch": 106.90042245021122, "grad_norm": 4.461658954620361, "learning_rate": 1.9997864436690064e-05, "loss": 0.0468, "step": 88620 }, { 
"epoch": 106.91249245624623, "grad_norm": 4.879205703735352, "learning_rate": 1.999786419543527e-05, "loss": 0.0488, "step": 88630 }, { "epoch": 106.92456246228123, "grad_norm": 4.770890712738037, "learning_rate": 1.9997863954180476e-05, "loss": 0.0495, "step": 88640 }, { "epoch": 106.93663246831623, "grad_norm": 5.126278877258301, "learning_rate": 1.9997863712925683e-05, "loss": 0.0497, "step": 88650 }, { "epoch": 106.94870247435124, "grad_norm": 4.250522136688232, "learning_rate": 1.999786347167089e-05, "loss": 0.0472, "step": 88660 }, { "epoch": 106.96077248038624, "grad_norm": 4.166049003601074, "learning_rate": 1.9997863230416095e-05, "loss": 0.0494, "step": 88670 }, { "epoch": 106.97284248642124, "grad_norm": 4.3283610343933105, "learning_rate": 1.99978629891613e-05, "loss": 0.0463, "step": 88680 }, { "epoch": 106.98491249245625, "grad_norm": 4.967171669006348, "learning_rate": 1.9997862747906507e-05, "loss": 0.0468, "step": 88690 }, { "epoch": 106.99698249849125, "grad_norm": 4.60133695602417, "learning_rate": 1.9997862506651714e-05, "loss": 0.0491, "step": 88700 }, { "epoch": 107.0084490042245, "grad_norm": 3.750814914703369, "learning_rate": 1.999786226539692e-05, "loss": 0.0355, "step": 88710 }, { "epoch": 107.0205190102595, "grad_norm": 3.6492578983306885, "learning_rate": 1.9997862024142126e-05, "loss": 0.0327, "step": 88720 }, { "epoch": 107.03258901629451, "grad_norm": 3.817441940307617, "learning_rate": 1.9997861782887332e-05, "loss": 0.0329, "step": 88730 }, { "epoch": 107.04465902232951, "grad_norm": 3.8220131397247314, "learning_rate": 1.999786154163254e-05, "loss": 0.0365, "step": 88740 }, { "epoch": 107.05672902836451, "grad_norm": 4.058883190155029, "learning_rate": 1.9997861300377745e-05, "loss": 0.0368, "step": 88750 }, { "epoch": 107.06879903439952, "grad_norm": 4.247000694274902, "learning_rate": 1.999786105912295e-05, "loss": 0.036, "step": 88760 }, { "epoch": 107.08086904043452, "grad_norm": 4.278661727905273, "learning_rate": 
1.9997860817868157e-05, "loss": 0.0366, "step": 88770 }, { "epoch": 107.09293904646952, "grad_norm": 4.545688629150391, "learning_rate": 1.999786057661336e-05, "loss": 0.039, "step": 88780 }, { "epoch": 107.10500905250453, "grad_norm": 3.9485578536987305, "learning_rate": 1.9997860335358566e-05, "loss": 0.0401, "step": 88790 }, { "epoch": 107.11707905853953, "grad_norm": 4.309386730194092, "learning_rate": 1.9997860094103772e-05, "loss": 0.0376, "step": 88800 }, { "epoch": 107.12914906457453, "grad_norm": 4.203876495361328, "learning_rate": 1.999785985284898e-05, "loss": 0.0345, "step": 88810 }, { "epoch": 107.14121907060954, "grad_norm": 3.8120968341827393, "learning_rate": 1.9997859611594185e-05, "loss": 0.039, "step": 88820 }, { "epoch": 107.15328907664454, "grad_norm": 4.185163974761963, "learning_rate": 1.999785937033939e-05, "loss": 0.0383, "step": 88830 }, { "epoch": 107.16535908267954, "grad_norm": 4.04659366607666, "learning_rate": 1.9997859129084597e-05, "loss": 0.0388, "step": 88840 }, { "epoch": 107.17742908871455, "grad_norm": 3.895853042602539, "learning_rate": 1.9997858887829803e-05, "loss": 0.0381, "step": 88850 }, { "epoch": 107.18949909474955, "grad_norm": 4.452259063720703, "learning_rate": 1.999785864657501e-05, "loss": 0.0413, "step": 88860 }, { "epoch": 107.20156910078455, "grad_norm": 4.220227241516113, "learning_rate": 1.9997858405320216e-05, "loss": 0.0396, "step": 88870 }, { "epoch": 107.21363910681956, "grad_norm": 3.9677417278289795, "learning_rate": 1.9997858164065422e-05, "loss": 0.039, "step": 88880 }, { "epoch": 107.22570911285456, "grad_norm": 4.0176262855529785, "learning_rate": 1.999785792281063e-05, "loss": 0.0388, "step": 88890 }, { "epoch": 107.23777911888956, "grad_norm": 3.987696409225464, "learning_rate": 1.9997857681555835e-05, "loss": 0.0397, "step": 88900 }, { "epoch": 107.24984912492457, "grad_norm": 3.9638001918792725, "learning_rate": 1.999785744030104e-05, "loss": 0.0388, "step": 88910 }, { "epoch": 
107.26191913095957, "grad_norm": 4.325412750244141, "learning_rate": 1.9997857199046247e-05, "loss": 0.041, "step": 88920 }, { "epoch": 107.27398913699457, "grad_norm": 4.39304256439209, "learning_rate": 1.9997856957791453e-05, "loss": 0.0408, "step": 88930 }, { "epoch": 107.28605914302958, "grad_norm": 4.445115089416504, "learning_rate": 1.999785671653666e-05, "loss": 0.0417, "step": 88940 }, { "epoch": 107.29812914906458, "grad_norm": 4.215365886688232, "learning_rate": 1.9997856475281866e-05, "loss": 0.0436, "step": 88950 }, { "epoch": 107.31019915509958, "grad_norm": 4.160392761230469, "learning_rate": 1.9997856234027072e-05, "loss": 0.0424, "step": 88960 }, { "epoch": 107.32226916113459, "grad_norm": 4.405841827392578, "learning_rate": 1.9997855992772278e-05, "loss": 0.0421, "step": 88970 }, { "epoch": 107.33433916716959, "grad_norm": 4.424933910369873, "learning_rate": 1.9997855751517484e-05, "loss": 0.0403, "step": 88980 }, { "epoch": 107.34640917320459, "grad_norm": 3.8669583797454834, "learning_rate": 1.999785551026269e-05, "loss": 0.0395, "step": 88990 }, { "epoch": 107.3584791792396, "grad_norm": 4.268723487854004, "learning_rate": 1.9997855269007897e-05, "loss": 0.0403, "step": 89000 }, { "epoch": 107.3584791792396, "eval_loss": 12.951783180236816, "eval_runtime": 8.1285, "eval_samples_per_second": 85.748, "eval_steps_per_second": 10.826, "step": 89000 }, { "epoch": 107.3705491852746, "grad_norm": 4.310104846954346, "learning_rate": 1.9997855027753103e-05, "loss": 0.0401, "step": 89010 }, { "epoch": 107.3826191913096, "grad_norm": 4.359238147735596, "learning_rate": 1.999785478649831e-05, "loss": 0.0409, "step": 89020 }, { "epoch": 107.3946891973446, "grad_norm": 4.0494160652160645, "learning_rate": 1.9997854545243515e-05, "loss": 0.0421, "step": 89030 }, { "epoch": 107.40675920337961, "grad_norm": 4.533732891082764, "learning_rate": 1.999785430398872e-05, "loss": 0.044, "step": 89040 }, { "epoch": 107.41882920941461, "grad_norm": 4.343237400054932, 
"learning_rate": 1.9997854062733928e-05, "loss": 0.042, "step": 89050 }, { "epoch": 107.43089921544961, "grad_norm": 4.162615776062012, "learning_rate": 1.9997853821479134e-05, "loss": 0.0428, "step": 89060 }, { "epoch": 107.44296922148462, "grad_norm": 4.338191986083984, "learning_rate": 1.999785358022434e-05, "loss": 0.0435, "step": 89070 }, { "epoch": 107.45503922751962, "grad_norm": 4.015114784240723, "learning_rate": 1.9997853338969546e-05, "loss": 0.0432, "step": 89080 }, { "epoch": 107.46710923355462, "grad_norm": 4.385900497436523, "learning_rate": 1.9997853097714753e-05, "loss": 0.0434, "step": 89090 }, { "epoch": 107.47917923958963, "grad_norm": 5.137434005737305, "learning_rate": 1.999785285645996e-05, "loss": 0.0435, "step": 89100 }, { "epoch": 107.49124924562463, "grad_norm": 4.7707319259643555, "learning_rate": 1.9997852615205165e-05, "loss": 0.0433, "step": 89110 }, { "epoch": 107.50331925165963, "grad_norm": 4.239076137542725, "learning_rate": 1.999785237395037e-05, "loss": 0.0423, "step": 89120 }, { "epoch": 107.51538925769462, "grad_norm": 4.351833343505859, "learning_rate": 1.9997852132695578e-05, "loss": 0.0431, "step": 89130 }, { "epoch": 107.52745926372963, "grad_norm": 4.058877944946289, "learning_rate": 1.9997851891440784e-05, "loss": 0.0411, "step": 89140 }, { "epoch": 107.53952926976463, "grad_norm": 4.310504913330078, "learning_rate": 1.999785165018599e-05, "loss": 0.043, "step": 89150 }, { "epoch": 107.55159927579963, "grad_norm": 4.277721881866455, "learning_rate": 1.9997851408931196e-05, "loss": 0.042, "step": 89160 }, { "epoch": 107.56366928183463, "grad_norm": 3.6348183155059814, "learning_rate": 1.9997851167676402e-05, "loss": 0.0438, "step": 89170 }, { "epoch": 107.57573928786964, "grad_norm": 4.67029333114624, "learning_rate": 1.999785092642161e-05, "loss": 0.0435, "step": 89180 }, { "epoch": 107.58780929390464, "grad_norm": 4.1254096031188965, "learning_rate": 1.999785068516681e-05, "loss": 0.0429, "step": 89190 }, { "epoch": 
107.59987929993964, "grad_norm": 4.480268955230713, "learning_rate": 1.9997850443912018e-05, "loss": 0.043, "step": 89200 }, { "epoch": 107.61194930597465, "grad_norm": 4.180893421173096, "learning_rate": 1.9997850202657224e-05, "loss": 0.0418, "step": 89210 }, { "epoch": 107.62401931200965, "grad_norm": 4.461721897125244, "learning_rate": 1.999784996140243e-05, "loss": 0.0434, "step": 89220 }, { "epoch": 107.63608931804465, "grad_norm": 5.07706880569458, "learning_rate": 1.9997849720147636e-05, "loss": 0.0463, "step": 89230 }, { "epoch": 107.64815932407966, "grad_norm": 4.112742900848389, "learning_rate": 1.9997849478892843e-05, "loss": 0.044, "step": 89240 }, { "epoch": 107.66022933011466, "grad_norm": 4.343356132507324, "learning_rate": 1.999784923763805e-05, "loss": 0.0423, "step": 89250 }, { "epoch": 107.67229933614966, "grad_norm": 4.189510345458984, "learning_rate": 1.9997848996383255e-05, "loss": 0.0448, "step": 89260 }, { "epoch": 107.68436934218467, "grad_norm": 4.205492973327637, "learning_rate": 1.999784875512846e-05, "loss": 0.0451, "step": 89270 }, { "epoch": 107.69643934821967, "grad_norm": 4.9094414710998535, "learning_rate": 1.9997848513873667e-05, "loss": 0.0441, "step": 89280 }, { "epoch": 107.70850935425467, "grad_norm": 4.584593772888184, "learning_rate": 1.9997848272618874e-05, "loss": 0.0436, "step": 89290 }, { "epoch": 107.72057936028968, "grad_norm": 4.300698280334473, "learning_rate": 1.999784803136408e-05, "loss": 0.0451, "step": 89300 }, { "epoch": 107.73264936632468, "grad_norm": 4.818943500518799, "learning_rate": 1.9997847790109286e-05, "loss": 0.0458, "step": 89310 }, { "epoch": 107.74471937235968, "grad_norm": 4.000196933746338, "learning_rate": 1.9997847548854492e-05, "loss": 0.0431, "step": 89320 }, { "epoch": 107.75678937839469, "grad_norm": 4.7857136726379395, "learning_rate": 1.99978473075997e-05, "loss": 0.0457, "step": 89330 }, { "epoch": 107.76885938442969, "grad_norm": 4.112119674682617, "learning_rate": 
1.9997847066344905e-05, "loss": 0.0459, "step": 89340 }, { "epoch": 107.78092939046469, "grad_norm": 4.229408264160156, "learning_rate": 1.999784682509011e-05, "loss": 0.0444, "step": 89350 }, { "epoch": 107.7929993964997, "grad_norm": 4.136718273162842, "learning_rate": 1.9997846583835317e-05, "loss": 0.0457, "step": 89360 }, { "epoch": 107.8050694025347, "grad_norm": 4.653751850128174, "learning_rate": 1.9997846342580523e-05, "loss": 0.0443, "step": 89370 }, { "epoch": 107.8171394085697, "grad_norm": 5.412283420562744, "learning_rate": 1.999784610132573e-05, "loss": 0.0468, "step": 89380 }, { "epoch": 107.8292094146047, "grad_norm": 4.643008232116699, "learning_rate": 1.9997845860070936e-05, "loss": 0.0459, "step": 89390 }, { "epoch": 107.84127942063971, "grad_norm": 4.853544235229492, "learning_rate": 1.9997845618816142e-05, "loss": 0.0448, "step": 89400 }, { "epoch": 107.85334942667471, "grad_norm": 4.322865009307861, "learning_rate": 1.9997845377561348e-05, "loss": 0.0455, "step": 89410 }, { "epoch": 107.86541943270971, "grad_norm": 4.8105340003967285, "learning_rate": 1.9997845136306554e-05, "loss": 0.0461, "step": 89420 }, { "epoch": 107.87748943874472, "grad_norm": 4.252740383148193, "learning_rate": 1.999784489505176e-05, "loss": 0.0454, "step": 89430 }, { "epoch": 107.88955944477972, "grad_norm": 4.683557987213135, "learning_rate": 1.9997844653796963e-05, "loss": 0.0479, "step": 89440 }, { "epoch": 107.90162945081472, "grad_norm": 4.633675575256348, "learning_rate": 1.999784441254217e-05, "loss": 0.0481, "step": 89450 }, { "epoch": 107.91369945684973, "grad_norm": 5.331716060638428, "learning_rate": 1.9997844171287376e-05, "loss": 0.0462, "step": 89460 }, { "epoch": 107.92576946288473, "grad_norm": 4.539869785308838, "learning_rate": 1.9997843930032582e-05, "loss": 0.0466, "step": 89470 }, { "epoch": 107.93783946891973, "grad_norm": 4.905856132507324, "learning_rate": 1.9997843688777788e-05, "loss": 0.0485, "step": 89480 }, { "epoch": 107.94990947495474, 
"grad_norm": 4.726032733917236, "learning_rate": 1.9997843447522995e-05, "loss": 0.0467, "step": 89490 }, { "epoch": 107.96197948098974, "grad_norm": 4.609572410583496, "learning_rate": 1.99978432062682e-05, "loss": 0.0473, "step": 89500 }, { "epoch": 107.96197948098974, "eval_loss": 12.96074104309082, "eval_runtime": 8.1847, "eval_samples_per_second": 85.159, "eval_steps_per_second": 10.752, "step": 89500 }, { "epoch": 107.97404948702474, "grad_norm": 4.652061462402344, "learning_rate": 1.999784296501341e-05, "loss": 0.0481, "step": 89510 }, { "epoch": 107.98611949305975, "grad_norm": 4.469645023345947, "learning_rate": 1.9997842723758617e-05, "loss": 0.0461, "step": 89520 }, { "epoch": 107.99818949909475, "grad_norm": 4.185280799865723, "learning_rate": 1.9997842482503823e-05, "loss": 0.049, "step": 89530 }, { "epoch": 108.009656004828, "grad_norm": 3.8331799507141113, "learning_rate": 1.999784224124903e-05, "loss": 0.0328, "step": 89540 }, { "epoch": 108.021726010863, "grad_norm": 4.284505844116211, "learning_rate": 1.9997841999994235e-05, "loss": 0.0329, "step": 89550 }, { "epoch": 108.03379601689801, "grad_norm": 3.760679244995117, "learning_rate": 1.999784175873944e-05, "loss": 0.0343, "step": 89560 }, { "epoch": 108.04586602293301, "grad_norm": 3.731952428817749, "learning_rate": 1.9997841517484648e-05, "loss": 0.0346, "step": 89570 }, { "epoch": 108.05793602896802, "grad_norm": 4.094696044921875, "learning_rate": 1.9997841276229854e-05, "loss": 0.0355, "step": 89580 }, { "epoch": 108.07000603500302, "grad_norm": 3.881540298461914, "learning_rate": 1.999784103497506e-05, "loss": 0.0371, "step": 89590 }, { "epoch": 108.08207604103802, "grad_norm": 4.191500186920166, "learning_rate": 1.9997840793720263e-05, "loss": 0.0367, "step": 89600 }, { "epoch": 108.09414604707302, "grad_norm": 3.4789369106292725, "learning_rate": 1.999784055246547e-05, "loss": 0.038, "step": 89610 }, { "epoch": 108.10621605310803, "grad_norm": 4.292841911315918, "learning_rate": 
1.9997840311210675e-05, "loss": 0.0369, "step": 89620 }, { "epoch": 108.11828605914303, "grad_norm": 4.127758502960205, "learning_rate": 1.999784006995588e-05, "loss": 0.036, "step": 89630 }, { "epoch": 108.13035606517803, "grad_norm": 4.070250034332275, "learning_rate": 1.9997839828701088e-05, "loss": 0.0368, "step": 89640 }, { "epoch": 108.14242607121304, "grad_norm": 4.420553684234619, "learning_rate": 1.9997839587446294e-05, "loss": 0.0406, "step": 89650 }, { "epoch": 108.15449607724804, "grad_norm": 4.378361225128174, "learning_rate": 1.99978393461915e-05, "loss": 0.0394, "step": 89660 }, { "epoch": 108.16656608328304, "grad_norm": 3.790999174118042, "learning_rate": 1.9997839104936706e-05, "loss": 0.0375, "step": 89670 }, { "epoch": 108.17863608931805, "grad_norm": 4.267336845397949, "learning_rate": 1.9997838863681913e-05, "loss": 0.0361, "step": 89680 }, { "epoch": 108.19070609535305, "grad_norm": 4.077511787414551, "learning_rate": 1.999783862242712e-05, "loss": 0.0372, "step": 89690 }, { "epoch": 108.20277610138805, "grad_norm": 4.474502086639404, "learning_rate": 1.9997838381172325e-05, "loss": 0.0394, "step": 89700 }, { "epoch": 108.21484610742306, "grad_norm": 4.43627405166626, "learning_rate": 1.999783813991753e-05, "loss": 0.0371, "step": 89710 }, { "epoch": 108.22691611345806, "grad_norm": 4.245051383972168, "learning_rate": 1.9997837898662737e-05, "loss": 0.0399, "step": 89720 }, { "epoch": 108.23898611949306, "grad_norm": 4.435319423675537, "learning_rate": 1.9997837657407944e-05, "loss": 0.0375, "step": 89730 }, { "epoch": 108.25105612552807, "grad_norm": 4.818587779998779, "learning_rate": 1.999783741615315e-05, "loss": 0.0411, "step": 89740 }, { "epoch": 108.26312613156307, "grad_norm": 4.278494358062744, "learning_rate": 1.9997837174898356e-05, "loss": 0.0432, "step": 89750 }, { "epoch": 108.27519613759807, "grad_norm": 3.971649408340454, "learning_rate": 1.9997836933643562e-05, "loss": 0.0407, "step": 89760 }, { "epoch": 108.28726614363308, 
"grad_norm": 4.192323684692383, "learning_rate": 1.999783669238877e-05, "loss": 0.0399, "step": 89770 }, { "epoch": 108.29933614966808, "grad_norm": 4.621811389923096, "learning_rate": 1.9997836451133975e-05, "loss": 0.0418, "step": 89780 }, { "epoch": 108.31140615570308, "grad_norm": 4.181812763214111, "learning_rate": 1.999783620987918e-05, "loss": 0.0423, "step": 89790 }, { "epoch": 108.32347616173809, "grad_norm": 4.274931907653809, "learning_rate": 1.9997835968624387e-05, "loss": 0.0396, "step": 89800 }, { "epoch": 108.33554616777309, "grad_norm": 3.9714558124542236, "learning_rate": 1.9997835727369593e-05, "loss": 0.0387, "step": 89810 }, { "epoch": 108.34761617380809, "grad_norm": 4.702773571014404, "learning_rate": 1.99978354861148e-05, "loss": 0.0403, "step": 89820 }, { "epoch": 108.3596861798431, "grad_norm": 3.872037410736084, "learning_rate": 1.9997835244860006e-05, "loss": 0.0395, "step": 89830 }, { "epoch": 108.3717561858781, "grad_norm": 4.781141757965088, "learning_rate": 1.9997835003605212e-05, "loss": 0.0415, "step": 89840 }, { "epoch": 108.3838261919131, "grad_norm": 4.134364604949951, "learning_rate": 1.9997834762350415e-05, "loss": 0.0411, "step": 89850 }, { "epoch": 108.3958961979481, "grad_norm": 4.2039408683776855, "learning_rate": 1.999783452109562e-05, "loss": 0.04, "step": 89860 }, { "epoch": 108.40796620398311, "grad_norm": 4.38966703414917, "learning_rate": 1.9997834279840827e-05, "loss": 0.04, "step": 89870 }, { "epoch": 108.42003621001811, "grad_norm": 4.374575614929199, "learning_rate": 1.9997834038586034e-05, "loss": 0.0415, "step": 89880 }, { "epoch": 108.43210621605311, "grad_norm": 4.514824867248535, "learning_rate": 1.999783379733124e-05, "loss": 0.0406, "step": 89890 }, { "epoch": 108.44417622208812, "grad_norm": 4.912534713745117, "learning_rate": 1.9997833556076446e-05, "loss": 0.0428, "step": 89900 }, { "epoch": 108.45624622812312, "grad_norm": 4.0932817459106445, "learning_rate": 1.9997833314821652e-05, "loss": 0.0416, 
"step": 89910 }, { "epoch": 108.46831623415812, "grad_norm": 4.431766510009766, "learning_rate": 1.999783307356686e-05, "loss": 0.0418, "step": 89920 }, { "epoch": 108.48038624019313, "grad_norm": 4.514699459075928, "learning_rate": 1.9997832832312065e-05, "loss": 0.0432, "step": 89930 }, { "epoch": 108.49245624622813, "grad_norm": 3.7669222354888916, "learning_rate": 1.999783259105727e-05, "loss": 0.0426, "step": 89940 }, { "epoch": 108.50452625226312, "grad_norm": 4.688840389251709, "learning_rate": 1.9997832349802477e-05, "loss": 0.044, "step": 89950 }, { "epoch": 108.51659625829812, "grad_norm": 3.9050254821777344, "learning_rate": 1.9997832108547683e-05, "loss": 0.0414, "step": 89960 }, { "epoch": 108.52866626433313, "grad_norm": 4.6415815353393555, "learning_rate": 1.999783186729289e-05, "loss": 0.0416, "step": 89970 }, { "epoch": 108.54073627036813, "grad_norm": 4.2915191650390625, "learning_rate": 1.9997831626038096e-05, "loss": 0.0445, "step": 89980 }, { "epoch": 108.55280627640313, "grad_norm": 3.88543701171875, "learning_rate": 1.9997831384783302e-05, "loss": 0.0443, "step": 89990 }, { "epoch": 108.56487628243814, "grad_norm": 4.155815124511719, "learning_rate": 1.9997831143528508e-05, "loss": 0.0447, "step": 90000 }, { "epoch": 108.56487628243814, "eval_loss": 12.96729564666748, "eval_runtime": 8.1305, "eval_samples_per_second": 85.727, "eval_steps_per_second": 10.823, "step": 90000 }, { "epoch": 108.57694628847314, "grad_norm": 4.019302845001221, "learning_rate": 1.9997830902273714e-05, "loss": 0.0439, "step": 90010 }, { "epoch": 108.58901629450814, "grad_norm": 4.12695837020874, "learning_rate": 1.999783066101892e-05, "loss": 0.0432, "step": 90020 }, { "epoch": 108.60108630054314, "grad_norm": 4.374637603759766, "learning_rate": 1.9997830419764127e-05, "loss": 0.0414, "step": 90030 }, { "epoch": 108.61315630657815, "grad_norm": 4.302980899810791, "learning_rate": 1.9997830178509333e-05, "loss": 0.0457, "step": 90040 }, { "epoch": 108.62522631261315, 
"grad_norm": 4.537432670593262, "learning_rate": 1.999782993725454e-05, "loss": 0.0459, "step": 90050 }, { "epoch": 108.63729631864815, "grad_norm": 4.327533721923828, "learning_rate": 1.9997829695999745e-05, "loss": 0.0439, "step": 90060 }, { "epoch": 108.64936632468316, "grad_norm": 4.170380115509033, "learning_rate": 1.999782945474495e-05, "loss": 0.0436, "step": 90070 }, { "epoch": 108.66143633071816, "grad_norm": 4.659152507781982, "learning_rate": 1.9997829213490158e-05, "loss": 0.0454, "step": 90080 }, { "epoch": 108.67350633675316, "grad_norm": 4.749820709228516, "learning_rate": 1.9997828972235364e-05, "loss": 0.0435, "step": 90090 }, { "epoch": 108.68557634278817, "grad_norm": 3.9367377758026123, "learning_rate": 1.999782873098057e-05, "loss": 0.0433, "step": 90100 }, { "epoch": 108.69764634882317, "grad_norm": 4.250601768493652, "learning_rate": 1.9997828489725776e-05, "loss": 0.0437, "step": 90110 }, { "epoch": 108.70971635485817, "grad_norm": 5.211303234100342, "learning_rate": 1.9997828248470983e-05, "loss": 0.0443, "step": 90120 }, { "epoch": 108.72178636089318, "grad_norm": 4.711114406585693, "learning_rate": 1.999782800721619e-05, "loss": 0.0454, "step": 90130 }, { "epoch": 108.73385636692818, "grad_norm": 5.09336519241333, "learning_rate": 1.9997827765961395e-05, "loss": 0.0466, "step": 90140 }, { "epoch": 108.74592637296318, "grad_norm": 4.589167594909668, "learning_rate": 1.99978275247066e-05, "loss": 0.0455, "step": 90150 }, { "epoch": 108.75799637899819, "grad_norm": 4.011967182159424, "learning_rate": 1.9997827283451808e-05, "loss": 0.0445, "step": 90160 }, { "epoch": 108.77006638503319, "grad_norm": 4.849642276763916, "learning_rate": 1.9997827042197014e-05, "loss": 0.048, "step": 90170 }, { "epoch": 108.78213639106819, "grad_norm": 4.475957870483398, "learning_rate": 1.999782680094222e-05, "loss": 0.0454, "step": 90180 }, { "epoch": 108.7942063971032, "grad_norm": 4.674715995788574, "learning_rate": 1.9997826559687426e-05, "loss": 0.047, 
"step": 90190 }, { "epoch": 108.8062764031382, "grad_norm": 4.881089687347412, "learning_rate": 1.9997826318432632e-05, "loss": 0.0464, "step": 90200 }, { "epoch": 108.8183464091732, "grad_norm": 4.182114124298096, "learning_rate": 1.999782607717784e-05, "loss": 0.0461, "step": 90210 }, { "epoch": 108.8304164152082, "grad_norm": 4.662785530090332, "learning_rate": 1.9997825835923045e-05, "loss": 0.0458, "step": 90220 }, { "epoch": 108.84248642124321, "grad_norm": 5.093433856964111, "learning_rate": 1.999782559466825e-05, "loss": 0.0447, "step": 90230 }, { "epoch": 108.85455642727821, "grad_norm": 4.337923049926758, "learning_rate": 1.9997825353413457e-05, "loss": 0.0468, "step": 90240 }, { "epoch": 108.86662643331321, "grad_norm": 4.426502704620361, "learning_rate": 1.9997825112158663e-05, "loss": 0.0463, "step": 90250 }, { "epoch": 108.87869643934822, "grad_norm": 4.072250843048096, "learning_rate": 1.999782487090387e-05, "loss": 0.0447, "step": 90260 }, { "epoch": 108.89076644538322, "grad_norm": 4.5595831871032715, "learning_rate": 1.9997824629649073e-05, "loss": 0.0477, "step": 90270 }, { "epoch": 108.90283645141822, "grad_norm": 4.578862190246582, "learning_rate": 1.999782438839428e-05, "loss": 0.0455, "step": 90280 }, { "epoch": 108.91490645745323, "grad_norm": 4.709077835083008, "learning_rate": 1.9997824147139485e-05, "loss": 0.0467, "step": 90290 }, { "epoch": 108.92697646348823, "grad_norm": 4.659517765045166, "learning_rate": 1.999782390588469e-05, "loss": 0.0483, "step": 90300 }, { "epoch": 108.93904646952323, "grad_norm": 4.436054229736328, "learning_rate": 1.9997823664629897e-05, "loss": 0.0448, "step": 90310 }, { "epoch": 108.95111647555824, "grad_norm": 4.4680562019348145, "learning_rate": 1.9997823423375104e-05, "loss": 0.046, "step": 90320 }, { "epoch": 108.96318648159324, "grad_norm": 4.350229740142822, "learning_rate": 1.999782318212031e-05, "loss": 0.0473, "step": 90330 }, { "epoch": 108.97525648762824, "grad_norm": 4.226140022277832, 
"learning_rate": 1.9997822940865516e-05, "loss": 0.0459, "step": 90340 }, { "epoch": 108.98732649366325, "grad_norm": 4.849513530731201, "learning_rate": 1.9997822699610722e-05, "loss": 0.0476, "step": 90350 }, { "epoch": 108.99939649969825, "grad_norm": 4.219803333282471, "learning_rate": 1.999782245835593e-05, "loss": 0.0453, "step": 90360 }, { "epoch": 109.0108630054315, "grad_norm": 3.8839714527130127, "learning_rate": 1.9997822217101135e-05, "loss": 0.0346, "step": 90370 }, { "epoch": 109.0229330114665, "grad_norm": 4.564253330230713, "learning_rate": 1.999782197584634e-05, "loss": 0.0319, "step": 90380 }, { "epoch": 109.03500301750151, "grad_norm": 3.8644320964813232, "learning_rate": 1.9997821734591547e-05, "loss": 0.0322, "step": 90390 }, { "epoch": 109.04707302353651, "grad_norm": 3.4287331104278564, "learning_rate": 1.9997821493336753e-05, "loss": 0.0346, "step": 90400 }, { "epoch": 109.05914302957152, "grad_norm": 3.3080012798309326, "learning_rate": 1.999782125208196e-05, "loss": 0.0346, "step": 90410 }, { "epoch": 109.07121303560652, "grad_norm": 4.1977057456970215, "learning_rate": 1.9997821010827166e-05, "loss": 0.0354, "step": 90420 }, { "epoch": 109.08328304164152, "grad_norm": 4.163266181945801, "learning_rate": 1.9997820769572372e-05, "loss": 0.0377, "step": 90430 }, { "epoch": 109.09535304767653, "grad_norm": 3.9655280113220215, "learning_rate": 1.9997820528317578e-05, "loss": 0.0368, "step": 90440 }, { "epoch": 109.10742305371153, "grad_norm": 3.762136697769165, "learning_rate": 1.9997820287062784e-05, "loss": 0.0349, "step": 90450 }, { "epoch": 109.11949305974653, "grad_norm": 4.228010177612305, "learning_rate": 1.999782004580799e-05, "loss": 0.0362, "step": 90460 }, { "epoch": 109.13156306578153, "grad_norm": 4.456912994384766, "learning_rate": 1.9997819804553197e-05, "loss": 0.0394, "step": 90470 }, { "epoch": 109.14363307181654, "grad_norm": 3.9839894771575928, "learning_rate": 1.9997819563298403e-05, "loss": 0.037, "step": 90480 }, { 
"epoch": 109.15570307785154, "grad_norm": 3.8375511169433594, "learning_rate": 1.999781932204361e-05, "loss": 0.04, "step": 90490 }, { "epoch": 109.16777308388654, "grad_norm": 4.464683532714844, "learning_rate": 1.9997819080788815e-05, "loss": 0.0372, "step": 90500 }, { "epoch": 109.16777308388654, "eval_loss": 12.960920333862305, "eval_runtime": 8.1265, "eval_samples_per_second": 85.769, "eval_steps_per_second": 10.829, "step": 90500 }, { "epoch": 109.17984308992155, "grad_norm": 4.299412250518799, "learning_rate": 1.9997818839534022e-05, "loss": 0.0385, "step": 90510 }, { "epoch": 109.19191309595655, "grad_norm": 4.3527445793151855, "learning_rate": 1.9997818598279225e-05, "loss": 0.0389, "step": 90520 }, { "epoch": 109.20398310199155, "grad_norm": 3.907951593399048, "learning_rate": 1.999781835702443e-05, "loss": 0.0391, "step": 90530 }, { "epoch": 109.21605310802656, "grad_norm": 4.134695053100586, "learning_rate": 1.9997818115769637e-05, "loss": 0.0386, "step": 90540 }, { "epoch": 109.22812311406156, "grad_norm": 3.907813310623169, "learning_rate": 1.9997817874514843e-05, "loss": 0.0399, "step": 90550 }, { "epoch": 109.24019312009656, "grad_norm": 3.6312105655670166, "learning_rate": 1.999781763326005e-05, "loss": 0.0366, "step": 90560 }, { "epoch": 109.25226312613157, "grad_norm": 4.186295032501221, "learning_rate": 1.9997817392005256e-05, "loss": 0.0373, "step": 90570 }, { "epoch": 109.26433313216657, "grad_norm": 3.722766876220703, "learning_rate": 1.9997817150750462e-05, "loss": 0.0387, "step": 90580 }, { "epoch": 109.27640313820157, "grad_norm": 4.047800064086914, "learning_rate": 1.999781690949567e-05, "loss": 0.0403, "step": 90590 }, { "epoch": 109.28847314423658, "grad_norm": 3.8306405544281006, "learning_rate": 1.9997816668240878e-05, "loss": 0.0391, "step": 90600 }, { "epoch": 109.30054315027158, "grad_norm": 4.073200702667236, "learning_rate": 1.9997816426986084e-05, "loss": 0.0396, "step": 90610 }, { "epoch": 109.31261315630658, "grad_norm": 
3.9266772270202637, "learning_rate": 1.999781618573129e-05, "loss": 0.0411, "step": 90620 }, { "epoch": 109.32468316234159, "grad_norm": 4.474181175231934, "learning_rate": 1.9997815944476496e-05, "loss": 0.0412, "step": 90630 }, { "epoch": 109.33675316837659, "grad_norm": 3.850006341934204, "learning_rate": 1.9997815703221702e-05, "loss": 0.0391, "step": 90640 }, { "epoch": 109.34882317441159, "grad_norm": 3.9543309211730957, "learning_rate": 1.999781546196691e-05, "loss": 0.0411, "step": 90650 }, { "epoch": 109.3608931804466, "grad_norm": 4.576484680175781, "learning_rate": 1.9997815220712115e-05, "loss": 0.0415, "step": 90660 }, { "epoch": 109.3729631864816, "grad_norm": 4.395376682281494, "learning_rate": 1.999781497945732e-05, "loss": 0.0407, "step": 90670 }, { "epoch": 109.3850331925166, "grad_norm": 4.72895622253418, "learning_rate": 1.9997814738202524e-05, "loss": 0.0416, "step": 90680 }, { "epoch": 109.3971031985516, "grad_norm": 4.671524524688721, "learning_rate": 1.999781449694773e-05, "loss": 0.043, "step": 90690 }, { "epoch": 109.40917320458661, "grad_norm": 3.8146491050720215, "learning_rate": 1.9997814255692936e-05, "loss": 0.0385, "step": 90700 }, { "epoch": 109.42124321062161, "grad_norm": 4.006521701812744, "learning_rate": 1.9997814014438143e-05, "loss": 0.0433, "step": 90710 }, { "epoch": 109.43331321665661, "grad_norm": 4.076420307159424, "learning_rate": 1.999781377318335e-05, "loss": 0.0408, "step": 90720 }, { "epoch": 109.44538322269162, "grad_norm": 4.200831890106201, "learning_rate": 1.9997813531928555e-05, "loss": 0.0415, "step": 90730 }, { "epoch": 109.45745322872662, "grad_norm": 3.705306053161621, "learning_rate": 1.999781329067376e-05, "loss": 0.0414, "step": 90740 }, { "epoch": 109.46952323476162, "grad_norm": 4.29075813293457, "learning_rate": 1.9997813049418967e-05, "loss": 0.04, "step": 90750 }, { "epoch": 109.48159324079663, "grad_norm": 4.04674768447876, "learning_rate": 1.9997812808164174e-05, "loss": 0.0415, "step": 90760 }, { 
"epoch": 109.49366324683163, "grad_norm": 4.682492733001709, "learning_rate": 1.999781256690938e-05, "loss": 0.0435, "step": 90770 }, { "epoch": 109.50573325286662, "grad_norm": 3.911781072616577, "learning_rate": 1.9997812325654586e-05, "loss": 0.0409, "step": 90780 }, { "epoch": 109.51780325890162, "grad_norm": 4.380587577819824, "learning_rate": 1.9997812084399792e-05, "loss": 0.042, "step": 90790 }, { "epoch": 109.52987326493663, "grad_norm": 4.537583827972412, "learning_rate": 1.9997811843145e-05, "loss": 0.0432, "step": 90800 }, { "epoch": 109.54194327097163, "grad_norm": 4.304986000061035, "learning_rate": 1.9997811601890205e-05, "loss": 0.0429, "step": 90810 }, { "epoch": 109.55401327700663, "grad_norm": 4.179041385650635, "learning_rate": 1.999781136063541e-05, "loss": 0.042, "step": 90820 }, { "epoch": 109.56608328304164, "grad_norm": 4.328545570373535, "learning_rate": 1.9997811119380617e-05, "loss": 0.0438, "step": 90830 }, { "epoch": 109.57815328907664, "grad_norm": 4.188544273376465, "learning_rate": 1.9997810878125823e-05, "loss": 0.0423, "step": 90840 }, { "epoch": 109.59022329511164, "grad_norm": 4.627157688140869, "learning_rate": 1.999781063687103e-05, "loss": 0.0454, "step": 90850 }, { "epoch": 109.60229330114664, "grad_norm": 4.659063816070557, "learning_rate": 1.9997810395616236e-05, "loss": 0.0444, "step": 90860 }, { "epoch": 109.61436330718165, "grad_norm": 4.813961029052734, "learning_rate": 1.9997810154361442e-05, "loss": 0.0438, "step": 90870 }, { "epoch": 109.62643331321665, "grad_norm": 4.102046966552734, "learning_rate": 1.9997809913106648e-05, "loss": 0.043, "step": 90880 }, { "epoch": 109.63850331925165, "grad_norm": 4.927762985229492, "learning_rate": 1.9997809671851854e-05, "loss": 0.0467, "step": 90890 }, { "epoch": 109.65057332528666, "grad_norm": 4.298531532287598, "learning_rate": 1.999780943059706e-05, "loss": 0.0444, "step": 90900 }, { "epoch": 109.66264333132166, "grad_norm": 4.661178112030029, "learning_rate": 
1.9997809189342267e-05, "loss": 0.0429, "step": 90910 }, { "epoch": 109.67471333735666, "grad_norm": 4.60453987121582, "learning_rate": 1.9997808948087473e-05, "loss": 0.0437, "step": 90920 }, { "epoch": 109.68678334339167, "grad_norm": 4.094152927398682, "learning_rate": 1.9997808706832676e-05, "loss": 0.046, "step": 90930 }, { "epoch": 109.69885334942667, "grad_norm": 4.101780414581299, "learning_rate": 1.9997808465577882e-05, "loss": 0.0438, "step": 90940 }, { "epoch": 109.71092335546167, "grad_norm": 4.080630302429199, "learning_rate": 1.999780822432309e-05, "loss": 0.0429, "step": 90950 }, { "epoch": 109.72299336149668, "grad_norm": 4.60542631149292, "learning_rate": 1.9997807983068295e-05, "loss": 0.0443, "step": 90960 }, { "epoch": 109.73506336753168, "grad_norm": 4.4904656410217285, "learning_rate": 1.99978077418135e-05, "loss": 0.043, "step": 90970 }, { "epoch": 109.74713337356668, "grad_norm": 4.794617176055908, "learning_rate": 1.9997807500558707e-05, "loss": 0.0427, "step": 90980 }, { "epoch": 109.75920337960169, "grad_norm": 4.119545936584473, "learning_rate": 1.9997807259303913e-05, "loss": 0.0445, "step": 90990 }, { "epoch": 109.77127338563669, "grad_norm": 4.18511962890625, "learning_rate": 1.999780701804912e-05, "loss": 0.0461, "step": 91000 }, { "epoch": 109.77127338563669, "eval_loss": 12.989845275878906, "eval_runtime": 8.1129, "eval_samples_per_second": 85.912, "eval_steps_per_second": 10.847, "step": 91000 }, { "epoch": 109.78334339167169, "grad_norm": 4.434848308563232, "learning_rate": 1.9997806776794326e-05, "loss": 0.0461, "step": 91010 }, { "epoch": 109.7954133977067, "grad_norm": 4.137471675872803, "learning_rate": 1.9997806535539532e-05, "loss": 0.0461, "step": 91020 }, { "epoch": 109.8074834037417, "grad_norm": 4.606768608093262, "learning_rate": 1.9997806294284738e-05, "loss": 0.0448, "step": 91030 }, { "epoch": 109.8195534097767, "grad_norm": 4.372289180755615, "learning_rate": 1.9997806053029944e-05, "loss": 0.0441, "step": 91040 }, 
{ "epoch": 109.8316234158117, "grad_norm": 4.712072372436523, "learning_rate": 1.999780581177515e-05, "loss": 0.0431, "step": 91050 }, { "epoch": 109.84369342184671, "grad_norm": 4.582446575164795, "learning_rate": 1.9997805570520357e-05, "loss": 0.0459, "step": 91060 }, { "epoch": 109.85576342788171, "grad_norm": 4.7080254554748535, "learning_rate": 1.9997805329265563e-05, "loss": 0.0472, "step": 91070 }, { "epoch": 109.86783343391672, "grad_norm": 4.456027507781982, "learning_rate": 1.999780508801077e-05, "loss": 0.0452, "step": 91080 }, { "epoch": 109.87990343995172, "grad_norm": 4.6008830070495605, "learning_rate": 1.9997804846755975e-05, "loss": 0.0475, "step": 91090 }, { "epoch": 109.89197344598672, "grad_norm": 4.748607635498047, "learning_rate": 1.999780460550118e-05, "loss": 0.0459, "step": 91100 }, { "epoch": 109.90404345202172, "grad_norm": 4.262002944946289, "learning_rate": 1.9997804364246388e-05, "loss": 0.0432, "step": 91110 }, { "epoch": 109.91611345805673, "grad_norm": 4.320861339569092, "learning_rate": 1.9997804122991594e-05, "loss": 0.045, "step": 91120 }, { "epoch": 109.92818346409173, "grad_norm": 4.409178733825684, "learning_rate": 1.99978038817368e-05, "loss": 0.0459, "step": 91130 }, { "epoch": 109.94025347012673, "grad_norm": 5.161316871643066, "learning_rate": 1.9997803640482006e-05, "loss": 0.0459, "step": 91140 }, { "epoch": 109.95232347616174, "grad_norm": 4.141584396362305, "learning_rate": 1.9997803399227213e-05, "loss": 0.0454, "step": 91150 }, { "epoch": 109.96439348219674, "grad_norm": 4.2108917236328125, "learning_rate": 1.999780315797242e-05, "loss": 0.0475, "step": 91160 }, { "epoch": 109.97646348823174, "grad_norm": 4.595193862915039, "learning_rate": 1.9997802916717625e-05, "loss": 0.0467, "step": 91170 }, { "epoch": 109.98853349426675, "grad_norm": 4.86818790435791, "learning_rate": 1.999780267546283e-05, "loss": 0.047, "step": 91180 }, { "epoch": 110.0, "grad_norm": 8.344314575195312, "learning_rate": 
1.9997802434208038e-05, "loss": 0.0467, "step": 91190 }, { "epoch": 110.012070006035, "grad_norm": 3.390171527862549, "learning_rate": 1.9997802192953244e-05, "loss": 0.032, "step": 91200 }, { "epoch": 110.02414001207, "grad_norm": 4.013706684112549, "learning_rate": 1.999780195169845e-05, "loss": 0.0344, "step": 91210 }, { "epoch": 110.03621001810501, "grad_norm": 3.688868999481201, "learning_rate": 1.9997801710443656e-05, "loss": 0.0332, "step": 91220 }, { "epoch": 110.04828002414001, "grad_norm": 4.028476238250732, "learning_rate": 1.9997801469188862e-05, "loss": 0.0335, "step": 91230 }, { "epoch": 110.06035003017502, "grad_norm": 3.8102633953094482, "learning_rate": 1.999780122793407e-05, "loss": 0.0355, "step": 91240 }, { "epoch": 110.07242003621002, "grad_norm": 3.39420223236084, "learning_rate": 1.9997800986679275e-05, "loss": 0.0351, "step": 91250 }, { "epoch": 110.08449004224502, "grad_norm": 3.657534599304199, "learning_rate": 1.999780074542448e-05, "loss": 0.0356, "step": 91260 }, { "epoch": 110.09656004828003, "grad_norm": 4.269285202026367, "learning_rate": 1.9997800504169687e-05, "loss": 0.0341, "step": 91270 }, { "epoch": 110.10863005431503, "grad_norm": 4.12546443939209, "learning_rate": 1.9997800262914894e-05, "loss": 0.0353, "step": 91280 }, { "epoch": 110.12070006035003, "grad_norm": 4.385762691497803, "learning_rate": 1.99978000216601e-05, "loss": 0.0363, "step": 91290 }, { "epoch": 110.13277006638504, "grad_norm": 4.339166164398193, "learning_rate": 1.9997799780405306e-05, "loss": 0.0395, "step": 91300 }, { "epoch": 110.14484007242004, "grad_norm": 4.074184894561768, "learning_rate": 1.9997799539150512e-05, "loss": 0.0383, "step": 91310 }, { "epoch": 110.15691007845504, "grad_norm": 3.474895715713501, "learning_rate": 1.999779929789572e-05, "loss": 0.0349, "step": 91320 }, { "epoch": 110.16898008449004, "grad_norm": 4.256560325622559, "learning_rate": 1.9997799056640925e-05, "loss": 0.0362, "step": 91330 }, { "epoch": 110.18105009052505, 
"grad_norm": 4.043673515319824, "learning_rate": 1.9997798815386127e-05, "loss": 0.0389, "step": 91340 }, { "epoch": 110.19312009656005, "grad_norm": 3.8107805252075195, "learning_rate": 1.9997798574131334e-05, "loss": 0.0387, "step": 91350 }, { "epoch": 110.20519010259505, "grad_norm": 4.444796562194824, "learning_rate": 1.999779833287654e-05, "loss": 0.0393, "step": 91360 }, { "epoch": 110.21726010863006, "grad_norm": 3.625535249710083, "learning_rate": 1.9997798091621746e-05, "loss": 0.0384, "step": 91370 }, { "epoch": 110.22933011466506, "grad_norm": 4.079197883605957, "learning_rate": 1.9997797850366952e-05, "loss": 0.0391, "step": 91380 }, { "epoch": 110.24140012070006, "grad_norm": 4.273597717285156, "learning_rate": 1.999779760911216e-05, "loss": 0.0397, "step": 91390 }, { "epoch": 110.25347012673507, "grad_norm": 4.152924537658691, "learning_rate": 1.9997797367857365e-05, "loss": 0.0399, "step": 91400 }, { "epoch": 110.26554013277007, "grad_norm": 4.370284557342529, "learning_rate": 1.999779712660257e-05, "loss": 0.0396, "step": 91410 }, { "epoch": 110.27761013880507, "grad_norm": 4.242639064788818, "learning_rate": 1.9997796885347777e-05, "loss": 0.0412, "step": 91420 }, { "epoch": 110.28968014484008, "grad_norm": 3.9729042053222656, "learning_rate": 1.9997796644092983e-05, "loss": 0.0376, "step": 91430 }, { "epoch": 110.30175015087508, "grad_norm": 4.1473541259765625, "learning_rate": 1.999779640283819e-05, "loss": 0.0389, "step": 91440 }, { "epoch": 110.31382015691008, "grad_norm": 4.4654669761657715, "learning_rate": 1.9997796161583396e-05, "loss": 0.0394, "step": 91450 }, { "epoch": 110.32589016294509, "grad_norm": 4.201931953430176, "learning_rate": 1.9997795920328602e-05, "loss": 0.0418, "step": 91460 }, { "epoch": 110.33796016898009, "grad_norm": 3.7360668182373047, "learning_rate": 1.9997795679073808e-05, "loss": 0.0398, "step": 91470 }, { "epoch": 110.35003017501509, "grad_norm": 3.781140089035034, "learning_rate": 1.9997795437819014e-05, "loss": 
0.0399, "step": 91480 }, { "epoch": 110.3621001810501, "grad_norm": 4.639044761657715, "learning_rate": 1.999779519656422e-05, "loss": 0.0427, "step": 91490 }, { "epoch": 110.3741701870851, "grad_norm": 4.255108833312988, "learning_rate": 1.9997794955309427e-05, "loss": 0.0413, "step": 91500 }, { "epoch": 110.3741701870851, "eval_loss": 12.989724159240723, "eval_runtime": 8.161, "eval_samples_per_second": 85.406, "eval_steps_per_second": 10.783, "step": 91500 }, { "epoch": 110.3862401931201, "grad_norm": 4.690604209899902, "learning_rate": 1.9997794714054633e-05, "loss": 0.0384, "step": 91510 }, { "epoch": 110.3983101991551, "grad_norm": 4.369612693786621, "learning_rate": 1.999779447279984e-05, "loss": 0.0413, "step": 91520 }, { "epoch": 110.41038020519011, "grad_norm": 4.343887805938721, "learning_rate": 1.9997794231545046e-05, "loss": 0.04, "step": 91530 }, { "epoch": 110.42245021122511, "grad_norm": 4.578200817108154, "learning_rate": 1.9997793990290252e-05, "loss": 0.0411, "step": 91540 }, { "epoch": 110.43452021726011, "grad_norm": 4.305022239685059, "learning_rate": 1.9997793749035458e-05, "loss": 0.0408, "step": 91550 }, { "epoch": 110.44659022329512, "grad_norm": 4.247934818267822, "learning_rate": 1.9997793507780664e-05, "loss": 0.0424, "step": 91560 }, { "epoch": 110.45866022933012, "grad_norm": 3.983776569366455, "learning_rate": 1.999779326652587e-05, "loss": 0.0422, "step": 91570 }, { "epoch": 110.47073023536512, "grad_norm": 4.134951591491699, "learning_rate": 1.9997793025271077e-05, "loss": 0.0421, "step": 91580 }, { "epoch": 110.48280024140013, "grad_norm": 4.1630988121032715, "learning_rate": 1.999779278401628e-05, "loss": 0.0419, "step": 91590 }, { "epoch": 110.49487024743513, "grad_norm": 3.70658802986145, "learning_rate": 1.9997792542761486e-05, "loss": 0.0421, "step": 91600 }, { "epoch": 110.50694025347012, "grad_norm": 4.3523268699646, "learning_rate": 1.9997792301506692e-05, "loss": 0.0416, "step": 91610 }, { "epoch": 110.51901025950512, 
"grad_norm": 4.0946502685546875, "learning_rate": 1.9997792060251898e-05, "loss": 0.0418, "step": 91620 }, { "epoch": 110.53108026554013, "grad_norm": 4.027958869934082, "learning_rate": 1.9997791818997104e-05, "loss": 0.0403, "step": 91630 }, { "epoch": 110.54315027157513, "grad_norm": 4.565913677215576, "learning_rate": 1.999779157774231e-05, "loss": 0.0453, "step": 91640 }, { "epoch": 110.55522027761013, "grad_norm": 4.47498893737793, "learning_rate": 1.9997791336487517e-05, "loss": 0.0428, "step": 91650 }, { "epoch": 110.56729028364514, "grad_norm": 4.154623031616211, "learning_rate": 1.9997791095232723e-05, "loss": 0.0436, "step": 91660 }, { "epoch": 110.57936028968014, "grad_norm": 4.643732070922852, "learning_rate": 1.9997790853977933e-05, "loss": 0.0441, "step": 91670 }, { "epoch": 110.59143029571514, "grad_norm": 3.9122884273529053, "learning_rate": 1.999779061272314e-05, "loss": 0.0416, "step": 91680 }, { "epoch": 110.60350030175015, "grad_norm": 4.409422874450684, "learning_rate": 1.9997790371468345e-05, "loss": 0.0434, "step": 91690 }, { "epoch": 110.61557030778515, "grad_norm": 4.323653697967529, "learning_rate": 1.999779013021355e-05, "loss": 0.0432, "step": 91700 }, { "epoch": 110.62764031382015, "grad_norm": 4.168150901794434, "learning_rate": 1.9997789888958757e-05, "loss": 0.0435, "step": 91710 }, { "epoch": 110.63971031985515, "grad_norm": 4.473565578460693, "learning_rate": 1.9997789647703964e-05, "loss": 0.0433, "step": 91720 }, { "epoch": 110.65178032589016, "grad_norm": 4.8348565101623535, "learning_rate": 1.999778940644917e-05, "loss": 0.0408, "step": 91730 }, { "epoch": 110.66385033192516, "grad_norm": 4.139120101928711, "learning_rate": 1.9997789165194376e-05, "loss": 0.042, "step": 91740 }, { "epoch": 110.67592033796016, "grad_norm": 4.262072563171387, "learning_rate": 1.9997788923939582e-05, "loss": 0.0421, "step": 91750 }, { "epoch": 110.68799034399517, "grad_norm": 4.817467212677002, "learning_rate": 1.9997788682684785e-05, "loss": 
0.0445, "step": 91760 }, { "epoch": 110.70006035003017, "grad_norm": 4.61085844039917, "learning_rate": 1.999778844142999e-05, "loss": 0.0422, "step": 91770 }, { "epoch": 110.71213035606517, "grad_norm": 4.519757270812988, "learning_rate": 1.9997788200175198e-05, "loss": 0.0445, "step": 91780 }, { "epoch": 110.72420036210018, "grad_norm": 4.645823001861572, "learning_rate": 1.9997787958920404e-05, "loss": 0.0425, "step": 91790 }, { "epoch": 110.73627036813518, "grad_norm": 4.322093963623047, "learning_rate": 1.999778771766561e-05, "loss": 0.046, "step": 91800 }, { "epoch": 110.74834037417018, "grad_norm": 4.6250739097595215, "learning_rate": 1.9997787476410816e-05, "loss": 0.0447, "step": 91810 }, { "epoch": 110.76041038020519, "grad_norm": 4.2336745262146, "learning_rate": 1.9997787235156022e-05, "loss": 0.0455, "step": 91820 }, { "epoch": 110.77248038624019, "grad_norm": 4.432364463806152, "learning_rate": 1.999778699390123e-05, "loss": 0.0453, "step": 91830 }, { "epoch": 110.7845503922752, "grad_norm": 3.8090150356292725, "learning_rate": 1.9997786752646435e-05, "loss": 0.0433, "step": 91840 }, { "epoch": 110.7966203983102, "grad_norm": 4.436679363250732, "learning_rate": 1.999778651139164e-05, "loss": 0.0442, "step": 91850 }, { "epoch": 110.8086904043452, "grad_norm": 4.3299102783203125, "learning_rate": 1.9997786270136847e-05, "loss": 0.0449, "step": 91860 }, { "epoch": 110.8207604103802, "grad_norm": 4.964810848236084, "learning_rate": 1.9997786028882053e-05, "loss": 0.0473, "step": 91870 }, { "epoch": 110.8328304164152, "grad_norm": 4.510311126708984, "learning_rate": 1.999778578762726e-05, "loss": 0.0442, "step": 91880 }, { "epoch": 110.84490042245021, "grad_norm": 4.339536666870117, "learning_rate": 1.9997785546372466e-05, "loss": 0.0475, "step": 91890 }, { "epoch": 110.85697042848521, "grad_norm": 4.296759128570557, "learning_rate": 1.9997785305117672e-05, "loss": 0.0475, "step": 91900 }, { "epoch": 110.86904043452022, "grad_norm": 4.460140228271484, 
"learning_rate": 1.9997785063862878e-05, "loss": 0.0471, "step": 91910 }, { "epoch": 110.88111044055522, "grad_norm": 4.626201629638672, "learning_rate": 1.9997784822608085e-05, "loss": 0.048, "step": 91920 }, { "epoch": 110.89318044659022, "grad_norm": 4.852933406829834, "learning_rate": 1.999778458135329e-05, "loss": 0.0467, "step": 91930 }, { "epoch": 110.90525045262522, "grad_norm": 4.7429914474487305, "learning_rate": 1.9997784340098497e-05, "loss": 0.0452, "step": 91940 }, { "epoch": 110.91732045866023, "grad_norm": 4.805665016174316, "learning_rate": 1.9997784098843703e-05, "loss": 0.0459, "step": 91950 }, { "epoch": 110.92939046469523, "grad_norm": 4.196742534637451, "learning_rate": 1.999778385758891e-05, "loss": 0.048, "step": 91960 }, { "epoch": 110.94146047073023, "grad_norm": 4.297990798950195, "learning_rate": 1.9997783616334116e-05, "loss": 0.0465, "step": 91970 }, { "epoch": 110.95353047676524, "grad_norm": 4.247276782989502, "learning_rate": 1.9997783375079322e-05, "loss": 0.0471, "step": 91980 }, { "epoch": 110.96560048280024, "grad_norm": 4.21795654296875, "learning_rate": 1.9997783133824528e-05, "loss": 0.0454, "step": 91990 }, { "epoch": 110.97767048883524, "grad_norm": 4.328416347503662, "learning_rate": 1.9997782892569734e-05, "loss": 0.044, "step": 92000 }, { "epoch": 110.97767048883524, "eval_loss": 13.000364303588867, "eval_runtime": 8.1565, "eval_samples_per_second": 85.454, "eval_steps_per_second": 10.789, "step": 92000 }, { "epoch": 110.98974049487025, "grad_norm": 4.254039287567139, "learning_rate": 1.9997782651314937e-05, "loss": 0.0468, "step": 92010 }, { "epoch": 111.0012070006035, "grad_norm": 3.4589145183563232, "learning_rate": 1.9997782410060143e-05, "loss": 0.044, "step": 92020 }, { "epoch": 111.0132770066385, "grad_norm": 3.781611919403076, "learning_rate": 1.999778216880535e-05, "loss": 0.0282, "step": 92030 }, { "epoch": 111.0253470126735, "grad_norm": 3.8568100929260254, "learning_rate": 1.9997781927550556e-05, "loss": 
0.031, "step": 92040 }, { "epoch": 111.03741701870851, "grad_norm": 3.6322529315948486, "learning_rate": 1.9997781686295762e-05, "loss": 0.0329, "step": 92050 }, { "epoch": 111.04948702474351, "grad_norm": 3.8193647861480713, "learning_rate": 1.9997781445040968e-05, "loss": 0.0342, "step": 92060 }, { "epoch": 111.06155703077852, "grad_norm": 3.8801920413970947, "learning_rate": 1.9997781203786174e-05, "loss": 0.0366, "step": 92070 }, { "epoch": 111.07362703681352, "grad_norm": 3.469130039215088, "learning_rate": 1.999778096253138e-05, "loss": 0.0349, "step": 92080 }, { "epoch": 111.08569704284852, "grad_norm": 4.030123233795166, "learning_rate": 1.9997780721276587e-05, "loss": 0.0354, "step": 92090 }, { "epoch": 111.09776704888353, "grad_norm": 4.379192352294922, "learning_rate": 1.9997780480021793e-05, "loss": 0.0357, "step": 92100 }, { "epoch": 111.10983705491853, "grad_norm": 3.976069927215576, "learning_rate": 1.9997780238767e-05, "loss": 0.0365, "step": 92110 }, { "epoch": 111.12190706095353, "grad_norm": 4.0763702392578125, "learning_rate": 1.9997779997512205e-05, "loss": 0.0365, "step": 92120 }, { "epoch": 111.13397706698854, "grad_norm": 4.240047454833984, "learning_rate": 1.999777975625741e-05, "loss": 0.0379, "step": 92130 }, { "epoch": 111.14604707302354, "grad_norm": 3.7347159385681152, "learning_rate": 1.9997779515002618e-05, "loss": 0.0363, "step": 92140 }, { "epoch": 111.15811707905854, "grad_norm": 3.8530211448669434, "learning_rate": 1.9997779273747824e-05, "loss": 0.0375, "step": 92150 }, { "epoch": 111.17018708509354, "grad_norm": 3.5895516872406006, "learning_rate": 1.999777903249303e-05, "loss": 0.0365, "step": 92160 }, { "epoch": 111.18225709112855, "grad_norm": 4.070231914520264, "learning_rate": 1.9997778791238237e-05, "loss": 0.0378, "step": 92170 }, { "epoch": 111.19432709716355, "grad_norm": 3.7306642532348633, "learning_rate": 1.9997778549983443e-05, "loss": 0.0389, "step": 92180 }, { "epoch": 111.20639710319855, "grad_norm": 
4.350713729858398, "learning_rate": 1.999777830872865e-05, "loss": 0.0388, "step": 92190 }, { "epoch": 111.21846710923356, "grad_norm": 3.743882656097412, "learning_rate": 1.9997778067473855e-05, "loss": 0.0386, "step": 92200 }, { "epoch": 111.23053711526856, "grad_norm": 3.7391982078552246, "learning_rate": 1.999777782621906e-05, "loss": 0.0391, "step": 92210 }, { "epoch": 111.24260712130356, "grad_norm": 4.211119174957275, "learning_rate": 1.9997777584964268e-05, "loss": 0.0381, "step": 92220 }, { "epoch": 111.25467712733857, "grad_norm": 3.994464874267578, "learning_rate": 1.9997777343709474e-05, "loss": 0.0386, "step": 92230 }, { "epoch": 111.26674713337357, "grad_norm": 3.9171533584594727, "learning_rate": 1.999777710245468e-05, "loss": 0.0384, "step": 92240 }, { "epoch": 111.27881713940857, "grad_norm": 4.527454853057861, "learning_rate": 1.9997776861199886e-05, "loss": 0.038, "step": 92250 }, { "epoch": 111.29088714544358, "grad_norm": 3.5625321865081787, "learning_rate": 1.9997776619945092e-05, "loss": 0.0389, "step": 92260 }, { "epoch": 111.30295715147858, "grad_norm": 4.016025543212891, "learning_rate": 1.99977763786903e-05, "loss": 0.0388, "step": 92270 }, { "epoch": 111.31502715751358, "grad_norm": 3.8937079906463623, "learning_rate": 1.9997776137435505e-05, "loss": 0.0366, "step": 92280 }, { "epoch": 111.32709716354859, "grad_norm": 4.104799270629883, "learning_rate": 1.999777589618071e-05, "loss": 0.0395, "step": 92290 }, { "epoch": 111.33916716958359, "grad_norm": 4.380310535430908, "learning_rate": 1.9997775654925917e-05, "loss": 0.0384, "step": 92300 }, { "epoch": 111.35123717561859, "grad_norm": 4.248131275177002, "learning_rate": 1.9997775413671124e-05, "loss": 0.0402, "step": 92310 }, { "epoch": 111.3633071816536, "grad_norm": 4.45596170425415, "learning_rate": 1.999777517241633e-05, "loss": 0.0385, "step": 92320 }, { "epoch": 111.3753771876886, "grad_norm": 3.8097689151763916, "learning_rate": 1.9997774931161536e-05, "loss": 0.0394, "step": 
92330 }, { "epoch": 111.3874471937236, "grad_norm": 4.288915634155273, "learning_rate": 1.9997774689906742e-05, "loss": 0.0402, "step": 92340 }, { "epoch": 111.3995171997586, "grad_norm": 4.442718982696533, "learning_rate": 1.999777444865195e-05, "loss": 0.0396, "step": 92350 }, { "epoch": 111.41158720579361, "grad_norm": 4.569223880767822, "learning_rate": 1.9997774207397155e-05, "loss": 0.0395, "step": 92360 }, { "epoch": 111.42365721182861, "grad_norm": 4.622280120849609, "learning_rate": 1.999777396614236e-05, "loss": 0.0413, "step": 92370 }, { "epoch": 111.43572721786362, "grad_norm": 3.9018237590789795, "learning_rate": 1.9997773724887567e-05, "loss": 0.0417, "step": 92380 }, { "epoch": 111.44779722389862, "grad_norm": 3.901007890701294, "learning_rate": 1.9997773483632773e-05, "loss": 0.0411, "step": 92390 }, { "epoch": 111.45986722993362, "grad_norm": 3.3235669136047363, "learning_rate": 1.999777324237798e-05, "loss": 0.0397, "step": 92400 }, { "epoch": 111.47193723596862, "grad_norm": 3.948411464691162, "learning_rate": 1.9997773001123186e-05, "loss": 0.0419, "step": 92410 }, { "epoch": 111.48400724200363, "grad_norm": 4.165928840637207, "learning_rate": 1.999777275986839e-05, "loss": 0.041, "step": 92420 }, { "epoch": 111.49607724803863, "grad_norm": 4.624114513397217, "learning_rate": 1.9997772518613595e-05, "loss": 0.0412, "step": 92430 }, { "epoch": 111.50814725407362, "grad_norm": 4.321056842803955, "learning_rate": 1.99977722773588e-05, "loss": 0.043, "step": 92440 }, { "epoch": 111.52021726010862, "grad_norm": 4.606742858886719, "learning_rate": 1.9997772036104007e-05, "loss": 0.0417, "step": 92450 }, { "epoch": 111.53228726614363, "grad_norm": 3.6410369873046875, "learning_rate": 1.9997771794849213e-05, "loss": 0.0409, "step": 92460 }, { "epoch": 111.54435727217863, "grad_norm": 4.619623184204102, "learning_rate": 1.999777155359442e-05, "loss": 0.0414, "step": 92470 }, { "epoch": 111.55642727821363, "grad_norm": 3.696096897125244, "learning_rate": 
1.9997771312339626e-05, "loss": 0.044, "step": 92480 }, { "epoch": 111.56849728424864, "grad_norm": 4.434634208679199, "learning_rate": 1.9997771071084832e-05, "loss": 0.0404, "step": 92490 }, { "epoch": 111.58056729028364, "grad_norm": 4.358808517456055, "learning_rate": 1.9997770829830038e-05, "loss": 0.0425, "step": 92500 }, { "epoch": 111.58056729028364, "eval_loss": 13.001224517822266, "eval_runtime": 8.1477, "eval_samples_per_second": 85.546, "eval_steps_per_second": 10.801, "step": 92500 }, { "epoch": 111.59263729631864, "grad_norm": 4.320267677307129, "learning_rate": 1.9997770588575244e-05, "loss": 0.0421, "step": 92510 }, { "epoch": 111.60470730235365, "grad_norm": 4.477258205413818, "learning_rate": 1.999777034732045e-05, "loss": 0.0446, "step": 92520 }, { "epoch": 111.61677730838865, "grad_norm": 4.309397220611572, "learning_rate": 1.9997770106065657e-05, "loss": 0.042, "step": 92530 }, { "epoch": 111.62884731442365, "grad_norm": 4.262439250946045, "learning_rate": 1.9997769864810863e-05, "loss": 0.0425, "step": 92540 }, { "epoch": 111.64091732045866, "grad_norm": 4.267659664154053, "learning_rate": 1.999776962355607e-05, "loss": 0.0426, "step": 92550 }, { "epoch": 111.65298732649366, "grad_norm": 4.726113319396973, "learning_rate": 1.9997769382301276e-05, "loss": 0.043, "step": 92560 }, { "epoch": 111.66505733252866, "grad_norm": 4.174613952636719, "learning_rate": 1.9997769141046482e-05, "loss": 0.0422, "step": 92570 }, { "epoch": 111.67712733856366, "grad_norm": 4.242783069610596, "learning_rate": 1.9997768899791688e-05, "loss": 0.0433, "step": 92580 }, { "epoch": 111.68919734459867, "grad_norm": 3.893270492553711, "learning_rate": 1.9997768658536894e-05, "loss": 0.0412, "step": 92590 }, { "epoch": 111.70126735063367, "grad_norm": 4.158952236175537, "learning_rate": 1.99977684172821e-05, "loss": 0.0444, "step": 92600 }, { "epoch": 111.71333735666867, "grad_norm": 4.3841729164123535, "learning_rate": 1.9997768176027307e-05, "loss": 0.0427, "step": 
92610 }, { "epoch": 111.72540736270368, "grad_norm": 4.481019020080566, "learning_rate": 1.9997767934772513e-05, "loss": 0.0438, "step": 92620 }, { "epoch": 111.73747736873868, "grad_norm": 4.453845977783203, "learning_rate": 1.999776769351772e-05, "loss": 0.0455, "step": 92630 }, { "epoch": 111.74954737477368, "grad_norm": 3.9472198486328125, "learning_rate": 1.9997767452262925e-05, "loss": 0.0435, "step": 92640 }, { "epoch": 111.76161738080869, "grad_norm": 4.781380653381348, "learning_rate": 1.999776721100813e-05, "loss": 0.0439, "step": 92650 }, { "epoch": 111.77368738684369, "grad_norm": 4.815435886383057, "learning_rate": 1.9997766969753338e-05, "loss": 0.0447, "step": 92660 }, { "epoch": 111.7857573928787, "grad_norm": 4.097535133361816, "learning_rate": 1.999776672849854e-05, "loss": 0.0442, "step": 92670 }, { "epoch": 111.7978273989137, "grad_norm": 4.213503360748291, "learning_rate": 1.9997766487243747e-05, "loss": 0.0441, "step": 92680 }, { "epoch": 111.8098974049487, "grad_norm": 4.765072822570801, "learning_rate": 1.9997766245988953e-05, "loss": 0.045, "step": 92690 }, { "epoch": 111.8219674109837, "grad_norm": 4.486705780029297, "learning_rate": 1.999776600473416e-05, "loss": 0.0432, "step": 92700 }, { "epoch": 111.8340374170187, "grad_norm": 4.264054775238037, "learning_rate": 1.9997765763479365e-05, "loss": 0.0471, "step": 92710 }, { "epoch": 111.84610742305371, "grad_norm": 4.708172798156738, "learning_rate": 1.999776552222457e-05, "loss": 0.045, "step": 92720 }, { "epoch": 111.85817742908871, "grad_norm": 4.713948726654053, "learning_rate": 1.9997765280969778e-05, "loss": 0.0445, "step": 92730 }, { "epoch": 111.87024743512372, "grad_norm": 5.141341209411621, "learning_rate": 1.9997765039714984e-05, "loss": 0.0451, "step": 92740 }, { "epoch": 111.88231744115872, "grad_norm": 4.309174537658691, "learning_rate": 1.9997764798460194e-05, "loss": 0.0474, "step": 92750 }, { "epoch": 111.89438744719372, "grad_norm": 4.619624614715576, "learning_rate": 
1.99977645572054e-05, "loss": 0.0452, "step": 92760 }, { "epoch": 111.90645745322873, "grad_norm": 4.174884796142578, "learning_rate": 1.9997764315950606e-05, "loss": 0.0476, "step": 92770 }, { "epoch": 111.91852745926373, "grad_norm": 4.635237216949463, "learning_rate": 1.9997764074695812e-05, "loss": 0.0453, "step": 92780 }, { "epoch": 111.93059746529873, "grad_norm": 3.9882655143737793, "learning_rate": 1.999776383344102e-05, "loss": 0.0421, "step": 92790 }, { "epoch": 111.94266747133373, "grad_norm": 4.558913707733154, "learning_rate": 1.9997763592186225e-05, "loss": 0.045, "step": 92800 }, { "epoch": 111.95473747736874, "grad_norm": 4.272077560424805, "learning_rate": 1.999776335093143e-05, "loss": 0.045, "step": 92810 }, { "epoch": 111.96680748340374, "grad_norm": 4.890918254852295, "learning_rate": 1.9997763109676637e-05, "loss": 0.0456, "step": 92820 }, { "epoch": 111.97887748943874, "grad_norm": 4.719023704528809, "learning_rate": 1.9997762868421843e-05, "loss": 0.0485, "step": 92830 }, { "epoch": 111.99094749547375, "grad_norm": 4.4405646324157715, "learning_rate": 1.9997762627167046e-05, "loss": 0.0464, "step": 92840 }, { "epoch": 112.002414001207, "grad_norm": 3.731797933578491, "learning_rate": 1.9997762385912252e-05, "loss": 0.0405, "step": 92850 }, { "epoch": 112.014484007242, "grad_norm": 3.162351608276367, "learning_rate": 1.999776214465746e-05, "loss": 0.0288, "step": 92860 }, { "epoch": 112.026554013277, "grad_norm": 4.453939437866211, "learning_rate": 1.9997761903402665e-05, "loss": 0.0309, "step": 92870 }, { "epoch": 112.03862401931201, "grad_norm": 3.8085150718688965, "learning_rate": 1.999776166214787e-05, "loss": 0.0367, "step": 92880 }, { "epoch": 112.05069402534701, "grad_norm": 3.882937431335449, "learning_rate": 1.9997761420893077e-05, "loss": 0.0321, "step": 92890 }, { "epoch": 112.06276403138202, "grad_norm": 3.55747389793396, "learning_rate": 1.9997761179638283e-05, "loss": 0.0341, "step": 92900 }, { "epoch": 112.07483403741702, 
"grad_norm": 3.922508478164673, "learning_rate": 1.999776093838349e-05, "loss": 0.0352, "step": 92910 }, { "epoch": 112.08690404345202, "grad_norm": 4.110607147216797, "learning_rate": 1.9997760697128696e-05, "loss": 0.0357, "step": 92920 }, { "epoch": 112.09897404948703, "grad_norm": 4.167728900909424, "learning_rate": 1.9997760455873902e-05, "loss": 0.0366, "step": 92930 }, { "epoch": 112.11104405552203, "grad_norm": 3.8136074542999268, "learning_rate": 1.999776021461911e-05, "loss": 0.0378, "step": 92940 }, { "epoch": 112.12311406155703, "grad_norm": 4.06884241104126, "learning_rate": 1.9997759973364315e-05, "loss": 0.0373, "step": 92950 }, { "epoch": 112.13518406759204, "grad_norm": 3.7550342082977295, "learning_rate": 1.999775973210952e-05, "loss": 0.0364, "step": 92960 }, { "epoch": 112.14725407362704, "grad_norm": 4.328742027282715, "learning_rate": 1.9997759490854727e-05, "loss": 0.0379, "step": 92970 }, { "epoch": 112.15932407966204, "grad_norm": 4.461869716644287, "learning_rate": 1.9997759249599933e-05, "loss": 0.0376, "step": 92980 }, { "epoch": 112.17139408569705, "grad_norm": 4.265625, "learning_rate": 1.999775900834514e-05, "loss": 0.0362, "step": 92990 }, { "epoch": 112.18346409173205, "grad_norm": 3.956714391708374, "learning_rate": 1.9997758767090346e-05, "loss": 0.0371, "step": 93000 }, { "epoch": 112.18346409173205, "eval_loss": 13.004483222961426, "eval_runtime": 8.165, "eval_samples_per_second": 85.364, "eval_steps_per_second": 10.778, "step": 93000 }, { "epoch": 112.19553409776705, "grad_norm": 4.455540657043457, "learning_rate": 1.9997758525835552e-05, "loss": 0.0373, "step": 93010 }, { "epoch": 112.20760410380205, "grad_norm": 4.416782379150391, "learning_rate": 1.9997758284580758e-05, "loss": 0.0366, "step": 93020 }, { "epoch": 112.21967410983706, "grad_norm": 3.8094773292541504, "learning_rate": 1.9997758043325964e-05, "loss": 0.0376, "step": 93030 }, { "epoch": 112.23174411587206, "grad_norm": 4.828092098236084, "learning_rate": 
1.999775780207117e-05, "loss": 0.0381, "step": 93040 }, { "epoch": 112.24381412190706, "grad_norm": 4.092986106872559, "learning_rate": 1.9997757560816377e-05, "loss": 0.0389, "step": 93050 }, { "epoch": 112.25588412794207, "grad_norm": 4.247086048126221, "learning_rate": 1.9997757319561583e-05, "loss": 0.0404, "step": 93060 }, { "epoch": 112.26795413397707, "grad_norm": 4.117461681365967, "learning_rate": 1.999775707830679e-05, "loss": 0.0379, "step": 93070 }, { "epoch": 112.28002414001207, "grad_norm": 4.710278511047363, "learning_rate": 1.9997756837051995e-05, "loss": 0.0408, "step": 93080 }, { "epoch": 112.29209414604708, "grad_norm": 3.810734510421753, "learning_rate": 1.9997756595797198e-05, "loss": 0.0396, "step": 93090 }, { "epoch": 112.30416415208208, "grad_norm": 3.50244140625, "learning_rate": 1.9997756354542404e-05, "loss": 0.0402, "step": 93100 }, { "epoch": 112.31623415811708, "grad_norm": 4.86591100692749, "learning_rate": 1.999775611328761e-05, "loss": 0.0412, "step": 93110 }, { "epoch": 112.32830416415209, "grad_norm": 4.326733112335205, "learning_rate": 1.9997755872032817e-05, "loss": 0.0392, "step": 93120 }, { "epoch": 112.34037417018709, "grad_norm": 3.962594985961914, "learning_rate": 1.9997755630778023e-05, "loss": 0.0395, "step": 93130 }, { "epoch": 112.3524441762221, "grad_norm": 4.085904598236084, "learning_rate": 1.999775538952323e-05, "loss": 0.0398, "step": 93140 }, { "epoch": 112.3645141822571, "grad_norm": 4.1428375244140625, "learning_rate": 1.9997755148268435e-05, "loss": 0.0407, "step": 93150 }, { "epoch": 112.3765841882921, "grad_norm": 4.008388996124268, "learning_rate": 1.999775490701364e-05, "loss": 0.0422, "step": 93160 }, { "epoch": 112.3886541943271, "grad_norm": 4.098060131072998, "learning_rate": 1.9997754665758848e-05, "loss": 0.0401, "step": 93170 }, { "epoch": 112.4007242003621, "grad_norm": 4.0945868492126465, "learning_rate": 1.9997754424504054e-05, "loss": 0.0413, "step": 93180 }, { "epoch": 112.41279420639711, 
"grad_norm": 3.8067994117736816, "learning_rate": 1.999775418324926e-05, "loss": 0.0403, "step": 93190 }, { "epoch": 112.42486421243211, "grad_norm": 4.2355499267578125, "learning_rate": 1.9997753941994467e-05, "loss": 0.0411, "step": 93200 }, { "epoch": 112.43693421846712, "grad_norm": 3.9985198974609375, "learning_rate": 1.9997753700739673e-05, "loss": 0.039, "step": 93210 }, { "epoch": 112.44900422450212, "grad_norm": 4.533400535583496, "learning_rate": 1.999775345948488e-05, "loss": 0.0405, "step": 93220 }, { "epoch": 112.46107423053712, "grad_norm": 4.095654010772705, "learning_rate": 1.9997753218230085e-05, "loss": 0.042, "step": 93230 }, { "epoch": 112.47314423657213, "grad_norm": 3.627504825592041, "learning_rate": 1.999775297697529e-05, "loss": 0.041, "step": 93240 }, { "epoch": 112.48521424260713, "grad_norm": 4.1308159828186035, "learning_rate": 1.9997752735720498e-05, "loss": 0.04, "step": 93250 }, { "epoch": 112.49728424864213, "grad_norm": 4.147393226623535, "learning_rate": 1.9997752494465704e-05, "loss": 0.0422, "step": 93260 }, { "epoch": 112.50935425467712, "grad_norm": 4.487138271331787, "learning_rate": 1.999775225321091e-05, "loss": 0.0422, "step": 93270 }, { "epoch": 112.52142426071212, "grad_norm": 4.253719329833984, "learning_rate": 1.9997752011956116e-05, "loss": 0.0398, "step": 93280 }, { "epoch": 112.53349426674713, "grad_norm": 4.254733562469482, "learning_rate": 1.9997751770701322e-05, "loss": 0.0412, "step": 93290 }, { "epoch": 112.54556427278213, "grad_norm": 3.53439998626709, "learning_rate": 1.999775152944653e-05, "loss": 0.0411, "step": 93300 }, { "epoch": 112.55763427881713, "grad_norm": 4.202939510345459, "learning_rate": 1.9997751288191735e-05, "loss": 0.0422, "step": 93310 }, { "epoch": 112.56970428485214, "grad_norm": 4.406848430633545, "learning_rate": 1.999775104693694e-05, "loss": 0.0432, "step": 93320 }, { "epoch": 112.58177429088714, "grad_norm": 4.025633811950684, "learning_rate": 1.9997750805682147e-05, "loss": 0.0411, 
"step": 93330 }, { "epoch": 112.59384429692214, "grad_norm": 4.460520267486572, "learning_rate": 1.9997750564427354e-05, "loss": 0.0407, "step": 93340 }, { "epoch": 112.60591430295715, "grad_norm": 4.019546031951904, "learning_rate": 1.999775032317256e-05, "loss": 0.0407, "step": 93350 }, { "epoch": 112.61798430899215, "grad_norm": 4.984706878662109, "learning_rate": 1.9997750081917766e-05, "loss": 0.0438, "step": 93360 }, { "epoch": 112.63005431502715, "grad_norm": 4.250373840332031, "learning_rate": 1.9997749840662972e-05, "loss": 0.0434, "step": 93370 }, { "epoch": 112.64212432106216, "grad_norm": 4.997101306915283, "learning_rate": 1.999774959940818e-05, "loss": 0.0445, "step": 93380 }, { "epoch": 112.65419432709716, "grad_norm": 4.432432651519775, "learning_rate": 1.9997749358153385e-05, "loss": 0.0428, "step": 93390 }, { "epoch": 112.66626433313216, "grad_norm": 4.243999481201172, "learning_rate": 1.999774911689859e-05, "loss": 0.0423, "step": 93400 }, { "epoch": 112.67833433916717, "grad_norm": 4.165016174316406, "learning_rate": 1.9997748875643797e-05, "loss": 0.0445, "step": 93410 }, { "epoch": 112.69040434520217, "grad_norm": 4.249668121337891, "learning_rate": 1.9997748634389003e-05, "loss": 0.0439, "step": 93420 }, { "epoch": 112.70247435123717, "grad_norm": 4.30850076675415, "learning_rate": 1.999774839313421e-05, "loss": 0.0431, "step": 93430 }, { "epoch": 112.71454435727217, "grad_norm": 4.391417980194092, "learning_rate": 1.9997748151879416e-05, "loss": 0.0431, "step": 93440 }, { "epoch": 112.72661436330718, "grad_norm": 4.821363925933838, "learning_rate": 1.9997747910624622e-05, "loss": 0.0444, "step": 93450 }, { "epoch": 112.73868436934218, "grad_norm": 3.9302217960357666, "learning_rate": 1.9997747669369828e-05, "loss": 0.0445, "step": 93460 }, { "epoch": 112.75075437537718, "grad_norm": 4.184377193450928, "learning_rate": 1.9997747428115034e-05, "loss": 0.0432, "step": 93470 }, { "epoch": 112.76282438141219, "grad_norm": 4.600458145141602, 
"learning_rate": 1.999774718686024e-05, "loss": 0.0436, "step": 93480 }, { "epoch": 112.77489438744719, "grad_norm": 4.238802433013916, "learning_rate": 1.9997746945605447e-05, "loss": 0.0442, "step": 93490 }, { "epoch": 112.7869643934822, "grad_norm": 4.539072036743164, "learning_rate": 1.999774670435065e-05, "loss": 0.0433, "step": 93500 }, { "epoch": 112.7869643934822, "eval_loss": 13.013444900512695, "eval_runtime": 8.1371, "eval_samples_per_second": 85.657, "eval_steps_per_second": 10.815, "step": 93500 }, { "epoch": 112.7990343995172, "grad_norm": 4.089810848236084, "learning_rate": 1.9997746463095856e-05, "loss": 0.0411, "step": 93510 }, { "epoch": 112.8111044055522, "grad_norm": 3.829099655151367, "learning_rate": 1.9997746221841062e-05, "loss": 0.0423, "step": 93520 }, { "epoch": 112.8231744115872, "grad_norm": 4.475998878479004, "learning_rate": 1.9997745980586268e-05, "loss": 0.0441, "step": 93530 }, { "epoch": 112.8352444176222, "grad_norm": 4.282635688781738, "learning_rate": 1.9997745739331474e-05, "loss": 0.0447, "step": 93540 }, { "epoch": 112.84731442365721, "grad_norm": 4.054980754852295, "learning_rate": 1.999774549807668e-05, "loss": 0.042, "step": 93550 }, { "epoch": 112.85938442969221, "grad_norm": 4.045647144317627, "learning_rate": 1.9997745256821887e-05, "loss": 0.042, "step": 93560 }, { "epoch": 112.87145443572722, "grad_norm": 4.097137451171875, "learning_rate": 1.9997745015567093e-05, "loss": 0.0452, "step": 93570 }, { "epoch": 112.88352444176222, "grad_norm": 4.207859039306641, "learning_rate": 1.99977447743123e-05, "loss": 0.0441, "step": 93580 }, { "epoch": 112.89559444779722, "grad_norm": 5.101869106292725, "learning_rate": 1.9997744533057506e-05, "loss": 0.0441, "step": 93590 }, { "epoch": 112.90766445383223, "grad_norm": 4.8290181159973145, "learning_rate": 1.9997744291802712e-05, "loss": 0.0454, "step": 93600 }, { "epoch": 112.91973445986723, "grad_norm": 4.572831153869629, "learning_rate": 1.9997744050547918e-05, "loss": 0.0453, 
"step": 93610 }, { "epoch": 112.93180446590223, "grad_norm": 4.9763288497924805, "learning_rate": 1.9997743809293124e-05, "loss": 0.0433, "step": 93620 }, { "epoch": 112.94387447193724, "grad_norm": 4.352704048156738, "learning_rate": 1.999774356803833e-05, "loss": 0.0468, "step": 93630 }, { "epoch": 112.95594447797224, "grad_norm": 4.802908897399902, "learning_rate": 1.9997743326783537e-05, "loss": 0.0449, "step": 93640 }, { "epoch": 112.96801448400724, "grad_norm": 4.461472988128662, "learning_rate": 1.9997743085528743e-05, "loss": 0.0469, "step": 93650 }, { "epoch": 112.98008449004224, "grad_norm": 4.452162265777588, "learning_rate": 1.999774284427395e-05, "loss": 0.045, "step": 93660 }, { "epoch": 112.99215449607725, "grad_norm": 4.581039905548096, "learning_rate": 1.9997742603019155e-05, "loss": 0.0464, "step": 93670 }, { "epoch": 113.0036210018105, "grad_norm": 3.57177472114563, "learning_rate": 1.999774236176436e-05, "loss": 0.0416, "step": 93680 }, { "epoch": 113.0156910078455, "grad_norm": 3.599946975708008, "learning_rate": 1.9997742120509568e-05, "loss": 0.0287, "step": 93690 }, { "epoch": 113.02776101388051, "grad_norm": 3.4498133659362793, "learning_rate": 1.9997741879254774e-05, "loss": 0.0306, "step": 93700 }, { "epoch": 113.03983101991551, "grad_norm": 3.9356255531311035, "learning_rate": 1.999774163799998e-05, "loss": 0.0327, "step": 93710 }, { "epoch": 113.05190102595051, "grad_norm": 3.921713352203369, "learning_rate": 1.9997741396745186e-05, "loss": 0.0326, "step": 93720 }, { "epoch": 113.06397103198552, "grad_norm": 3.3483874797821045, "learning_rate": 1.9997741155490393e-05, "loss": 0.0347, "step": 93730 }, { "epoch": 113.07604103802052, "grad_norm": 3.7579967975616455, "learning_rate": 1.99977409142356e-05, "loss": 0.0356, "step": 93740 }, { "epoch": 113.08811104405552, "grad_norm": 3.938999652862549, "learning_rate": 1.99977406729808e-05, "loss": 0.0329, "step": 93750 }, { "epoch": 113.10018105009053, "grad_norm": 3.6803338527679443, 
"learning_rate": 1.9997740431726008e-05, "loss": 0.0339, "step": 93760 }, { "epoch": 113.11225105612553, "grad_norm": 3.4106955528259277, "learning_rate": 1.9997740190471214e-05, "loss": 0.0337, "step": 93770 }, { "epoch": 113.12432106216053, "grad_norm": 3.9470303058624268, "learning_rate": 1.999773994921642e-05, "loss": 0.0354, "step": 93780 }, { "epoch": 113.13639106819554, "grad_norm": 4.133861064910889, "learning_rate": 1.9997739707961626e-05, "loss": 0.0375, "step": 93790 }, { "epoch": 113.14846107423054, "grad_norm": 4.194736003875732, "learning_rate": 1.9997739466706833e-05, "loss": 0.0362, "step": 93800 }, { "epoch": 113.16053108026554, "grad_norm": 4.354088306427002, "learning_rate": 1.999773922545204e-05, "loss": 0.0368, "step": 93810 }, { "epoch": 113.17260108630055, "grad_norm": 4.072744846343994, "learning_rate": 1.9997738984197245e-05, "loss": 0.0389, "step": 93820 }, { "epoch": 113.18467109233555, "grad_norm": 4.1348490715026855, "learning_rate": 1.9997738742942455e-05, "loss": 0.0368, "step": 93830 }, { "epoch": 113.19674109837055, "grad_norm": 4.284971237182617, "learning_rate": 1.999773850168766e-05, "loss": 0.037, "step": 93840 }, { "epoch": 113.20881110440556, "grad_norm": 3.8503987789154053, "learning_rate": 1.9997738260432867e-05, "loss": 0.0369, "step": 93850 }, { "epoch": 113.22088111044056, "grad_norm": 3.6573140621185303, "learning_rate": 1.9997738019178073e-05, "loss": 0.0357, "step": 93860 }, { "epoch": 113.23295111647556, "grad_norm": 4.121221542358398, "learning_rate": 1.999773777792328e-05, "loss": 0.0384, "step": 93870 }, { "epoch": 113.24502112251056, "grad_norm": 3.7629170417785645, "learning_rate": 1.9997737536668486e-05, "loss": 0.0363, "step": 93880 }, { "epoch": 113.25709112854557, "grad_norm": 4.365046977996826, "learning_rate": 1.9997737295413692e-05, "loss": 0.0381, "step": 93890 }, { "epoch": 113.26916113458057, "grad_norm": 3.8894526958465576, "learning_rate": 1.9997737054158898e-05, "loss": 0.0377, "step": 93900 }, { 
"epoch": 113.28123114061557, "grad_norm": 4.361610412597656, "learning_rate": 1.99977368129041e-05, "loss": 0.0382, "step": 93910 }, { "epoch": 113.29330114665058, "grad_norm": 4.048924446105957, "learning_rate": 1.9997736571649307e-05, "loss": 0.0389, "step": 93920 }, { "epoch": 113.30537115268558, "grad_norm": 3.8373916149139404, "learning_rate": 1.9997736330394513e-05, "loss": 0.038, "step": 93930 }, { "epoch": 113.31744115872058, "grad_norm": 4.460630416870117, "learning_rate": 1.999773608913972e-05, "loss": 0.0398, "step": 93940 }, { "epoch": 113.32951116475559, "grad_norm": 4.071352481842041, "learning_rate": 1.9997735847884926e-05, "loss": 0.0397, "step": 93950 }, { "epoch": 113.34158117079059, "grad_norm": 4.348461627960205, "learning_rate": 1.9997735606630132e-05, "loss": 0.0388, "step": 93960 }, { "epoch": 113.3536511768256, "grad_norm": 4.013121604919434, "learning_rate": 1.999773536537534e-05, "loss": 0.0414, "step": 93970 }, { "epoch": 113.3657211828606, "grad_norm": 4.013495445251465, "learning_rate": 1.9997735124120545e-05, "loss": 0.0384, "step": 93980 }, { "epoch": 113.3777911888956, "grad_norm": 4.054185390472412, "learning_rate": 1.999773488286575e-05, "loss": 0.0404, "step": 93990 }, { "epoch": 113.3898611949306, "grad_norm": 4.398401260375977, "learning_rate": 1.9997734641610957e-05, "loss": 0.0411, "step": 94000 }, { "epoch": 113.3898611949306, "eval_loss": 13.02129077911377, "eval_runtime": 8.137, "eval_samples_per_second": 85.658, "eval_steps_per_second": 10.815, "step": 94000 }, { "epoch": 113.4019312009656, "grad_norm": 4.289637565612793, "learning_rate": 1.9997734400356163e-05, "loss": 0.0405, "step": 94010 }, { "epoch": 113.41400120700061, "grad_norm": 3.807788848876953, "learning_rate": 1.999773415910137e-05, "loss": 0.0396, "step": 94020 }, { "epoch": 113.42607121303561, "grad_norm": 3.789163112640381, "learning_rate": 1.9997733917846576e-05, "loss": 0.0383, "step": 94030 }, { "epoch": 113.43814121907062, "grad_norm": 
3.778817892074585, "learning_rate": 1.9997733676591782e-05, "loss": 0.0397, "step": 94040 }, { "epoch": 113.45021122510562, "grad_norm": 4.191930770874023, "learning_rate": 1.9997733435336988e-05, "loss": 0.0407, "step": 94050 }, { "epoch": 113.46228123114062, "grad_norm": 4.291943073272705, "learning_rate": 1.9997733194082194e-05, "loss": 0.0408, "step": 94060 }, { "epoch": 113.47435123717563, "grad_norm": 4.395610332489014, "learning_rate": 1.99977329528274e-05, "loss": 0.0417, "step": 94070 }, { "epoch": 113.48642124321063, "grad_norm": 3.8800432682037354, "learning_rate": 1.9997732711572607e-05, "loss": 0.041, "step": 94080 }, { "epoch": 113.49849124924563, "grad_norm": 4.020613670349121, "learning_rate": 1.9997732470317813e-05, "loss": 0.0405, "step": 94090 }, { "epoch": 113.51056125528062, "grad_norm": 4.558541297912598, "learning_rate": 1.999773222906302e-05, "loss": 0.0401, "step": 94100 }, { "epoch": 113.52263126131562, "grad_norm": 4.590328216552734, "learning_rate": 1.9997731987808225e-05, "loss": 0.0393, "step": 94110 }, { "epoch": 113.53470126735063, "grad_norm": 3.7851016521453857, "learning_rate": 1.999773174655343e-05, "loss": 0.0392, "step": 94120 }, { "epoch": 113.54677127338563, "grad_norm": 4.486434459686279, "learning_rate": 1.9997731505298638e-05, "loss": 0.042, "step": 94130 }, { "epoch": 113.55884127942063, "grad_norm": 4.196164131164551, "learning_rate": 1.9997731264043844e-05, "loss": 0.0405, "step": 94140 }, { "epoch": 113.57091128545564, "grad_norm": 4.339909076690674, "learning_rate": 1.999773102278905e-05, "loss": 0.0405, "step": 94150 }, { "epoch": 113.58298129149064, "grad_norm": 4.2238616943359375, "learning_rate": 1.9997730781534253e-05, "loss": 0.0391, "step": 94160 }, { "epoch": 113.59505129752564, "grad_norm": 4.256354331970215, "learning_rate": 1.999773054027946e-05, "loss": 0.0399, "step": 94170 }, { "epoch": 113.60712130356065, "grad_norm": 4.142507076263428, "learning_rate": 1.9997730299024665e-05, "loss": 0.041, "step": 
94180 }, { "epoch": 113.61919130959565, "grad_norm": 4.198040962219238, "learning_rate": 1.9997730057769872e-05, "loss": 0.0413, "step": 94190 }, { "epoch": 113.63126131563065, "grad_norm": 4.677823066711426, "learning_rate": 1.9997729816515078e-05, "loss": 0.0408, "step": 94200 }, { "epoch": 113.64333132166566, "grad_norm": 4.149754047393799, "learning_rate": 1.9997729575260284e-05, "loss": 0.0402, "step": 94210 }, { "epoch": 113.65540132770066, "grad_norm": 4.516277313232422, "learning_rate": 1.999772933400549e-05, "loss": 0.0425, "step": 94220 }, { "epoch": 113.66747133373566, "grad_norm": 4.028141975402832, "learning_rate": 1.9997729092750697e-05, "loss": 0.0415, "step": 94230 }, { "epoch": 113.67954133977067, "grad_norm": 4.3509626388549805, "learning_rate": 1.9997728851495903e-05, "loss": 0.0418, "step": 94240 }, { "epoch": 113.69161134580567, "grad_norm": 4.054190635681152, "learning_rate": 1.999772861024111e-05, "loss": 0.0416, "step": 94250 }, { "epoch": 113.70368135184067, "grad_norm": 4.55413818359375, "learning_rate": 1.9997728368986315e-05, "loss": 0.043, "step": 94260 }, { "epoch": 113.71575135787567, "grad_norm": 4.709308624267578, "learning_rate": 1.999772812773152e-05, "loss": 0.0426, "step": 94270 }, { "epoch": 113.72782136391068, "grad_norm": 4.450843811035156, "learning_rate": 1.9997727886476728e-05, "loss": 0.042, "step": 94280 }, { "epoch": 113.73989136994568, "grad_norm": 4.487822532653809, "learning_rate": 1.9997727645221934e-05, "loss": 0.044, "step": 94290 }, { "epoch": 113.75196137598068, "grad_norm": 4.481651782989502, "learning_rate": 1.999772740396714e-05, "loss": 0.0427, "step": 94300 }, { "epoch": 113.76403138201569, "grad_norm": 4.639625072479248, "learning_rate": 1.9997727162712346e-05, "loss": 0.0438, "step": 94310 }, { "epoch": 113.77610138805069, "grad_norm": 4.437882423400879, "learning_rate": 1.9997726921457552e-05, "loss": 0.041, "step": 94320 }, { "epoch": 113.7881713940857, "grad_norm": 4.350271701812744, "learning_rate": 
1.999772668020276e-05, "loss": 0.0445, "step": 94330 }, { "epoch": 113.8002414001207, "grad_norm": 4.55293083190918, "learning_rate": 1.9997726438947965e-05, "loss": 0.0435, "step": 94340 }, { "epoch": 113.8123114061557, "grad_norm": 4.877509593963623, "learning_rate": 1.999772619769317e-05, "loss": 0.045, "step": 94350 }, { "epoch": 113.8243814121907, "grad_norm": 4.495406627655029, "learning_rate": 1.9997725956438377e-05, "loss": 0.0434, "step": 94360 }, { "epoch": 113.8364514182257, "grad_norm": 4.321389198303223, "learning_rate": 1.9997725715183584e-05, "loss": 0.0436, "step": 94370 }, { "epoch": 113.84852142426071, "grad_norm": 4.2551374435424805, "learning_rate": 1.999772547392879e-05, "loss": 0.0457, "step": 94380 }, { "epoch": 113.86059143029571, "grad_norm": 4.300424575805664, "learning_rate": 1.9997725232673996e-05, "loss": 0.0443, "step": 94390 }, { "epoch": 113.87266143633072, "grad_norm": 4.495434284210205, "learning_rate": 1.9997724991419202e-05, "loss": 0.0455, "step": 94400 }, { "epoch": 113.88473144236572, "grad_norm": 4.108336925506592, "learning_rate": 1.999772475016441e-05, "loss": 0.043, "step": 94410 }, { "epoch": 113.89680144840072, "grad_norm": 5.060318470001221, "learning_rate": 1.9997724508909615e-05, "loss": 0.045, "step": 94420 }, { "epoch": 113.90887145443573, "grad_norm": 4.316554069519043, "learning_rate": 1.999772426765482e-05, "loss": 0.0437, "step": 94430 }, { "epoch": 113.92094146047073, "grad_norm": 4.163443565368652, "learning_rate": 1.9997724026400027e-05, "loss": 0.0456, "step": 94440 }, { "epoch": 113.93301146650573, "grad_norm": 4.383227348327637, "learning_rate": 1.9997723785145233e-05, "loss": 0.0438, "step": 94450 }, { "epoch": 113.94508147254074, "grad_norm": 4.651416301727295, "learning_rate": 1.999772354389044e-05, "loss": 0.045, "step": 94460 }, { "epoch": 113.95715147857574, "grad_norm": 4.659160137176514, "learning_rate": 1.9997723302635646e-05, "loss": 0.0446, "step": 94470 }, { "epoch": 113.96922148461074, 
"grad_norm": 4.23032808303833, "learning_rate": 1.9997723061380852e-05, "loss": 0.0439, "step": 94480 }, { "epoch": 113.98129149064575, "grad_norm": 4.647943496704102, "learning_rate": 1.9997722820126058e-05, "loss": 0.0445, "step": 94490 }, { "epoch": 113.99336149668075, "grad_norm": 4.351492404937744, "learning_rate": 1.9997722578871264e-05, "loss": 0.0461, "step": 94500 }, { "epoch": 113.99336149668075, "eval_loss": 13.050703048706055, "eval_runtime": 8.1385, "eval_samples_per_second": 85.642, "eval_steps_per_second": 10.813, "step": 94500 }, { "epoch": 114.004828002414, "grad_norm": 3.3192977905273438, "learning_rate": 1.999772233761647e-05, "loss": 0.0386, "step": 94510 }, { "epoch": 114.016898008449, "grad_norm": 3.419288158416748, "learning_rate": 1.9997722096361677e-05, "loss": 0.0322, "step": 94520 }, { "epoch": 114.02896801448401, "grad_norm": 3.6216378211975098, "learning_rate": 1.9997721855106883e-05, "loss": 0.0333, "step": 94530 }, { "epoch": 114.04103802051901, "grad_norm": 3.9118428230285645, "learning_rate": 1.999772161385209e-05, "loss": 0.0341, "step": 94540 }, { "epoch": 114.05310802655401, "grad_norm": 3.924020528793335, "learning_rate": 1.9997721372597295e-05, "loss": 0.0333, "step": 94550 }, { "epoch": 114.06517803258902, "grad_norm": 3.507814407348633, "learning_rate": 1.99977211313425e-05, "loss": 0.0323, "step": 94560 }, { "epoch": 114.07724803862402, "grad_norm": 3.7387373447418213, "learning_rate": 1.9997720890087708e-05, "loss": 0.0335, "step": 94570 }, { "epoch": 114.08931804465902, "grad_norm": 3.873440980911255, "learning_rate": 1.999772064883291e-05, "loss": 0.0343, "step": 94580 }, { "epoch": 114.10138805069403, "grad_norm": 3.5705020427703857, "learning_rate": 1.9997720407578117e-05, "loss": 0.0347, "step": 94590 }, { "epoch": 114.11345805672903, "grad_norm": 3.58559513092041, "learning_rate": 1.9997720166323323e-05, "loss": 0.0343, "step": 94600 }, { "epoch": 114.12552806276403, "grad_norm": 3.9945719242095947, "learning_rate": 
1.999771992506853e-05, "loss": 0.0368, "step": 94610 }, { "epoch": 114.13759806879904, "grad_norm": 4.201625347137451, "learning_rate": 1.9997719683813736e-05, "loss": 0.0361, "step": 94620 }, { "epoch": 114.14966807483404, "grad_norm": 3.790738344192505, "learning_rate": 1.9997719442558942e-05, "loss": 0.0367, "step": 94630 }, { "epoch": 114.16173808086904, "grad_norm": 3.74751877784729, "learning_rate": 1.9997719201304148e-05, "loss": 0.0364, "step": 94640 }, { "epoch": 114.17380808690405, "grad_norm": 3.754560947418213, "learning_rate": 1.9997718960049354e-05, "loss": 0.0363, "step": 94650 }, { "epoch": 114.18587809293905, "grad_norm": 3.695794105529785, "learning_rate": 1.999771871879456e-05, "loss": 0.036, "step": 94660 }, { "epoch": 114.19794809897405, "grad_norm": 3.9902052879333496, "learning_rate": 1.9997718477539767e-05, "loss": 0.037, "step": 94670 }, { "epoch": 114.21001810500906, "grad_norm": 3.956972599029541, "learning_rate": 1.9997718236284973e-05, "loss": 0.038, "step": 94680 }, { "epoch": 114.22208811104406, "grad_norm": 3.3505611419677734, "learning_rate": 1.999771799503018e-05, "loss": 0.0369, "step": 94690 }, { "epoch": 114.23415811707906, "grad_norm": 4.140176296234131, "learning_rate": 1.9997717753775385e-05, "loss": 0.037, "step": 94700 }, { "epoch": 114.24622812311407, "grad_norm": 4.145816326141357, "learning_rate": 1.999771751252059e-05, "loss": 0.0351, "step": 94710 }, { "epoch": 114.25829812914907, "grad_norm": 3.934222936630249, "learning_rate": 1.9997717271265798e-05, "loss": 0.0368, "step": 94720 }, { "epoch": 114.27036813518407, "grad_norm": 4.022954940795898, "learning_rate": 1.9997717030011004e-05, "loss": 0.0407, "step": 94730 }, { "epoch": 114.28243814121907, "grad_norm": 4.313994884490967, "learning_rate": 1.999771678875621e-05, "loss": 0.0381, "step": 94740 }, { "epoch": 114.29450814725408, "grad_norm": 3.8412156105041504, "learning_rate": 1.9997716547501416e-05, "loss": 0.0351, "step": 94750 }, { "epoch": 114.30657815328908, 
"grad_norm": 4.162898063659668, "learning_rate": 1.9997716306246623e-05, "loss": 0.0387, "step": 94760 }, { "epoch": 114.31864815932408, "grad_norm": 3.87606143951416, "learning_rate": 1.999771606499183e-05, "loss": 0.0407, "step": 94770 }, { "epoch": 114.33071816535909, "grad_norm": 4.122989654541016, "learning_rate": 1.9997715823737035e-05, "loss": 0.0392, "step": 94780 }, { "epoch": 114.34278817139409, "grad_norm": 4.244252681732178, "learning_rate": 1.999771558248224e-05, "loss": 0.04, "step": 94790 }, { "epoch": 114.3548581774291, "grad_norm": 4.267334461212158, "learning_rate": 1.9997715341227447e-05, "loss": 0.0381, "step": 94800 }, { "epoch": 114.3669281834641, "grad_norm": 4.229121208190918, "learning_rate": 1.9997715099972654e-05, "loss": 0.0404, "step": 94810 }, { "epoch": 114.3789981894991, "grad_norm": 3.586549758911133, "learning_rate": 1.999771485871786e-05, "loss": 0.0383, "step": 94820 }, { "epoch": 114.3910681955341, "grad_norm": 4.3601226806640625, "learning_rate": 1.9997714617463063e-05, "loss": 0.0384, "step": 94830 }, { "epoch": 114.4031382015691, "grad_norm": 3.907285213470459, "learning_rate": 1.999771437620827e-05, "loss": 0.0373, "step": 94840 }, { "epoch": 114.41520820760411, "grad_norm": 3.9911720752716064, "learning_rate": 1.9997714134953475e-05, "loss": 0.0416, "step": 94850 }, { "epoch": 114.42727821363911, "grad_norm": 4.224750518798828, "learning_rate": 1.999771389369868e-05, "loss": 0.0413, "step": 94860 }, { "epoch": 114.43934821967412, "grad_norm": 3.8811728954315186, "learning_rate": 1.9997713652443888e-05, "loss": 0.0379, "step": 94870 }, { "epoch": 114.45141822570912, "grad_norm": 3.617539167404175, "learning_rate": 1.9997713411189094e-05, "loss": 0.0397, "step": 94880 }, { "epoch": 114.46348823174412, "grad_norm": 4.000955581665039, "learning_rate": 1.99977131699343e-05, "loss": 0.0416, "step": 94890 }, { "epoch": 114.47555823777913, "grad_norm": 4.12085485458374, "learning_rate": 1.9997712928679506e-05, "loss": 0.0414, 
"step": 94900 }, { "epoch": 114.48762824381413, "grad_norm": 3.964261770248413, "learning_rate": 1.9997712687424716e-05, "loss": 0.039, "step": 94910 }, { "epoch": 114.49969824984913, "grad_norm": 3.8601744174957275, "learning_rate": 1.9997712446169922e-05, "loss": 0.0417, "step": 94920 }, { "epoch": 114.51176825588412, "grad_norm": 4.431443691253662, "learning_rate": 1.9997712204915128e-05, "loss": 0.0419, "step": 94930 }, { "epoch": 114.52383826191912, "grad_norm": 3.778369188308716, "learning_rate": 1.9997711963660334e-05, "loss": 0.0401, "step": 94940 }, { "epoch": 114.53590826795413, "grad_norm": 4.067944526672363, "learning_rate": 1.999771172240554e-05, "loss": 0.0395, "step": 94950 }, { "epoch": 114.54797827398913, "grad_norm": 4.366089820861816, "learning_rate": 1.9997711481150747e-05, "loss": 0.0399, "step": 94960 }, { "epoch": 114.56004828002413, "grad_norm": 4.691936016082764, "learning_rate": 1.9997711239895953e-05, "loss": 0.0403, "step": 94970 }, { "epoch": 114.57211828605914, "grad_norm": 4.14574670791626, "learning_rate": 1.999771099864116e-05, "loss": 0.0425, "step": 94980 }, { "epoch": 114.58418829209414, "grad_norm": 4.1652398109436035, "learning_rate": 1.9997710757386362e-05, "loss": 0.0405, "step": 94990 }, { "epoch": 114.59625829812914, "grad_norm": 4.245462417602539, "learning_rate": 1.999771051613157e-05, "loss": 0.0435, "step": 95000 }, { "epoch": 114.59625829812914, "eval_loss": 13.065452575683594, "eval_runtime": 8.1253, "eval_samples_per_second": 85.782, "eval_steps_per_second": 10.83, "step": 95000 }, { "epoch": 114.60832830416415, "grad_norm": 3.957965850830078, "learning_rate": 1.9997710274876775e-05, "loss": 0.042, "step": 95010 }, { "epoch": 114.62039831019915, "grad_norm": 4.298033237457275, "learning_rate": 1.999771003362198e-05, "loss": 0.0439, "step": 95020 }, { "epoch": 114.63246831623415, "grad_norm": 4.039405345916748, "learning_rate": 1.9997709792367187e-05, "loss": 0.0433, "step": 95030 }, { "epoch": 114.64453832226916, 
"grad_norm": 4.226551055908203, "learning_rate": 1.9997709551112393e-05, "loss": 0.043, "step": 95040 }, { "epoch": 114.65660832830416, "grad_norm": 4.0991106033325195, "learning_rate": 1.99977093098576e-05, "loss": 0.0427, "step": 95050 }, { "epoch": 114.66867833433916, "grad_norm": 4.739250183105469, "learning_rate": 1.9997709068602806e-05, "loss": 0.0429, "step": 95060 }, { "epoch": 114.68074834037417, "grad_norm": 4.05232572555542, "learning_rate": 1.9997708827348012e-05, "loss": 0.042, "step": 95070 }, { "epoch": 114.69281834640917, "grad_norm": 4.296613693237305, "learning_rate": 1.9997708586093218e-05, "loss": 0.0429, "step": 95080 }, { "epoch": 114.70488835244417, "grad_norm": 4.13738489151001, "learning_rate": 1.9997708344838424e-05, "loss": 0.0435, "step": 95090 }, { "epoch": 114.71695835847918, "grad_norm": 4.109089374542236, "learning_rate": 1.999770810358363e-05, "loss": 0.0408, "step": 95100 }, { "epoch": 114.72902836451418, "grad_norm": 4.355175971984863, "learning_rate": 1.9997707862328837e-05, "loss": 0.0407, "step": 95110 }, { "epoch": 114.74109837054918, "grad_norm": 4.028783798217773, "learning_rate": 1.9997707621074043e-05, "loss": 0.0427, "step": 95120 }, { "epoch": 114.75316837658418, "grad_norm": 4.397209167480469, "learning_rate": 1.999770737981925e-05, "loss": 0.0433, "step": 95130 }, { "epoch": 114.76523838261919, "grad_norm": 3.958521842956543, "learning_rate": 1.9997707138564455e-05, "loss": 0.0427, "step": 95140 }, { "epoch": 114.77730838865419, "grad_norm": 4.246341705322266, "learning_rate": 1.999770689730966e-05, "loss": 0.0424, "step": 95150 }, { "epoch": 114.7893783946892, "grad_norm": 4.419798851013184, "learning_rate": 1.9997706656054868e-05, "loss": 0.0441, "step": 95160 }, { "epoch": 114.8014484007242, "grad_norm": 3.6137735843658447, "learning_rate": 1.9997706414800074e-05, "loss": 0.0439, "step": 95170 }, { "epoch": 114.8135184067592, "grad_norm": 4.678729057312012, "learning_rate": 1.999770617354528e-05, "loss": 0.0415, 
"step": 95180 }, { "epoch": 114.8255884127942, "grad_norm": 3.9879283905029297, "learning_rate": 1.9997705932290486e-05, "loss": 0.0407, "step": 95190 }, { "epoch": 114.83765841882921, "grad_norm": 4.734506130218506, "learning_rate": 1.9997705691035693e-05, "loss": 0.0433, "step": 95200 }, { "epoch": 114.84972842486421, "grad_norm": 4.557642936706543, "learning_rate": 1.99977054497809e-05, "loss": 0.044, "step": 95210 }, { "epoch": 114.86179843089921, "grad_norm": 4.8047871589660645, "learning_rate": 1.9997705208526105e-05, "loss": 0.0446, "step": 95220 }, { "epoch": 114.87386843693422, "grad_norm": 3.9376344680786133, "learning_rate": 1.999770496727131e-05, "loss": 0.043, "step": 95230 }, { "epoch": 114.88593844296922, "grad_norm": 3.5702061653137207, "learning_rate": 1.9997704726016514e-05, "loss": 0.0454, "step": 95240 }, { "epoch": 114.89800844900422, "grad_norm": 4.682522773742676, "learning_rate": 1.999770448476172e-05, "loss": 0.0429, "step": 95250 }, { "epoch": 114.91007845503923, "grad_norm": 4.62144660949707, "learning_rate": 1.9997704243506927e-05, "loss": 0.0445, "step": 95260 }, { "epoch": 114.92214846107423, "grad_norm": 4.008183002471924, "learning_rate": 1.9997704002252133e-05, "loss": 0.0449, "step": 95270 }, { "epoch": 114.93421846710923, "grad_norm": 4.135227203369141, "learning_rate": 1.999770376099734e-05, "loss": 0.0427, "step": 95280 }, { "epoch": 114.94628847314424, "grad_norm": 3.8319497108459473, "learning_rate": 1.9997703519742545e-05, "loss": 0.0445, "step": 95290 }, { "epoch": 114.95835847917924, "grad_norm": 4.639453887939453, "learning_rate": 1.999770327848775e-05, "loss": 0.0453, "step": 95300 }, { "epoch": 114.97042848521424, "grad_norm": 4.794231414794922, "learning_rate": 1.9997703037232958e-05, "loss": 0.0465, "step": 95310 }, { "epoch": 114.98249849124925, "grad_norm": 4.1448469161987305, "learning_rate": 1.9997702795978164e-05, "loss": 0.0467, "step": 95320 }, { "epoch": 114.99456849728425, "grad_norm": 4.310220241546631, 
"learning_rate": 1.999770255472337e-05, "loss": 0.0451, "step": 95330 }, { "epoch": 115.0060350030175, "grad_norm": 3.443922281265259, "learning_rate": 1.9997702313468576e-05, "loss": 0.0377, "step": 95340 }, { "epoch": 115.0181050090525, "grad_norm": 3.531824827194214, "learning_rate": 1.9997702072213783e-05, "loss": 0.0294, "step": 95350 }, { "epoch": 115.03017501508751, "grad_norm": 3.182236433029175, "learning_rate": 1.999770183095899e-05, "loss": 0.0305, "step": 95360 }, { "epoch": 115.04224502112251, "grad_norm": 3.4352142810821533, "learning_rate": 1.9997701589704195e-05, "loss": 0.0337, "step": 95370 }, { "epoch": 115.05431502715751, "grad_norm": 3.8350937366485596, "learning_rate": 1.99977013484494e-05, "loss": 0.0326, "step": 95380 }, { "epoch": 115.06638503319252, "grad_norm": 4.097568988800049, "learning_rate": 1.9997701107194607e-05, "loss": 0.035, "step": 95390 }, { "epoch": 115.07845503922752, "grad_norm": 3.831408977508545, "learning_rate": 1.9997700865939814e-05, "loss": 0.0326, "step": 95400 }, { "epoch": 115.09052504526252, "grad_norm": 3.6768572330474854, "learning_rate": 1.999770062468502e-05, "loss": 0.0326, "step": 95410 }, { "epoch": 115.10259505129753, "grad_norm": 3.6441636085510254, "learning_rate": 1.9997700383430226e-05, "loss": 0.0342, "step": 95420 }, { "epoch": 115.11466505733253, "grad_norm": 3.9117848873138428, "learning_rate": 1.9997700142175432e-05, "loss": 0.0362, "step": 95430 }, { "epoch": 115.12673506336753, "grad_norm": 3.7682089805603027, "learning_rate": 1.999769990092064e-05, "loss": 0.036, "step": 95440 }, { "epoch": 115.13880506940254, "grad_norm": 4.038073539733887, "learning_rate": 1.9997699659665845e-05, "loss": 0.0357, "step": 95450 }, { "epoch": 115.15087507543754, "grad_norm": 3.8207337856292725, "learning_rate": 1.999769941841105e-05, "loss": 0.0381, "step": 95460 }, { "epoch": 115.16294508147254, "grad_norm": 4.368856906890869, "learning_rate": 1.9997699177156257e-05, "loss": 0.0373, "step": 95470 }, { "epoch": 
115.17501508750755, "grad_norm": 4.326677322387695, "learning_rate": 1.9997698935901463e-05, "loss": 0.0357, "step": 95480 }, { "epoch": 115.18708509354255, "grad_norm": 3.829798936843872, "learning_rate": 1.999769869464667e-05, "loss": 0.0376, "step": 95490 }, { "epoch": 115.19915509957755, "grad_norm": 3.3970916271209717, "learning_rate": 1.9997698453391876e-05, "loss": 0.0348, "step": 95500 }, { "epoch": 115.19915509957755, "eval_loss": 13.055354118347168, "eval_runtime": 8.1321, "eval_samples_per_second": 85.71, "eval_steps_per_second": 10.821, "step": 95500 }, { "epoch": 115.21122510561256, "grad_norm": 3.509647846221924, "learning_rate": 1.9997698212137082e-05, "loss": 0.0359, "step": 95510 }, { "epoch": 115.22329511164756, "grad_norm": 4.4078593254089355, "learning_rate": 1.9997697970882288e-05, "loss": 0.0368, "step": 95520 }, { "epoch": 115.23536511768256, "grad_norm": 3.804614782333374, "learning_rate": 1.9997697729627494e-05, "loss": 0.0375, "step": 95530 }, { "epoch": 115.24743512371757, "grad_norm": 4.275757789611816, "learning_rate": 1.99976974883727e-05, "loss": 0.0364, "step": 95540 }, { "epoch": 115.25950512975257, "grad_norm": 4.164200305938721, "learning_rate": 1.9997697247117907e-05, "loss": 0.0387, "step": 95550 }, { "epoch": 115.27157513578757, "grad_norm": 4.1409196853637695, "learning_rate": 1.9997697005863113e-05, "loss": 0.0409, "step": 95560 }, { "epoch": 115.28364514182257, "grad_norm": 4.197382926940918, "learning_rate": 1.999769676460832e-05, "loss": 0.0393, "step": 95570 }, { "epoch": 115.29571514785758, "grad_norm": 3.7285869121551514, "learning_rate": 1.9997696523353525e-05, "loss": 0.04, "step": 95580 }, { "epoch": 115.30778515389258, "grad_norm": 4.130099296569824, "learning_rate": 1.999769628209873e-05, "loss": 0.0366, "step": 95590 }, { "epoch": 115.31985515992758, "grad_norm": 4.5288615226745605, "learning_rate": 1.9997696040843938e-05, "loss": 0.0384, "step": 95600 }, { "epoch": 115.33192516596259, "grad_norm": 
4.539351463317871, "learning_rate": 1.9997695799589144e-05, "loss": 0.0384, "step": 95610 }, { "epoch": 115.34399517199759, "grad_norm": 3.8870794773101807, "learning_rate": 1.999769555833435e-05, "loss": 0.0372, "step": 95620 }, { "epoch": 115.3560651780326, "grad_norm": 4.154291152954102, "learning_rate": 1.9997695317079557e-05, "loss": 0.0385, "step": 95630 }, { "epoch": 115.3681351840676, "grad_norm": 4.019582271575928, "learning_rate": 1.9997695075824763e-05, "loss": 0.0381, "step": 95640 }, { "epoch": 115.3802051901026, "grad_norm": 4.034758567810059, "learning_rate": 1.999769483456997e-05, "loss": 0.0403, "step": 95650 }, { "epoch": 115.3922751961376, "grad_norm": 4.75013542175293, "learning_rate": 1.9997694593315172e-05, "loss": 0.0396, "step": 95660 }, { "epoch": 115.4043452021726, "grad_norm": 4.418388843536377, "learning_rate": 1.9997694352060378e-05, "loss": 0.0406, "step": 95670 }, { "epoch": 115.41641520820761, "grad_norm": 3.9456846714019775, "learning_rate": 1.9997694110805584e-05, "loss": 0.0413, "step": 95680 }, { "epoch": 115.42848521424261, "grad_norm": 4.20583438873291, "learning_rate": 1.999769386955079e-05, "loss": 0.0383, "step": 95690 }, { "epoch": 115.44055522027762, "grad_norm": 4.396178722381592, "learning_rate": 1.9997693628295997e-05, "loss": 0.0394, "step": 95700 }, { "epoch": 115.45262522631262, "grad_norm": 3.814589738845825, "learning_rate": 1.9997693387041203e-05, "loss": 0.0407, "step": 95710 }, { "epoch": 115.46469523234762, "grad_norm": 4.227082252502441, "learning_rate": 1.999769314578641e-05, "loss": 0.0396, "step": 95720 }, { "epoch": 115.47676523838263, "grad_norm": 4.566699981689453, "learning_rate": 1.9997692904531615e-05, "loss": 0.0414, "step": 95730 }, { "epoch": 115.48883524441763, "grad_norm": 4.12945556640625, "learning_rate": 1.999769266327682e-05, "loss": 0.0419, "step": 95740 }, { "epoch": 115.50090525045263, "grad_norm": 3.9893717765808105, "learning_rate": 1.9997692422022028e-05, "loss": 0.0413, "step": 95750 
}, { "epoch": 115.51297525648762, "grad_norm": 3.774073839187622, "learning_rate": 1.9997692180767234e-05, "loss": 0.0411, "step": 95760 }, { "epoch": 115.52504526252262, "grad_norm": 3.99302077293396, "learning_rate": 1.999769193951244e-05, "loss": 0.0409, "step": 95770 }, { "epoch": 115.53711526855763, "grad_norm": 3.872709035873413, "learning_rate": 1.9997691698257646e-05, "loss": 0.0408, "step": 95780 }, { "epoch": 115.54918527459263, "grad_norm": 3.8719863891601562, "learning_rate": 1.9997691457002853e-05, "loss": 0.0402, "step": 95790 }, { "epoch": 115.56125528062763, "grad_norm": 4.366710186004639, "learning_rate": 1.999769121574806e-05, "loss": 0.0412, "step": 95800 }, { "epoch": 115.57332528666264, "grad_norm": 3.8696165084838867, "learning_rate": 1.9997690974493265e-05, "loss": 0.04, "step": 95810 }, { "epoch": 115.58539529269764, "grad_norm": 4.182074069976807, "learning_rate": 1.999769073323847e-05, "loss": 0.0414, "step": 95820 }, { "epoch": 115.59746529873264, "grad_norm": 4.491949081420898, "learning_rate": 1.9997690491983677e-05, "loss": 0.0406, "step": 95830 }, { "epoch": 115.60953530476765, "grad_norm": 4.597873687744141, "learning_rate": 1.9997690250728884e-05, "loss": 0.0423, "step": 95840 }, { "epoch": 115.62160531080265, "grad_norm": 4.760190010070801, "learning_rate": 1.999769000947409e-05, "loss": 0.0405, "step": 95850 }, { "epoch": 115.63367531683765, "grad_norm": 3.8727962970733643, "learning_rate": 1.9997689768219296e-05, "loss": 0.0415, "step": 95860 }, { "epoch": 115.64574532287266, "grad_norm": 4.355525016784668, "learning_rate": 1.9997689526964502e-05, "loss": 0.0413, "step": 95870 }, { "epoch": 115.65781532890766, "grad_norm": 4.5642170906066895, "learning_rate": 1.999768928570971e-05, "loss": 0.0411, "step": 95880 }, { "epoch": 115.66988533494266, "grad_norm": 4.170434951782227, "learning_rate": 1.9997689044454915e-05, "loss": 0.0409, "step": 95890 }, { "epoch": 115.68195534097767, "grad_norm": 4.22218132019043, "learning_rate": 
1.999768880320012e-05, "loss": 0.0414, "step": 95900 }, { "epoch": 115.69402534701267, "grad_norm": 4.356837749481201, "learning_rate": 1.9997688561945324e-05, "loss": 0.0397, "step": 95910 }, { "epoch": 115.70609535304767, "grad_norm": 4.22553014755249, "learning_rate": 1.999768832069053e-05, "loss": 0.0422, "step": 95920 }, { "epoch": 115.71816535908268, "grad_norm": 4.0283942222595215, "learning_rate": 1.9997688079435736e-05, "loss": 0.042, "step": 95930 }, { "epoch": 115.73023536511768, "grad_norm": 4.5349602699279785, "learning_rate": 1.9997687838180942e-05, "loss": 0.042, "step": 95940 }, { "epoch": 115.74230537115268, "grad_norm": 4.003815174102783, "learning_rate": 1.999768759692615e-05, "loss": 0.0433, "step": 95950 }, { "epoch": 115.75437537718769, "grad_norm": 4.346705436706543, "learning_rate": 1.9997687355671355e-05, "loss": 0.0414, "step": 95960 }, { "epoch": 115.76644538322269, "grad_norm": 3.861203908920288, "learning_rate": 1.999768711441656e-05, "loss": 0.0403, "step": 95970 }, { "epoch": 115.77851538925769, "grad_norm": 4.453656196594238, "learning_rate": 1.9997686873161767e-05, "loss": 0.0405, "step": 95980 }, { "epoch": 115.7905853952927, "grad_norm": 4.509680271148682, "learning_rate": 1.9997686631906977e-05, "loss": 0.0435, "step": 95990 }, { "epoch": 115.8026554013277, "grad_norm": 4.344795227050781, "learning_rate": 1.9997686390652183e-05, "loss": 0.0422, "step": 96000 }, { "epoch": 115.8026554013277, "eval_loss": 13.066657066345215, "eval_runtime": 8.1208, "eval_samples_per_second": 85.829, "eval_steps_per_second": 10.836, "step": 96000 }, { "epoch": 115.8147254073627, "grad_norm": 4.074538230895996, "learning_rate": 1.999768614939739e-05, "loss": 0.0434, "step": 96010 }, { "epoch": 115.8267954133977, "grad_norm": 4.827625274658203, "learning_rate": 1.9997685908142596e-05, "loss": 0.0437, "step": 96020 }, { "epoch": 115.83886541943271, "grad_norm": 4.0060811042785645, "learning_rate": 1.9997685666887802e-05, "loss": 0.0403, "step": 96030 
}, { "epoch": 115.85093542546771, "grad_norm": 4.426583766937256, "learning_rate": 1.9997685425633008e-05, "loss": 0.0431, "step": 96040 }, { "epoch": 115.86300543150271, "grad_norm": 4.158529281616211, "learning_rate": 1.9997685184378214e-05, "loss": 0.041, "step": 96050 }, { "epoch": 115.87507543753772, "grad_norm": 4.213403701782227, "learning_rate": 1.999768494312342e-05, "loss": 0.0408, "step": 96060 }, { "epoch": 115.88714544357272, "grad_norm": 4.276681423187256, "learning_rate": 1.9997684701868623e-05, "loss": 0.046, "step": 96070 }, { "epoch": 115.89921544960772, "grad_norm": 4.214180946350098, "learning_rate": 1.999768446061383e-05, "loss": 0.0442, "step": 96080 }, { "epoch": 115.91128545564273, "grad_norm": 4.277773380279541, "learning_rate": 1.9997684219359036e-05, "loss": 0.0415, "step": 96090 }, { "epoch": 115.92335546167773, "grad_norm": 4.433655261993408, "learning_rate": 1.9997683978104242e-05, "loss": 0.0441, "step": 96100 }, { "epoch": 115.93542546771273, "grad_norm": 4.580681324005127, "learning_rate": 1.9997683736849448e-05, "loss": 0.043, "step": 96110 }, { "epoch": 115.94749547374774, "grad_norm": 3.993882179260254, "learning_rate": 1.9997683495594654e-05, "loss": 0.043, "step": 96120 }, { "epoch": 115.95956547978274, "grad_norm": 4.931636810302734, "learning_rate": 1.999768325433986e-05, "loss": 0.0435, "step": 96130 }, { "epoch": 115.97163548581774, "grad_norm": 4.348952293395996, "learning_rate": 1.9997683013085067e-05, "loss": 0.0434, "step": 96140 }, { "epoch": 115.98370549185275, "grad_norm": 4.362470626831055, "learning_rate": 1.9997682771830273e-05, "loss": 0.0431, "step": 96150 }, { "epoch": 115.99577549788775, "grad_norm": 3.9176197052001953, "learning_rate": 1.999768253057548e-05, "loss": 0.0443, "step": 96160 }, { "epoch": 116.007242003621, "grad_norm": 3.471954584121704, "learning_rate": 1.9997682289320685e-05, "loss": 0.0344, "step": 96170 }, { "epoch": 116.019312009656, "grad_norm": 3.6217243671417236, "learning_rate": 
1.999768204806589e-05, "loss": 0.0305, "step": 96180 }, { "epoch": 116.03138201569101, "grad_norm": 3.6876325607299805, "learning_rate": 1.9997681806811098e-05, "loss": 0.0342, "step": 96190 }, { "epoch": 116.04345202172601, "grad_norm": 3.7793490886688232, "learning_rate": 1.9997681565556304e-05, "loss": 0.0332, "step": 96200 }, { "epoch": 116.05552202776101, "grad_norm": 4.0865092277526855, "learning_rate": 1.999768132430151e-05, "loss": 0.0334, "step": 96210 }, { "epoch": 116.06759203379602, "grad_norm": 3.900670051574707, "learning_rate": 1.9997681083046716e-05, "loss": 0.0344, "step": 96220 }, { "epoch": 116.07966203983102, "grad_norm": 3.4491617679595947, "learning_rate": 1.9997680841791923e-05, "loss": 0.033, "step": 96230 }, { "epoch": 116.09173204586602, "grad_norm": 3.739440679550171, "learning_rate": 1.999768060053713e-05, "loss": 0.0348, "step": 96240 }, { "epoch": 116.10380205190103, "grad_norm": 3.6271567344665527, "learning_rate": 1.9997680359282335e-05, "loss": 0.0362, "step": 96250 }, { "epoch": 116.11587205793603, "grad_norm": 3.673802137374878, "learning_rate": 1.999768011802754e-05, "loss": 0.0362, "step": 96260 }, { "epoch": 116.12794206397103, "grad_norm": 3.820650339126587, "learning_rate": 1.9997679876772748e-05, "loss": 0.0353, "step": 96270 }, { "epoch": 116.14001207000604, "grad_norm": 3.8067407608032227, "learning_rate": 1.9997679635517954e-05, "loss": 0.0352, "step": 96280 }, { "epoch": 116.15208207604104, "grad_norm": 4.3266496658325195, "learning_rate": 1.999767939426316e-05, "loss": 0.036, "step": 96290 }, { "epoch": 116.16415208207604, "grad_norm": 3.7590694427490234, "learning_rate": 1.9997679153008366e-05, "loss": 0.036, "step": 96300 }, { "epoch": 116.17622208811105, "grad_norm": 3.405334949493408, "learning_rate": 1.9997678911753572e-05, "loss": 0.0355, "step": 96310 }, { "epoch": 116.18829209414605, "grad_norm": 3.335386037826538, "learning_rate": 1.9997678670498775e-05, "loss": 0.0381, "step": 96320 }, { "epoch": 
116.20036210018105, "grad_norm": 3.7301156520843506, "learning_rate": 1.999767842924398e-05, "loss": 0.0361, "step": 96330 }, { "epoch": 116.21243210621606, "grad_norm": 4.010186195373535, "learning_rate": 1.9997678187989188e-05, "loss": 0.0366, "step": 96340 }, { "epoch": 116.22450211225106, "grad_norm": 3.7457199096679688, "learning_rate": 1.9997677946734394e-05, "loss": 0.0358, "step": 96350 }, { "epoch": 116.23657211828606, "grad_norm": 4.074102878570557, "learning_rate": 1.99976777054796e-05, "loss": 0.0383, "step": 96360 }, { "epoch": 116.24864212432107, "grad_norm": 3.951744318008423, "learning_rate": 1.9997677464224806e-05, "loss": 0.0375, "step": 96370 }, { "epoch": 116.26071213035607, "grad_norm": 4.343123912811279, "learning_rate": 1.9997677222970013e-05, "loss": 0.0376, "step": 96380 }, { "epoch": 116.27278213639107, "grad_norm": 3.728912591934204, "learning_rate": 1.999767698171522e-05, "loss": 0.0357, "step": 96390 }, { "epoch": 116.28485214242608, "grad_norm": 3.5701956748962402, "learning_rate": 1.9997676740460425e-05, "loss": 0.0356, "step": 96400 }, { "epoch": 116.29692214846108, "grad_norm": 4.185500144958496, "learning_rate": 1.999767649920563e-05, "loss": 0.0365, "step": 96410 }, { "epoch": 116.30899215449608, "grad_norm": 4.23551082611084, "learning_rate": 1.9997676257950837e-05, "loss": 0.0391, "step": 96420 }, { "epoch": 116.32106216053108, "grad_norm": 3.8370935916900635, "learning_rate": 1.9997676016696044e-05, "loss": 0.0371, "step": 96430 }, { "epoch": 116.33313216656609, "grad_norm": 3.8148341178894043, "learning_rate": 1.999767577544125e-05, "loss": 0.04, "step": 96440 }, { "epoch": 116.34520217260109, "grad_norm": 4.453991413116455, "learning_rate": 1.9997675534186456e-05, "loss": 0.0363, "step": 96450 }, { "epoch": 116.3572721786361, "grad_norm": 3.5925886631011963, "learning_rate": 1.9997675292931662e-05, "loss": 0.0375, "step": 96460 }, { "epoch": 116.3693421846711, "grad_norm": 4.047980785369873, "learning_rate": 
1.999767505167687e-05, "loss": 0.0379, "step": 96470 }, { "epoch": 116.3814121907061, "grad_norm": 4.432343006134033, "learning_rate": 1.9997674810422075e-05, "loss": 0.0397, "step": 96480 }, { "epoch": 116.3934821967411, "grad_norm": 3.9061224460601807, "learning_rate": 1.999767456916728e-05, "loss": 0.0375, "step": 96490 }, { "epoch": 116.40555220277611, "grad_norm": 4.151072025299072, "learning_rate": 1.9997674327912487e-05, "loss": 0.039, "step": 96500 }, { "epoch": 116.40555220277611, "eval_loss": 13.077061653137207, "eval_runtime": 8.1293, "eval_samples_per_second": 85.739, "eval_steps_per_second": 10.825, "step": 96500 }, { "epoch": 116.41762220881111, "grad_norm": 4.30681848526001, "learning_rate": 1.9997674086657693e-05, "loss": 0.0401, "step": 96510 }, { "epoch": 116.42969221484611, "grad_norm": 3.8628039360046387, "learning_rate": 1.99976738454029e-05, "loss": 0.0402, "step": 96520 }, { "epoch": 116.44176222088112, "grad_norm": 4.298835754394531, "learning_rate": 1.9997673604148106e-05, "loss": 0.0401, "step": 96530 }, { "epoch": 116.45383222691612, "grad_norm": 3.932445526123047, "learning_rate": 1.9997673362893312e-05, "loss": 0.0401, "step": 96540 }, { "epoch": 116.46590223295112, "grad_norm": 4.122188091278076, "learning_rate": 1.9997673121638518e-05, "loss": 0.0396, "step": 96550 }, { "epoch": 116.47797223898613, "grad_norm": 4.109341621398926, "learning_rate": 1.9997672880383724e-05, "loss": 0.0402, "step": 96560 }, { "epoch": 116.49004224502113, "grad_norm": 4.837318420410156, "learning_rate": 1.999767263912893e-05, "loss": 0.0411, "step": 96570 }, { "epoch": 116.50211225105613, "grad_norm": 4.408584117889404, "learning_rate": 1.9997672397874137e-05, "loss": 0.0411, "step": 96580 }, { "epoch": 116.51418225709112, "grad_norm": 4.47958517074585, "learning_rate": 1.9997672156619343e-05, "loss": 0.0406, "step": 96590 }, { "epoch": 116.52625226312612, "grad_norm": 4.569081783294678, "learning_rate": 1.999767191536455e-05, "loss": 0.0401, "step": 96600 
}, { "epoch": 116.53832226916113, "grad_norm": 4.285853862762451, "learning_rate": 1.9997671674109755e-05, "loss": 0.0409, "step": 96610 }, { "epoch": 116.55039227519613, "grad_norm": 3.8880465030670166, "learning_rate": 1.9997671432854962e-05, "loss": 0.0395, "step": 96620 }, { "epoch": 116.56246228123113, "grad_norm": 4.044765472412109, "learning_rate": 1.9997671191600168e-05, "loss": 0.0398, "step": 96630 }, { "epoch": 116.57453228726614, "grad_norm": 3.8901662826538086, "learning_rate": 1.9997670950345374e-05, "loss": 0.041, "step": 96640 }, { "epoch": 116.58660229330114, "grad_norm": 3.910353660583496, "learning_rate": 1.999767070909058e-05, "loss": 0.0396, "step": 96650 }, { "epoch": 116.59867229933614, "grad_norm": 4.0046892166137695, "learning_rate": 1.9997670467835787e-05, "loss": 0.0423, "step": 96660 }, { "epoch": 116.61074230537115, "grad_norm": 3.808964490890503, "learning_rate": 1.9997670226580993e-05, "loss": 0.0391, "step": 96670 }, { "epoch": 116.62281231140615, "grad_norm": 4.223971843719482, "learning_rate": 1.99976699853262e-05, "loss": 0.042, "step": 96680 }, { "epoch": 116.63488231744115, "grad_norm": 4.443808555603027, "learning_rate": 1.9997669744071405e-05, "loss": 0.0402, "step": 96690 }, { "epoch": 116.64695232347616, "grad_norm": 4.317450046539307, "learning_rate": 1.999766950281661e-05, "loss": 0.0396, "step": 96700 }, { "epoch": 116.65902232951116, "grad_norm": 4.344677925109863, "learning_rate": 1.9997669261561818e-05, "loss": 0.0415, "step": 96710 }, { "epoch": 116.67109233554616, "grad_norm": 4.393735408782959, "learning_rate": 1.9997669020307024e-05, "loss": 0.0398, "step": 96720 }, { "epoch": 116.68316234158117, "grad_norm": 4.461430549621582, "learning_rate": 1.9997668779052227e-05, "loss": 0.0419, "step": 96730 }, { "epoch": 116.69523234761617, "grad_norm": 4.016168594360352, "learning_rate": 1.9997668537797433e-05, "loss": 0.0409, "step": 96740 }, { "epoch": 116.70730235365117, "grad_norm": 4.011114597320557, "learning_rate": 
1.999766829654264e-05, "loss": 0.041, "step": 96750 }, { "epoch": 116.71937235968618, "grad_norm": 4.444190502166748, "learning_rate": 1.9997668055287845e-05, "loss": 0.0418, "step": 96760 }, { "epoch": 116.73144236572118, "grad_norm": 4.341176986694336, "learning_rate": 1.999766781403305e-05, "loss": 0.0423, "step": 96770 }, { "epoch": 116.74351237175618, "grad_norm": 4.078677177429199, "learning_rate": 1.9997667572778258e-05, "loss": 0.0423, "step": 96780 }, { "epoch": 116.75558237779119, "grad_norm": 4.222442150115967, "learning_rate": 1.9997667331523464e-05, "loss": 0.0424, "step": 96790 }, { "epoch": 116.76765238382619, "grad_norm": 4.165272235870361, "learning_rate": 1.999766709026867e-05, "loss": 0.0425, "step": 96800 }, { "epoch": 116.77972238986119, "grad_norm": 4.299583911895752, "learning_rate": 1.9997666849013876e-05, "loss": 0.0431, "step": 96810 }, { "epoch": 116.7917923958962, "grad_norm": 4.340622901916504, "learning_rate": 1.9997666607759083e-05, "loss": 0.0418, "step": 96820 }, { "epoch": 116.8038624019312, "grad_norm": 4.1658782958984375, "learning_rate": 1.999766636650429e-05, "loss": 0.0426, "step": 96830 }, { "epoch": 116.8159324079662, "grad_norm": 4.3783650398254395, "learning_rate": 1.9997666125249495e-05, "loss": 0.0409, "step": 96840 }, { "epoch": 116.8280024140012, "grad_norm": 4.040126323699951, "learning_rate": 1.99976658839947e-05, "loss": 0.0408, "step": 96850 }, { "epoch": 116.84007242003621, "grad_norm": 4.465468883514404, "learning_rate": 1.9997665642739907e-05, "loss": 0.0416, "step": 96860 }, { "epoch": 116.85214242607121, "grad_norm": 4.444402694702148, "learning_rate": 1.9997665401485114e-05, "loss": 0.0415, "step": 96870 }, { "epoch": 116.86421243210621, "grad_norm": 4.6751017570495605, "learning_rate": 1.999766516023032e-05, "loss": 0.0429, "step": 96880 }, { "epoch": 116.87628243814122, "grad_norm": 4.10764741897583, "learning_rate": 1.9997664918975526e-05, "loss": 0.0419, "step": 96890 }, { "epoch": 116.88835244417622, 
"grad_norm": 4.289586067199707, "learning_rate": 1.9997664677720732e-05, "loss": 0.0427, "step": 96900 }, { "epoch": 116.90042245021122, "grad_norm": 3.761408567428589, "learning_rate": 1.999766443646594e-05, "loss": 0.0433, "step": 96910 }, { "epoch": 116.91249245624623, "grad_norm": 4.704137802124023, "learning_rate": 1.9997664195211145e-05, "loss": 0.0415, "step": 96920 }, { "epoch": 116.92456246228123, "grad_norm": 4.31681489944458, "learning_rate": 1.999766395395635e-05, "loss": 0.0437, "step": 96930 }, { "epoch": 116.93663246831623, "grad_norm": 4.3364691734313965, "learning_rate": 1.9997663712701557e-05, "loss": 0.0453, "step": 96940 }, { "epoch": 116.94870247435124, "grad_norm": 4.0408477783203125, "learning_rate": 1.9997663471446763e-05, "loss": 0.044, "step": 96950 }, { "epoch": 116.96077248038624, "grad_norm": 4.941867828369141, "learning_rate": 1.999766323019197e-05, "loss": 0.0428, "step": 96960 }, { "epoch": 116.97284248642124, "grad_norm": 4.121899127960205, "learning_rate": 1.9997662988937176e-05, "loss": 0.0433, "step": 96970 }, { "epoch": 116.98491249245625, "grad_norm": 4.570621490478516, "learning_rate": 1.999766274768238e-05, "loss": 0.0443, "step": 96980 }, { "epoch": 116.99698249849125, "grad_norm": 4.621585369110107, "learning_rate": 1.9997662506427585e-05, "loss": 0.0443, "step": 96990 }, { "epoch": 117.0084490042245, "grad_norm": 3.5019748210906982, "learning_rate": 1.999766226517279e-05, "loss": 0.0326, "step": 97000 }, { "epoch": 117.0084490042245, "eval_loss": 13.063477516174316, "eval_runtime": 8.1301, "eval_samples_per_second": 85.731, "eval_steps_per_second": 10.824, "step": 97000 }, { "epoch": 117.0205190102595, "grad_norm": 3.4713001251220703, "learning_rate": 1.9997662023917997e-05, "loss": 0.0294, "step": 97010 }, { "epoch": 117.03258901629451, "grad_norm": 3.675065040588379, "learning_rate": 1.9997661782663204e-05, "loss": 0.0322, "step": 97020 }, { "epoch": 117.04465902232951, "grad_norm": 3.6480016708374023, "learning_rate": 
1.999766154140841e-05, "loss": 0.0329, "step": 97030 }, { "epoch": 117.05672902836451, "grad_norm": 3.1363422870635986, "learning_rate": 1.9997661300153616e-05, "loss": 0.0304, "step": 97040 }, { "epoch": 117.06879903439952, "grad_norm": 3.894763946533203, "learning_rate": 1.9997661058898822e-05, "loss": 0.035, "step": 97050 }, { "epoch": 117.08086904043452, "grad_norm": 3.4824626445770264, "learning_rate": 1.999766081764403e-05, "loss": 0.0349, "step": 97060 }, { "epoch": 117.09293904646952, "grad_norm": 3.924257516860962, "learning_rate": 1.9997660576389238e-05, "loss": 0.0337, "step": 97070 }, { "epoch": 117.10500905250453, "grad_norm": 3.833343505859375, "learning_rate": 1.9997660335134444e-05, "loss": 0.0341, "step": 97080 }, { "epoch": 117.11707905853953, "grad_norm": 3.716158390045166, "learning_rate": 1.999766009387965e-05, "loss": 0.0322, "step": 97090 }, { "epoch": 117.12914906457453, "grad_norm": 3.926288604736328, "learning_rate": 1.9997659852624857e-05, "loss": 0.0362, "step": 97100 }, { "epoch": 117.14121907060954, "grad_norm": 4.100839138031006, "learning_rate": 1.9997659611370063e-05, "loss": 0.0351, "step": 97110 }, { "epoch": 117.15328907664454, "grad_norm": 4.235075950622559, "learning_rate": 1.999765937011527e-05, "loss": 0.035, "step": 97120 }, { "epoch": 117.16535908267954, "grad_norm": 3.582463264465332, "learning_rate": 1.9997659128860475e-05, "loss": 0.0346, "step": 97130 }, { "epoch": 117.17742908871455, "grad_norm": 4.0316548347473145, "learning_rate": 1.999765888760568e-05, "loss": 0.0349, "step": 97140 }, { "epoch": 117.18949909474955, "grad_norm": 4.016798973083496, "learning_rate": 1.9997658646350884e-05, "loss": 0.0368, "step": 97150 }, { "epoch": 117.20156910078455, "grad_norm": 3.9534008502960205, "learning_rate": 1.999765840509609e-05, "loss": 0.0365, "step": 97160 }, { "epoch": 117.21363910681956, "grad_norm": 4.151679039001465, "learning_rate": 1.9997658163841297e-05, "loss": 0.0375, "step": 97170 }, { "epoch": 
117.22570911285456, "grad_norm": 3.972480297088623, "learning_rate": 1.9997657922586503e-05, "loss": 0.0381, "step": 97180 }, { "epoch": 117.23777911888956, "grad_norm": 4.14210844039917, "learning_rate": 1.999765768133171e-05, "loss": 0.0375, "step": 97190 }, { "epoch": 117.24984912492457, "grad_norm": 3.9853689670562744, "learning_rate": 1.9997657440076915e-05, "loss": 0.035, "step": 97200 }, { "epoch": 117.26191913095957, "grad_norm": 3.7994534969329834, "learning_rate": 1.999765719882212e-05, "loss": 0.0355, "step": 97210 }, { "epoch": 117.27398913699457, "grad_norm": 4.095526218414307, "learning_rate": 1.9997656957567328e-05, "loss": 0.0386, "step": 97220 }, { "epoch": 117.28605914302958, "grad_norm": 4.154759883880615, "learning_rate": 1.9997656716312534e-05, "loss": 0.0359, "step": 97230 }, { "epoch": 117.29812914906458, "grad_norm": 3.734804630279541, "learning_rate": 1.999765647505774e-05, "loss": 0.0374, "step": 97240 }, { "epoch": 117.31019915509958, "grad_norm": 3.9420359134674072, "learning_rate": 1.9997656233802947e-05, "loss": 0.0383, "step": 97250 }, { "epoch": 117.32226916113459, "grad_norm": 3.7897439002990723, "learning_rate": 1.9997655992548153e-05, "loss": 0.0366, "step": 97260 }, { "epoch": 117.33433916716959, "grad_norm": 3.985924243927002, "learning_rate": 1.999765575129336e-05, "loss": 0.0376, "step": 97270 }, { "epoch": 117.34640917320459, "grad_norm": 4.330294132232666, "learning_rate": 1.9997655510038565e-05, "loss": 0.0382, "step": 97280 }, { "epoch": 117.3584791792396, "grad_norm": 3.576223373413086, "learning_rate": 1.999765526878377e-05, "loss": 0.0379, "step": 97290 }, { "epoch": 117.3705491852746, "grad_norm": 4.332512378692627, "learning_rate": 1.9997655027528978e-05, "loss": 0.0381, "step": 97300 }, { "epoch": 117.3826191913096, "grad_norm": 3.541316270828247, "learning_rate": 1.9997654786274184e-05, "loss": 0.0379, "step": 97310 }, { "epoch": 117.3946891973446, "grad_norm": 4.489776611328125, "learning_rate": 
1.999765454501939e-05, "loss": 0.0383, "step": 97320 }, { "epoch": 117.40675920337961, "grad_norm": 3.9789319038391113, "learning_rate": 1.9997654303764596e-05, "loss": 0.038, "step": 97330 }, { "epoch": 117.41882920941461, "grad_norm": 3.870356321334839, "learning_rate": 1.9997654062509802e-05, "loss": 0.0381, "step": 97340 }, { "epoch": 117.43089921544961, "grad_norm": 4.316857814788818, "learning_rate": 1.999765382125501e-05, "loss": 0.0389, "step": 97350 }, { "epoch": 117.44296922148462, "grad_norm": 4.408143043518066, "learning_rate": 1.9997653580000215e-05, "loss": 0.0381, "step": 97360 }, { "epoch": 117.45503922751962, "grad_norm": 3.8796634674072266, "learning_rate": 1.999765333874542e-05, "loss": 0.0377, "step": 97370 }, { "epoch": 117.46710923355462, "grad_norm": 3.993621587753296, "learning_rate": 1.9997653097490627e-05, "loss": 0.0409, "step": 97380 }, { "epoch": 117.47917923958963, "grad_norm": 4.168309211730957, "learning_rate": 1.9997652856235834e-05, "loss": 0.0381, "step": 97390 }, { "epoch": 117.49124924562463, "grad_norm": 3.7350149154663086, "learning_rate": 1.9997652614981036e-05, "loss": 0.0387, "step": 97400 }, { "epoch": 117.50331925165963, "grad_norm": 4.1568522453308105, "learning_rate": 1.9997652373726243e-05, "loss": 0.0389, "step": 97410 }, { "epoch": 117.51538925769462, "grad_norm": 3.8497164249420166, "learning_rate": 1.999765213247145e-05, "loss": 0.0394, "step": 97420 }, { "epoch": 117.52745926372963, "grad_norm": 4.0251569747924805, "learning_rate": 1.9997651891216655e-05, "loss": 0.0399, "step": 97430 }, { "epoch": 117.53952926976463, "grad_norm": 3.8289616107940674, "learning_rate": 1.999765164996186e-05, "loss": 0.0402, "step": 97440 }, { "epoch": 117.55159927579963, "grad_norm": 4.3039140701293945, "learning_rate": 1.9997651408707067e-05, "loss": 0.0388, "step": 97450 }, { "epoch": 117.56366928183463, "grad_norm": 4.318199157714844, "learning_rate": 1.9997651167452274e-05, "loss": 0.0389, "step": 97460 }, { "epoch": 
117.57573928786964, "grad_norm": 4.119261741638184, "learning_rate": 1.999765092619748e-05, "loss": 0.0398, "step": 97470 }, { "epoch": 117.58780929390464, "grad_norm": 4.044409275054932, "learning_rate": 1.9997650684942686e-05, "loss": 0.0389, "step": 97480 }, { "epoch": 117.59987929993964, "grad_norm": 4.080377101898193, "learning_rate": 1.9997650443687892e-05, "loss": 0.0403, "step": 97490 }, { "epoch": 117.61194930597465, "grad_norm": 4.558512210845947, "learning_rate": 1.99976502024331e-05, "loss": 0.0412, "step": 97500 }, { "epoch": 117.61194930597465, "eval_loss": 13.101455688476562, "eval_runtime": 8.182, "eval_samples_per_second": 85.187, "eval_steps_per_second": 10.755, "step": 97500 }, { "epoch": 117.62401931200965, "grad_norm": 4.2059712409973145, "learning_rate": 1.9997649961178305e-05, "loss": 0.04, "step": 97510 }, { "epoch": 117.63608931804465, "grad_norm": 4.46605920791626, "learning_rate": 1.999764971992351e-05, "loss": 0.042, "step": 97520 }, { "epoch": 117.64815932407966, "grad_norm": 4.1473541259765625, "learning_rate": 1.9997649478668717e-05, "loss": 0.0383, "step": 97530 }, { "epoch": 117.66022933011466, "grad_norm": 3.973140239715576, "learning_rate": 1.9997649237413923e-05, "loss": 0.0397, "step": 97540 }, { "epoch": 117.67229933614966, "grad_norm": 4.678797245025635, "learning_rate": 1.999764899615913e-05, "loss": 0.0426, "step": 97550 }, { "epoch": 117.68436934218467, "grad_norm": 4.22268533706665, "learning_rate": 1.9997648754904336e-05, "loss": 0.0415, "step": 97560 }, { "epoch": 117.69643934821967, "grad_norm": 4.151378154754639, "learning_rate": 1.9997648513649542e-05, "loss": 0.0412, "step": 97570 }, { "epoch": 117.70850935425467, "grad_norm": 4.212682723999023, "learning_rate": 1.9997648272394748e-05, "loss": 0.0424, "step": 97580 }, { "epoch": 117.72057936028968, "grad_norm": 4.293420791625977, "learning_rate": 1.9997648031139954e-05, "loss": 0.0418, "step": 97590 }, { "epoch": 117.73264936632468, "grad_norm": 4.333093166351318, 
"learning_rate": 1.999764778988516e-05, "loss": 0.0402, "step": 97600 }, { "epoch": 117.74471937235968, "grad_norm": 4.188202857971191, "learning_rate": 1.9997647548630367e-05, "loss": 0.0415, "step": 97610 }, { "epoch": 117.75678937839469, "grad_norm": 4.003525733947754, "learning_rate": 1.9997647307375573e-05, "loss": 0.0411, "step": 97620 }, { "epoch": 117.76885938442969, "grad_norm": 4.764726161956787, "learning_rate": 1.999764706612078e-05, "loss": 0.0418, "step": 97630 }, { "epoch": 117.78092939046469, "grad_norm": 4.097813129425049, "learning_rate": 1.9997646824865986e-05, "loss": 0.0428, "step": 97640 }, { "epoch": 117.7929993964997, "grad_norm": 4.408695697784424, "learning_rate": 1.9997646583611192e-05, "loss": 0.0425, "step": 97650 }, { "epoch": 117.8050694025347, "grad_norm": 4.065982818603516, "learning_rate": 1.9997646342356398e-05, "loss": 0.0411, "step": 97660 }, { "epoch": 117.8171394085697, "grad_norm": 4.116545677185059, "learning_rate": 1.9997646101101604e-05, "loss": 0.0413, "step": 97670 }, { "epoch": 117.8292094146047, "grad_norm": 4.319790840148926, "learning_rate": 1.999764585984681e-05, "loss": 0.0423, "step": 97680 }, { "epoch": 117.84127942063971, "grad_norm": 4.039966106414795, "learning_rate": 1.9997645618592017e-05, "loss": 0.0422, "step": 97690 }, { "epoch": 117.85334942667471, "grad_norm": 3.816319465637207, "learning_rate": 1.9997645377337223e-05, "loss": 0.0424, "step": 97700 }, { "epoch": 117.86541943270971, "grad_norm": 4.029421806335449, "learning_rate": 1.999764513608243e-05, "loss": 0.0424, "step": 97710 }, { "epoch": 117.87748943874472, "grad_norm": 3.645768880844116, "learning_rate": 1.9997644894827635e-05, "loss": 0.0406, "step": 97720 }, { "epoch": 117.88955944477972, "grad_norm": 4.400444984436035, "learning_rate": 1.999764465357284e-05, "loss": 0.0418, "step": 97730 }, { "epoch": 117.90162945081472, "grad_norm": 4.186114311218262, "learning_rate": 1.9997644412318048e-05, "loss": 0.0419, "step": 97740 }, { "epoch": 
117.91369945684973, "grad_norm": 4.579961776733398, "learning_rate": 1.9997644171063254e-05, "loss": 0.0436, "step": 97750 }, { "epoch": 117.92576946288473, "grad_norm": 4.262250900268555, "learning_rate": 1.999764392980846e-05, "loss": 0.0455, "step": 97760 }, { "epoch": 117.93783946891973, "grad_norm": 4.188604354858398, "learning_rate": 1.9997643688553666e-05, "loss": 0.0421, "step": 97770 }, { "epoch": 117.94990947495474, "grad_norm": 3.794123411178589, "learning_rate": 1.9997643447298873e-05, "loss": 0.0426, "step": 97780 }, { "epoch": 117.96197948098974, "grad_norm": 4.136035919189453, "learning_rate": 1.999764320604408e-05, "loss": 0.0428, "step": 97790 }, { "epoch": 117.97404948702474, "grad_norm": 4.410774230957031, "learning_rate": 1.9997642964789285e-05, "loss": 0.0433, "step": 97800 }, { "epoch": 117.98611949305975, "grad_norm": 4.763158321380615, "learning_rate": 1.9997642723534488e-05, "loss": 0.0451, "step": 97810 }, { "epoch": 117.99818949909475, "grad_norm": 4.383734703063965, "learning_rate": 1.9997642482279694e-05, "loss": 0.0428, "step": 97820 }, { "epoch": 118.009656004828, "grad_norm": 3.1138129234313965, "learning_rate": 1.99976422410249e-05, "loss": 0.032, "step": 97830 }, { "epoch": 118.021726010863, "grad_norm": 3.140453577041626, "learning_rate": 1.9997641999770106e-05, "loss": 0.0299, "step": 97840 }, { "epoch": 118.03379601689801, "grad_norm": 3.602283000946045, "learning_rate": 1.9997641758515313e-05, "loss": 0.0309, "step": 97850 }, { "epoch": 118.04586602293301, "grad_norm": 3.613581418991089, "learning_rate": 1.999764151726052e-05, "loss": 0.0304, "step": 97860 }, { "epoch": 118.05793602896802, "grad_norm": 3.302370309829712, "learning_rate": 1.9997641276005725e-05, "loss": 0.0329, "step": 97870 }, { "epoch": 118.07000603500302, "grad_norm": 3.8379571437835693, "learning_rate": 1.999764103475093e-05, "loss": 0.0351, "step": 97880 }, { "epoch": 118.08207604103802, "grad_norm": 3.3719050884246826, "learning_rate": 
1.9997640793496138e-05, "loss": 0.0324, "step": 97890 }, { "epoch": 118.09414604707302, "grad_norm": 3.8258752822875977, "learning_rate": 1.9997640552241344e-05, "loss": 0.0339, "step": 97900 }, { "epoch": 118.10621605310803, "grad_norm": 3.9284749031066895, "learning_rate": 1.999764031098655e-05, "loss": 0.0352, "step": 97910 }, { "epoch": 118.11828605914303, "grad_norm": 3.7265217304229736, "learning_rate": 1.9997640069731756e-05, "loss": 0.0329, "step": 97920 }, { "epoch": 118.13035606517803, "grad_norm": 3.6427054405212402, "learning_rate": 1.9997639828476962e-05, "loss": 0.0333, "step": 97930 }, { "epoch": 118.14242607121304, "grad_norm": 3.7889039516448975, "learning_rate": 1.999763958722217e-05, "loss": 0.0353, "step": 97940 }, { "epoch": 118.15449607724804, "grad_norm": 4.322440147399902, "learning_rate": 1.9997639345967375e-05, "loss": 0.0373, "step": 97950 }, { "epoch": 118.16656608328304, "grad_norm": 3.5651497840881348, "learning_rate": 1.999763910471258e-05, "loss": 0.0356, "step": 97960 }, { "epoch": 118.17863608931805, "grad_norm": 3.7309629917144775, "learning_rate": 1.9997638863457787e-05, "loss": 0.0338, "step": 97970 }, { "epoch": 118.19070609535305, "grad_norm": 3.627344846725464, "learning_rate": 1.9997638622202993e-05, "loss": 0.0362, "step": 97980 }, { "epoch": 118.20277610138805, "grad_norm": 3.9725241661071777, "learning_rate": 1.99976383809482e-05, "loss": 0.0335, "step": 97990 }, { "epoch": 118.21484610742306, "grad_norm": 3.675084352493286, "learning_rate": 1.9997638139693406e-05, "loss": 0.0337, "step": 98000 }, { "epoch": 118.21484610742306, "eval_loss": 13.09410572052002, "eval_runtime": 8.1411, "eval_samples_per_second": 85.615, "eval_steps_per_second": 10.809, "step": 98000 }, { "epoch": 118.22691611345806, "grad_norm": 3.486276388168335, "learning_rate": 1.9997637898438612e-05, "loss": 0.0338, "step": 98010 }, { "epoch": 118.23898611949306, "grad_norm": 3.5577096939086914, "learning_rate": 1.999763765718382e-05, "loss": 0.0343, 
"step": 98020 }, { "epoch": 118.25105612552807, "grad_norm": 4.042799949645996, "learning_rate": 1.9997637415929025e-05, "loss": 0.0363, "step": 98030 }, { "epoch": 118.26312613156307, "grad_norm": 3.7234747409820557, "learning_rate": 1.999763717467423e-05, "loss": 0.0358, "step": 98040 }, { "epoch": 118.27519613759807, "grad_norm": 4.222475051879883, "learning_rate": 1.9997636933419437e-05, "loss": 0.0356, "step": 98050 }, { "epoch": 118.28726614363308, "grad_norm": 4.2627034187316895, "learning_rate": 1.999763669216464e-05, "loss": 0.0373, "step": 98060 }, { "epoch": 118.29933614966808, "grad_norm": 4.1202497482299805, "learning_rate": 1.9997636450909846e-05, "loss": 0.0366, "step": 98070 }, { "epoch": 118.31140615570308, "grad_norm": 3.7329845428466797, "learning_rate": 1.9997636209655052e-05, "loss": 0.0364, "step": 98080 }, { "epoch": 118.32347616173809, "grad_norm": 4.5491814613342285, "learning_rate": 1.999763596840026e-05, "loss": 0.0391, "step": 98090 }, { "epoch": 118.33554616777309, "grad_norm": 3.7982304096221924, "learning_rate": 1.9997635727145465e-05, "loss": 0.0364, "step": 98100 }, { "epoch": 118.34761617380809, "grad_norm": 3.87227725982666, "learning_rate": 1.999763548589067e-05, "loss": 0.0376, "step": 98110 }, { "epoch": 118.3596861798431, "grad_norm": 4.126917839050293, "learning_rate": 1.9997635244635877e-05, "loss": 0.0365, "step": 98120 }, { "epoch": 118.3717561858781, "grad_norm": 4.056488990783691, "learning_rate": 1.9997635003381083e-05, "loss": 0.038, "step": 98130 }, { "epoch": 118.3838261919131, "grad_norm": 4.0648884773254395, "learning_rate": 1.999763476212629e-05, "loss": 0.0394, "step": 98140 }, { "epoch": 118.3958961979481, "grad_norm": Infinity, "learning_rate": 1.99976345208715e-05, "loss": 0.0385, "step": 98150 }, { "epoch": 118.40796620398311, "grad_norm": 4.196243762969971, "learning_rate": 1.9997634279616705e-05, "loss": 0.0383, "step": 98160 }, { "epoch": 118.42003621001811, "grad_norm": 3.9752702713012695, 
"learning_rate": 1.999763403836191e-05, "loss": 0.0392, "step": 98170 }, { "epoch": 118.43210621605311, "grad_norm": 3.9057090282440186, "learning_rate": 1.9997633797107118e-05, "loss": 0.0393, "step": 98180 }, { "epoch": 118.44417622208812, "grad_norm": 4.289742946624756, "learning_rate": 1.9997633555852324e-05, "loss": 0.0389, "step": 98190 }, { "epoch": 118.45624622812312, "grad_norm": 4.198141098022461, "learning_rate": 1.999763331459753e-05, "loss": 0.0394, "step": 98200 }, { "epoch": 118.46831623415812, "grad_norm": 4.141885757446289, "learning_rate": 1.9997633073342736e-05, "loss": 0.0388, "step": 98210 }, { "epoch": 118.48038624019313, "grad_norm": 3.976717710494995, "learning_rate": 1.999763283208794e-05, "loss": 0.0396, "step": 98220 }, { "epoch": 118.49245624622813, "grad_norm": 4.699911594390869, "learning_rate": 1.9997632590833145e-05, "loss": 0.0401, "step": 98230 }, { "epoch": 118.50452625226312, "grad_norm": 4.464141845703125, "learning_rate": 1.999763234957835e-05, "loss": 0.0398, "step": 98240 }, { "epoch": 118.51659625829812, "grad_norm": 4.346447467803955, "learning_rate": 1.9997632108323558e-05, "loss": 0.0385, "step": 98250 }, { "epoch": 118.52866626433313, "grad_norm": 4.28519868850708, "learning_rate": 1.9997631867068764e-05, "loss": 0.0403, "step": 98260 }, { "epoch": 118.54073627036813, "grad_norm": 4.331839561462402, "learning_rate": 1.999763162581397e-05, "loss": 0.0424, "step": 98270 }, { "epoch": 118.55280627640313, "grad_norm": 3.483093738555908, "learning_rate": 1.9997631384559177e-05, "loss": 0.0393, "step": 98280 }, { "epoch": 118.56487628243814, "grad_norm": 4.205714702606201, "learning_rate": 1.9997631143304383e-05, "loss": 0.0407, "step": 98290 }, { "epoch": 118.57694628847314, "grad_norm": 3.899041175842285, "learning_rate": 1.999763090204959e-05, "loss": 0.0424, "step": 98300 }, { "epoch": 118.58901629450814, "grad_norm": 5.0772552490234375, "learning_rate": 1.9997630660794795e-05, "loss": 0.0439, "step": 98310 }, { "epoch": 
118.60108630054314, "grad_norm": 4.505832672119141, "learning_rate": 1.999763041954e-05, "loss": 0.0391, "step": 98320 }, { "epoch": 118.61315630657815, "grad_norm": 4.487829208374023, "learning_rate": 1.9997630178285208e-05, "loss": 0.0407, "step": 98330 }, { "epoch": 118.62522631261315, "grad_norm": 4.618379592895508, "learning_rate": 1.9997629937030414e-05, "loss": 0.0403, "step": 98340 }, { "epoch": 118.63729631864815, "grad_norm": 4.629540920257568, "learning_rate": 1.999762969577562e-05, "loss": 0.0412, "step": 98350 }, { "epoch": 118.64936632468316, "grad_norm": 4.190533638000488, "learning_rate": 1.9997629454520826e-05, "loss": 0.042, "step": 98360 }, { "epoch": 118.66143633071816, "grad_norm": 3.784367561340332, "learning_rate": 1.9997629213266032e-05, "loss": 0.0415, "step": 98370 }, { "epoch": 118.67350633675316, "grad_norm": 4.269538402557373, "learning_rate": 1.999762897201124e-05, "loss": 0.0412, "step": 98380 }, { "epoch": 118.68557634278817, "grad_norm": 4.283115386962891, "learning_rate": 1.9997628730756445e-05, "loss": 0.0432, "step": 98390 }, { "epoch": 118.69764634882317, "grad_norm": 4.338818550109863, "learning_rate": 1.999762848950165e-05, "loss": 0.0417, "step": 98400 }, { "epoch": 118.70971635485817, "grad_norm": 4.534633636474609, "learning_rate": 1.9997628248246857e-05, "loss": 0.0414, "step": 98410 }, { "epoch": 118.72178636089318, "grad_norm": 3.876079797744751, "learning_rate": 1.9997628006992064e-05, "loss": 0.0397, "step": 98420 }, { "epoch": 118.73385636692818, "grad_norm": 4.380492687225342, "learning_rate": 1.999762776573727e-05, "loss": 0.0413, "step": 98430 }, { "epoch": 118.74592637296318, "grad_norm": 4.44437837600708, "learning_rate": 1.9997627524482476e-05, "loss": 0.0415, "step": 98440 }, { "epoch": 118.75799637899819, "grad_norm": 4.406836986541748, "learning_rate": 1.9997627283227682e-05, "loss": 0.0434, "step": 98450 }, { "epoch": 118.77006638503319, "grad_norm": 4.623284816741943, "learning_rate": 1.999762704197289e-05, 
"loss": 0.0414, "step": 98460 }, { "epoch": 118.78213639106819, "grad_norm": 3.870537519454956, "learning_rate": 1.999762680071809e-05, "loss": 0.0417, "step": 98470 }, { "epoch": 118.7942063971032, "grad_norm": 3.7791011333465576, "learning_rate": 1.9997626559463297e-05, "loss": 0.04, "step": 98480 }, { "epoch": 118.8062764031382, "grad_norm": 3.72683048248291, "learning_rate": 1.9997626318208504e-05, "loss": 0.04, "step": 98490 }, { "epoch": 118.8183464091732, "grad_norm": 4.362721920013428, "learning_rate": 1.999762607695371e-05, "loss": 0.0427, "step": 98500 }, { "epoch": 118.8183464091732, "eval_loss": 13.116865158081055, "eval_runtime": 8.1425, "eval_samples_per_second": 85.6, "eval_steps_per_second": 10.807, "step": 98500 }, { "epoch": 118.8304164152082, "grad_norm": 4.368515968322754, "learning_rate": 1.9997625835698916e-05, "loss": 0.0427, "step": 98510 }, { "epoch": 118.84248642124321, "grad_norm": 3.9229207038879395, "learning_rate": 1.9997625594444122e-05, "loss": 0.0422, "step": 98520 }, { "epoch": 118.85455642727821, "grad_norm": 4.4477081298828125, "learning_rate": 1.999762535318933e-05, "loss": 0.0426, "step": 98530 }, { "epoch": 118.86662643331321, "grad_norm": 4.246710300445557, "learning_rate": 1.9997625111934535e-05, "loss": 0.0431, "step": 98540 }, { "epoch": 118.87869643934822, "grad_norm": 4.020447731018066, "learning_rate": 1.999762487067974e-05, "loss": 0.0433, "step": 98550 }, { "epoch": 118.89076644538322, "grad_norm": 4.409392833709717, "learning_rate": 1.9997624629424947e-05, "loss": 0.0437, "step": 98560 }, { "epoch": 118.90283645141822, "grad_norm": 4.582932949066162, "learning_rate": 1.9997624388170153e-05, "loss": 0.0441, "step": 98570 }, { "epoch": 118.91490645745323, "grad_norm": 4.271304130554199, "learning_rate": 1.999762414691536e-05, "loss": 0.0439, "step": 98580 }, { "epoch": 118.92697646348823, "grad_norm": 4.187241077423096, "learning_rate": 1.9997623905660566e-05, "loss": 0.0438, "step": 98590 }, { "epoch": 
118.93904646952323, "grad_norm": 3.9802095890045166, "learning_rate": 1.9997623664405772e-05, "loss": 0.0421, "step": 98600 }, { "epoch": 118.95111647555824, "grad_norm": 4.114495277404785, "learning_rate": 1.9997623423150978e-05, "loss": 0.0424, "step": 98610 }, { "epoch": 118.96318648159324, "grad_norm": 4.577273368835449, "learning_rate": 1.9997623181896184e-05, "loss": 0.0429, "step": 98620 }, { "epoch": 118.97525648762824, "grad_norm": 4.379693031311035, "learning_rate": 1.999762294064139e-05, "loss": 0.0428, "step": 98630 }, { "epoch": 118.98732649366325, "grad_norm": 4.607944965362549, "learning_rate": 1.9997622699386597e-05, "loss": 0.0421, "step": 98640 }, { "epoch": 118.99939649969825, "grad_norm": 4.235135078430176, "learning_rate": 1.9997622458131803e-05, "loss": 0.0432, "step": 98650 }, { "epoch": 119.0108630054315, "grad_norm": 3.7051241397857666, "learning_rate": 1.999762221687701e-05, "loss": 0.0282, "step": 98660 }, { "epoch": 119.0229330114665, "grad_norm": 3.2653746604919434, "learning_rate": 1.9997621975622216e-05, "loss": 0.0302, "step": 98670 }, { "epoch": 119.03500301750151, "grad_norm": 3.1547391414642334, "learning_rate": 1.9997621734367422e-05, "loss": 0.0303, "step": 98680 }, { "epoch": 119.04707302353651, "grad_norm": 3.571423292160034, "learning_rate": 1.9997621493112628e-05, "loss": 0.0334, "step": 98690 }, { "epoch": 119.05914302957152, "grad_norm": 3.547187089920044, "learning_rate": 1.9997621251857834e-05, "loss": 0.0335, "step": 98700 }, { "epoch": 119.07121303560652, "grad_norm": 3.3573050498962402, "learning_rate": 1.999762101060304e-05, "loss": 0.0331, "step": 98710 }, { "epoch": 119.08328304164152, "grad_norm": 3.5641531944274902, "learning_rate": 1.9997620769348247e-05, "loss": 0.0333, "step": 98720 }, { "epoch": 119.09535304767653, "grad_norm": 3.2746505737304688, "learning_rate": 1.9997620528093453e-05, "loss": 0.0347, "step": 98730 }, { "epoch": 119.10742305371153, "grad_norm": 3.962554931640625, "learning_rate": 
1.999762028683866e-05, "loss": 0.0323, "step": 98740 }, { "epoch": 119.11949305974653, "grad_norm": 3.8527796268463135, "learning_rate": 1.9997620045583865e-05, "loss": 0.0359, "step": 98750 }, { "epoch": 119.13156306578153, "grad_norm": 3.5064525604248047, "learning_rate": 1.999761980432907e-05, "loss": 0.0356, "step": 98760 }, { "epoch": 119.14363307181654, "grad_norm": 4.092798233032227, "learning_rate": 1.9997619563074278e-05, "loss": 0.034, "step": 98770 }, { "epoch": 119.15570307785154, "grad_norm": 3.795790433883667, "learning_rate": 1.9997619321819484e-05, "loss": 0.034, "step": 98780 }, { "epoch": 119.16777308388654, "grad_norm": 3.5032877922058105, "learning_rate": 1.999761908056469e-05, "loss": 0.0337, "step": 98790 }, { "epoch": 119.17984308992155, "grad_norm": 3.574268102645874, "learning_rate": 1.9997618839309896e-05, "loss": 0.035, "step": 98800 }, { "epoch": 119.19191309595655, "grad_norm": 3.846958875656128, "learning_rate": 1.9997618598055103e-05, "loss": 0.0355, "step": 98810 }, { "epoch": 119.20398310199155, "grad_norm": 3.418672561645508, "learning_rate": 1.999761835680031e-05, "loss": 0.0356, "step": 98820 }, { "epoch": 119.21605310802656, "grad_norm": 3.6601696014404297, "learning_rate": 1.9997618115545515e-05, "loss": 0.0349, "step": 98830 }, { "epoch": 119.22812311406156, "grad_norm": 3.824237108230591, "learning_rate": 1.999761787429072e-05, "loss": 0.0359, "step": 98840 }, { "epoch": 119.24019312009656, "grad_norm": 3.8769235610961914, "learning_rate": 1.9997617633035927e-05, "loss": 0.038, "step": 98850 }, { "epoch": 119.25226312613157, "grad_norm": 4.039586067199707, "learning_rate": 1.9997617391781134e-05, "loss": 0.0367, "step": 98860 }, { "epoch": 119.26433313216657, "grad_norm": 3.952298641204834, "learning_rate": 1.999761715052634e-05, "loss": 0.0364, "step": 98870 }, { "epoch": 119.27640313820157, "grad_norm": 3.893468141555786, "learning_rate": 1.9997616909271546e-05, "loss": 0.0375, "step": 98880 }, { "epoch": 
119.28847314423658, "grad_norm": 4.068678855895996, "learning_rate": 1.999761666801675e-05, "loss": 0.0372, "step": 98890 }, { "epoch": 119.30054315027158, "grad_norm": 4.034485340118408, "learning_rate": 1.9997616426761955e-05, "loss": 0.0363, "step": 98900 }, { "epoch": 119.31261315630658, "grad_norm": 3.617765426635742, "learning_rate": 1.999761618550716e-05, "loss": 0.0383, "step": 98910 }, { "epoch": 119.32468316234159, "grad_norm": 3.926459312438965, "learning_rate": 1.9997615944252368e-05, "loss": 0.0362, "step": 98920 }, { "epoch": 119.33675316837659, "grad_norm": 3.5505285263061523, "learning_rate": 1.9997615702997574e-05, "loss": 0.037, "step": 98930 }, { "epoch": 119.34882317441159, "grad_norm": 4.018188953399658, "learning_rate": 1.999761546174278e-05, "loss": 0.0375, "step": 98940 }, { "epoch": 119.3608931804466, "grad_norm": 4.030759811401367, "learning_rate": 1.9997615220487986e-05, "loss": 0.0374, "step": 98950 }, { "epoch": 119.3729631864816, "grad_norm": 3.8509323596954346, "learning_rate": 1.9997614979233192e-05, "loss": 0.0368, "step": 98960 }, { "epoch": 119.3850331925166, "grad_norm": 3.928267002105713, "learning_rate": 1.99976147379784e-05, "loss": 0.0393, "step": 98970 }, { "epoch": 119.3971031985516, "grad_norm": 4.316489219665527, "learning_rate": 1.9997614496723605e-05, "loss": 0.0372, "step": 98980 }, { "epoch": 119.40917320458661, "grad_norm": 4.328305244445801, "learning_rate": 1.999761425546881e-05, "loss": 0.0381, "step": 98990 }, { "epoch": 119.42124321062161, "grad_norm": 3.943575620651245, "learning_rate": 1.9997614014214017e-05, "loss": 0.0383, "step": 99000 }, { "epoch": 119.42124321062161, "eval_loss": 13.107011795043945, "eval_runtime": 8.1385, "eval_samples_per_second": 85.642, "eval_steps_per_second": 10.813, "step": 99000 }, { "epoch": 119.43331321665661, "grad_norm": 3.8750534057617188, "learning_rate": 1.9997613772959223e-05, "loss": 0.0398, "step": 99010 }, { "epoch": 119.44538322269162, "grad_norm": 3.6281485557556152, 
"learning_rate": 1.999761353170443e-05, "loss": 0.0376, "step": 99020 }, { "epoch": 119.45745322872662, "grad_norm": 4.053534507751465, "learning_rate": 1.9997613290449636e-05, "loss": 0.0368, "step": 99030 }, { "epoch": 119.46952323476162, "grad_norm": 4.692449569702148, "learning_rate": 1.9997613049194842e-05, "loss": 0.0394, "step": 99040 }, { "epoch": 119.48159324079663, "grad_norm": 4.470062255859375, "learning_rate": 1.999761280794005e-05, "loss": 0.0395, "step": 99050 }, { "epoch": 119.49366324683163, "grad_norm": 4.010348796844482, "learning_rate": 1.9997612566685255e-05, "loss": 0.0386, "step": 99060 }, { "epoch": 119.50573325286662, "grad_norm": 4.2084455490112305, "learning_rate": 1.999761232543046e-05, "loss": 0.0385, "step": 99070 }, { "epoch": 119.51780325890162, "grad_norm": 4.788814544677734, "learning_rate": 1.9997612084175667e-05, "loss": 0.0388, "step": 99080 }, { "epoch": 119.52987326493663, "grad_norm": 4.105197429656982, "learning_rate": 1.9997611842920873e-05, "loss": 0.0377, "step": 99090 }, { "epoch": 119.54194327097163, "grad_norm": 4.174549102783203, "learning_rate": 1.999761160166608e-05, "loss": 0.0411, "step": 99100 }, { "epoch": 119.55401327700663, "grad_norm": 4.232937335968018, "learning_rate": 1.9997611360411286e-05, "loss": 0.0386, "step": 99110 }, { "epoch": 119.56608328304164, "grad_norm": 4.375732898712158, "learning_rate": 1.9997611119156492e-05, "loss": 0.0395, "step": 99120 }, { "epoch": 119.57815328907664, "grad_norm": 4.037395000457764, "learning_rate": 1.9997610877901698e-05, "loss": 0.0389, "step": 99130 }, { "epoch": 119.59022329511164, "grad_norm": 4.084381580352783, "learning_rate": 1.99976106366469e-05, "loss": 0.0387, "step": 99140 }, { "epoch": 119.60229330114664, "grad_norm": 4.351567268371582, "learning_rate": 1.9997610395392107e-05, "loss": 0.0408, "step": 99150 }, { "epoch": 119.61436330718165, "grad_norm": 4.07460355758667, "learning_rate": 1.9997610154137313e-05, "loss": 0.0389, "step": 99160 }, { "epoch": 
119.62643331321665, "grad_norm": 4.297728538513184, "learning_rate": 1.999760991288252e-05, "loss": 0.0406, "step": 99170 }, { "epoch": 119.63850331925165, "grad_norm": 4.084972858428955, "learning_rate": 1.9997609671627726e-05, "loss": 0.0418, "step": 99180 }, { "epoch": 119.65057332528666, "grad_norm": 3.952864170074463, "learning_rate": 1.9997609430372932e-05, "loss": 0.0414, "step": 99190 }, { "epoch": 119.66264333132166, "grad_norm": 4.193751335144043, "learning_rate": 1.9997609189118138e-05, "loss": 0.041, "step": 99200 }, { "epoch": 119.67471333735666, "grad_norm": 4.1302409172058105, "learning_rate": 1.9997608947863344e-05, "loss": 0.0412, "step": 99210 }, { "epoch": 119.68678334339167, "grad_norm": 4.448808670043945, "learning_rate": 1.999760870660855e-05, "loss": 0.0415, "step": 99220 }, { "epoch": 119.69885334942667, "grad_norm": 4.146193504333496, "learning_rate": 1.999760846535376e-05, "loss": 0.0397, "step": 99230 }, { "epoch": 119.71092335546167, "grad_norm": 4.048373222351074, "learning_rate": 1.9997608224098966e-05, "loss": 0.0405, "step": 99240 }, { "epoch": 119.72299336149668, "grad_norm": 4.252946853637695, "learning_rate": 1.9997607982844173e-05, "loss": 0.0414, "step": 99250 }, { "epoch": 119.73506336753168, "grad_norm": 4.671694755554199, "learning_rate": 1.999760774158938e-05, "loss": 0.042, "step": 99260 }, { "epoch": 119.74713337356668, "grad_norm": 4.070289134979248, "learning_rate": 1.9997607500334585e-05, "loss": 0.0414, "step": 99270 }, { "epoch": 119.75920337960169, "grad_norm": 4.060247421264648, "learning_rate": 1.999760725907979e-05, "loss": 0.0404, "step": 99280 }, { "epoch": 119.77127338563669, "grad_norm": 4.125820636749268, "learning_rate": 1.9997607017824998e-05, "loss": 0.041, "step": 99290 }, { "epoch": 119.78334339167169, "grad_norm": 4.317617893218994, "learning_rate": 1.99976067765702e-05, "loss": 0.0411, "step": 99300 }, { "epoch": 119.7954133977067, "grad_norm": 4.072306156158447, "learning_rate": 
1.9997606535315407e-05, "loss": 0.0415, "step": 99310 }, { "epoch": 119.8074834037417, "grad_norm": 4.217224597930908, "learning_rate": 1.9997606294060613e-05, "loss": 0.0393, "step": 99320 }, { "epoch": 119.8195534097767, "grad_norm": 4.344552040100098, "learning_rate": 1.999760605280582e-05, "loss": 0.0417, "step": 99330 }, { "epoch": 119.8316234158117, "grad_norm": 3.9696998596191406, "learning_rate": 1.9997605811551025e-05, "loss": 0.0414, "step": 99340 }, { "epoch": 119.84369342184671, "grad_norm": 4.513197898864746, "learning_rate": 1.999760557029623e-05, "loss": 0.0446, "step": 99350 }, { "epoch": 119.85576342788171, "grad_norm": 4.159745216369629, "learning_rate": 1.9997605329041438e-05, "loss": 0.0415, "step": 99360 }, { "epoch": 119.86783343391672, "grad_norm": 4.3170390129089355, "learning_rate": 1.9997605087786644e-05, "loss": 0.0404, "step": 99370 }, { "epoch": 119.87990343995172, "grad_norm": 3.9942984580993652, "learning_rate": 1.999760484653185e-05, "loss": 0.042, "step": 99380 }, { "epoch": 119.89197344598672, "grad_norm": 4.039107799530029, "learning_rate": 1.9997604605277056e-05, "loss": 0.043, "step": 99390 }, { "epoch": 119.90404345202172, "grad_norm": 4.040081977844238, "learning_rate": 1.9997604364022262e-05, "loss": 0.0419, "step": 99400 }, { "epoch": 119.91611345805673, "grad_norm": 4.03555154800415, "learning_rate": 1.999760412276747e-05, "loss": 0.0401, "step": 99410 }, { "epoch": 119.92818346409173, "grad_norm": 4.128777503967285, "learning_rate": 1.9997603881512675e-05, "loss": 0.0404, "step": 99420 }, { "epoch": 119.94025347012673, "grad_norm": 4.105942249298096, "learning_rate": 1.999760364025788e-05, "loss": 0.044, "step": 99430 }, { "epoch": 119.95232347616174, "grad_norm": 3.920628309249878, "learning_rate": 1.9997603399003087e-05, "loss": 0.0435, "step": 99440 }, { "epoch": 119.96439348219674, "grad_norm": 4.426087856292725, "learning_rate": 1.9997603157748294e-05, "loss": 0.0433, "step": 99450 }, { "epoch": 119.97646348823174, 
"grad_norm": 3.9126853942871094, "learning_rate": 1.99976029164935e-05, "loss": 0.044, "step": 99460 }, { "epoch": 119.98853349426675, "grad_norm": 3.7566256523132324, "learning_rate": 1.9997602675238706e-05, "loss": 0.0433, "step": 99470 }, { "epoch": 120.0, "grad_norm": 7.016258239746094, "learning_rate": 1.9997602433983912e-05, "loss": 0.0416, "step": 99480 }, { "epoch": 120.012070006035, "grad_norm": 3.5196878910064697, "learning_rate": 1.999760219272912e-05, "loss": 0.028, "step": 99490 }, { "epoch": 120.02414001207, "grad_norm": 3.719773769378662, "learning_rate": 1.9997601951474325e-05, "loss": 0.0312, "step": 99500 }, { "epoch": 120.02414001207, "eval_loss": 13.109515190124512, "eval_runtime": 8.1433, "eval_samples_per_second": 85.592, "eval_steps_per_second": 10.806, "step": 99500 }, { "epoch": 120.03621001810501, "grad_norm": 3.696190595626831, "learning_rate": 1.999760171021953e-05, "loss": 0.0285, "step": 99510 }, { "epoch": 120.04828002414001, "grad_norm": 3.8878705501556396, "learning_rate": 1.9997601468964737e-05, "loss": 0.0318, "step": 99520 }, { "epoch": 120.06035003017502, "grad_norm": 3.147923707962036, "learning_rate": 1.9997601227709943e-05, "loss": 0.0298, "step": 99530 }, { "epoch": 120.07242003621002, "grad_norm": 3.8788435459136963, "learning_rate": 1.999760098645515e-05, "loss": 0.0315, "step": 99540 }, { "epoch": 120.08449004224502, "grad_norm": 3.6032423973083496, "learning_rate": 1.9997600745200352e-05, "loss": 0.0321, "step": 99550 }, { "epoch": 120.09656004828003, "grad_norm": 4.002367973327637, "learning_rate": 1.999760050394556e-05, "loss": 0.0345, "step": 99560 }, { "epoch": 120.10863005431503, "grad_norm": 3.454463481903076, "learning_rate": 1.9997600262690765e-05, "loss": 0.033, "step": 99570 }, { "epoch": 120.12070006035003, "grad_norm": 3.9958415031433105, "learning_rate": 1.999760002143597e-05, "loss": 0.0344, "step": 99580 }, { "epoch": 120.13277006638504, "grad_norm": 3.796790361404419, "learning_rate": 
1.9997599780181177e-05, "loss": 0.0337, "step": 99590 }, { "epoch": 120.14484007242004, "grad_norm": 3.879215717315674, "learning_rate": 1.9997599538926383e-05, "loss": 0.0355, "step": 99600 }, { "epoch": 120.15691007845504, "grad_norm": 4.060780048370361, "learning_rate": 1.999759929767159e-05, "loss": 0.0349, "step": 99610 }, { "epoch": 120.16898008449004, "grad_norm": 3.7421905994415283, "learning_rate": 1.9997599056416796e-05, "loss": 0.034, "step": 99620 }, { "epoch": 120.18105009052505, "grad_norm": 3.6241092681884766, "learning_rate": 1.9997598815162002e-05, "loss": 0.0327, "step": 99630 }, { "epoch": 120.19312009656005, "grad_norm": 3.941213369369507, "learning_rate": 1.9997598573907208e-05, "loss": 0.0356, "step": 99640 }, { "epoch": 120.20519010259505, "grad_norm": 3.868720293045044, "learning_rate": 1.9997598332652414e-05, "loss": 0.0358, "step": 99650 }, { "epoch": 120.21726010863006, "grad_norm": 3.614675998687744, "learning_rate": 1.999759809139762e-05, "loss": 0.0369, "step": 99660 }, { "epoch": 120.22933011466506, "grad_norm": 3.659329891204834, "learning_rate": 1.9997597850142827e-05, "loss": 0.0352, "step": 99670 }, { "epoch": 120.24140012070006, "grad_norm": 3.9270029067993164, "learning_rate": 1.9997597608888033e-05, "loss": 0.0356, "step": 99680 }, { "epoch": 120.25347012673507, "grad_norm": 4.357176303863525, "learning_rate": 1.999759736763324e-05, "loss": 0.0381, "step": 99690 }, { "epoch": 120.26554013277007, "grad_norm": 4.01650857925415, "learning_rate": 1.9997597126378446e-05, "loss": 0.0373, "step": 99700 }, { "epoch": 120.27761013880507, "grad_norm": 3.417282819747925, "learning_rate": 1.9997596885123652e-05, "loss": 0.0362, "step": 99710 }, { "epoch": 120.28968014484008, "grad_norm": 3.9088730812072754, "learning_rate": 1.9997596643868858e-05, "loss": 0.0364, "step": 99720 }, { "epoch": 120.30175015087508, "grad_norm": 3.974776029586792, "learning_rate": 1.9997596402614064e-05, "loss": 0.0365, "step": 99730 }, { "epoch": 
120.31382015691008, "grad_norm": 4.011575698852539, "learning_rate": 1.999759616135927e-05, "loss": 0.0356, "step": 99740 }, { "epoch": 120.32589016294509, "grad_norm": 3.8373684883117676, "learning_rate": 1.9997595920104477e-05, "loss": 0.0359, "step": 99750 }, { "epoch": 120.33796016898009, "grad_norm": 3.8341095447540283, "learning_rate": 1.9997595678849683e-05, "loss": 0.0357, "step": 99760 }, { "epoch": 120.35003017501509, "grad_norm": 3.9315993785858154, "learning_rate": 1.999759543759489e-05, "loss": 0.0371, "step": 99770 }, { "epoch": 120.3621001810501, "grad_norm": 3.9439001083374023, "learning_rate": 1.9997595196340095e-05, "loss": 0.0394, "step": 99780 }, { "epoch": 120.3741701870851, "grad_norm": 3.8533260822296143, "learning_rate": 1.99975949550853e-05, "loss": 0.0383, "step": 99790 }, { "epoch": 120.3862401931201, "grad_norm": 3.5911433696746826, "learning_rate": 1.9997594713830508e-05, "loss": 0.0359, "step": 99800 }, { "epoch": 120.3983101991551, "grad_norm": 4.090942859649658, "learning_rate": 1.9997594472575714e-05, "loss": 0.0377, "step": 99810 }, { "epoch": 120.41038020519011, "grad_norm": 4.128181457519531, "learning_rate": 1.999759423132092e-05, "loss": 0.0372, "step": 99820 }, { "epoch": 120.42245021122511, "grad_norm": 4.046856880187988, "learning_rate": 1.9997593990066126e-05, "loss": 0.0358, "step": 99830 }, { "epoch": 120.43452021726011, "grad_norm": 3.930279016494751, "learning_rate": 1.9997593748811333e-05, "loss": 0.0377, "step": 99840 }, { "epoch": 120.44659022329512, "grad_norm": 4.201222896575928, "learning_rate": 1.999759350755654e-05, "loss": 0.0377, "step": 99850 }, { "epoch": 120.45866022933012, "grad_norm": 3.741835117340088, "learning_rate": 1.9997593266301745e-05, "loss": 0.0383, "step": 99860 }, { "epoch": 120.47073023536512, "grad_norm": 3.990391731262207, "learning_rate": 1.999759302504695e-05, "loss": 0.0374, "step": 99870 }, { "epoch": 120.48280024140013, "grad_norm": 4.3235321044921875, "learning_rate": 
1.9997592783792157e-05, "loss": 0.0396, "step": 99880 }, { "epoch": 120.49487024743513, "grad_norm": 3.9810664653778076, "learning_rate": 1.9997592542537364e-05, "loss": 0.0395, "step": 99890 }, { "epoch": 120.50694025347012, "grad_norm": 3.8914501667022705, "learning_rate": 1.999759230128257e-05, "loss": 0.0393, "step": 99900 }, { "epoch": 120.51901025950512, "grad_norm": 3.6992921829223633, "learning_rate": 1.9997592060027776e-05, "loss": 0.0391, "step": 99910 }, { "epoch": 120.53108026554013, "grad_norm": 3.9870615005493164, "learning_rate": 1.9997591818772982e-05, "loss": 0.0374, "step": 99920 }, { "epoch": 120.54315027157513, "grad_norm": 3.4901745319366455, "learning_rate": 1.999759157751819e-05, "loss": 0.0406, "step": 99930 }, { "epoch": 120.55522027761013, "grad_norm": 3.8963449001312256, "learning_rate": 1.9997591336263395e-05, "loss": 0.0395, "step": 99940 }, { "epoch": 120.56729028364514, "grad_norm": 3.902458429336548, "learning_rate": 1.99975910950086e-05, "loss": 0.0386, "step": 99950 }, { "epoch": 120.57936028968014, "grad_norm": 3.987915515899658, "learning_rate": 1.9997590853753807e-05, "loss": 0.0402, "step": 99960 }, { "epoch": 120.59143029571514, "grad_norm": 4.085399150848389, "learning_rate": 1.999759061249901e-05, "loss": 0.04, "step": 99970 }, { "epoch": 120.60350030175015, "grad_norm": 4.143064022064209, "learning_rate": 1.9997590371244216e-05, "loss": 0.0383, "step": 99980 }, { "epoch": 120.61557030778515, "grad_norm": 4.516252040863037, "learning_rate": 1.9997590129989422e-05, "loss": 0.0394, "step": 99990 }, { "epoch": 120.62764031382015, "grad_norm": 4.2454657554626465, "learning_rate": 1.999758988873463e-05, "loss": 0.0412, "step": 100000 }, { "epoch": 120.62764031382015, "eval_loss": 13.124235153198242, "eval_runtime": 8.1438, "eval_samples_per_second": 85.586, "eval_steps_per_second": 10.806, "step": 100000 }, { "epoch": 120.63971031985515, "grad_norm": 4.202574253082275, "learning_rate": 1.9997589647479835e-05, "loss": 0.0406, 
"step": 100010 }, { "epoch": 120.65178032589016, "grad_norm": 4.60035514831543, "learning_rate": 1.999758940622504e-05, "loss": 0.0407, "step": 100020 }, { "epoch": 120.66385033192516, "grad_norm": 3.808493137359619, "learning_rate": 1.9997589164970247e-05, "loss": 0.0403, "step": 100030 }, { "epoch": 120.67592033796016, "grad_norm": 4.074012279510498, "learning_rate": 1.9997588923715453e-05, "loss": 0.0397, "step": 100040 }, { "epoch": 120.68799034399517, "grad_norm": 3.68994402885437, "learning_rate": 1.999758868246066e-05, "loss": 0.0405, "step": 100050 }, { "epoch": 120.70006035003017, "grad_norm": 4.114231586456299, "learning_rate": 1.9997588441205866e-05, "loss": 0.0423, "step": 100060 }, { "epoch": 120.71213035606517, "grad_norm": 4.104283332824707, "learning_rate": 1.9997588199951072e-05, "loss": 0.0405, "step": 100070 }, { "epoch": 120.72420036210018, "grad_norm": 4.173361778259277, "learning_rate": 1.999758795869628e-05, "loss": 0.0385, "step": 100080 }, { "epoch": 120.73627036813518, "grad_norm": 4.073586463928223, "learning_rate": 1.9997587717441485e-05, "loss": 0.0373, "step": 100090 }, { "epoch": 120.74834037417018, "grad_norm": 4.477113246917725, "learning_rate": 1.999758747618669e-05, "loss": 0.0404, "step": 100100 }, { "epoch": 120.76041038020519, "grad_norm": 3.9831693172454834, "learning_rate": 1.9997587234931897e-05, "loss": 0.0393, "step": 100110 }, { "epoch": 120.77248038624019, "grad_norm": 3.839715003967285, "learning_rate": 1.9997586993677103e-05, "loss": 0.0418, "step": 100120 }, { "epoch": 120.7845503922752, "grad_norm": 4.30235481262207, "learning_rate": 1.999758675242231e-05, "loss": 0.04, "step": 100130 }, { "epoch": 120.7966203983102, "grad_norm": 4.697261810302734, "learning_rate": 1.9997586511167516e-05, "loss": 0.0416, "step": 100140 }, { "epoch": 120.8086904043452, "grad_norm": 3.717952013015747, "learning_rate": 1.9997586269912722e-05, "loss": 0.0401, "step": 100150 }, { "epoch": 120.8207604103802, "grad_norm": 4.10770320892334, 
"learning_rate": 1.9997586028657928e-05, "loss": 0.0422, "step": 100160 }, { "epoch": 120.8328304164152, "grad_norm": 4.284467697143555, "learning_rate": 1.9997585787403134e-05, "loss": 0.0424, "step": 100170 }, { "epoch": 120.84490042245021, "grad_norm": 3.9528160095214844, "learning_rate": 1.999758554614834e-05, "loss": 0.0416, "step": 100180 }, { "epoch": 120.85697042848521, "grad_norm": 4.335157871246338, "learning_rate": 1.9997585304893547e-05, "loss": 0.0422, "step": 100190 }, { "epoch": 120.86904043452022, "grad_norm": 4.08420991897583, "learning_rate": 1.9997585063638753e-05, "loss": 0.0427, "step": 100200 }, { "epoch": 120.88111044055522, "grad_norm": 4.1882123947143555, "learning_rate": 1.999758482238396e-05, "loss": 0.044, "step": 100210 }, { "epoch": 120.89318044659022, "grad_norm": 3.942596673965454, "learning_rate": 1.9997584581129162e-05, "loss": 0.0408, "step": 100220 }, { "epoch": 120.90525045262522, "grad_norm": 3.9085822105407715, "learning_rate": 1.9997584339874368e-05, "loss": 0.0411, "step": 100230 }, { "epoch": 120.91732045866023, "grad_norm": 4.116026878356934, "learning_rate": 1.9997584098619574e-05, "loss": 0.044, "step": 100240 }, { "epoch": 120.92939046469523, "grad_norm": 4.191659927368164, "learning_rate": 1.999758385736478e-05, "loss": 0.042, "step": 100250 }, { "epoch": 120.94146047073023, "grad_norm": 4.5647358894348145, "learning_rate": 1.9997583616109987e-05, "loss": 0.0437, "step": 100260 }, { "epoch": 120.95353047676524, "grad_norm": 4.145966529846191, "learning_rate": 1.9997583374855193e-05, "loss": 0.0422, "step": 100270 }, { "epoch": 120.96560048280024, "grad_norm": 3.9680683612823486, "learning_rate": 1.99975831336004e-05, "loss": 0.0424, "step": 100280 }, { "epoch": 120.97767048883524, "grad_norm": 4.040521144866943, "learning_rate": 1.9997582892345605e-05, "loss": 0.041, "step": 100290 }, { "epoch": 120.98974049487025, "grad_norm": 4.010721206665039, "learning_rate": 1.9997582651090812e-05, "loss": 0.0419, "step": 100300 
}, { "epoch": 121.0012070006035, "grad_norm": 3.3242685794830322, "learning_rate": 1.999758240983602e-05, "loss": 0.0402, "step": 100310 }, { "epoch": 121.0132770066385, "grad_norm": 3.2751331329345703, "learning_rate": 1.9997582168581228e-05, "loss": 0.0273, "step": 100320 }, { "epoch": 121.0253470126735, "grad_norm": 3.417888641357422, "learning_rate": 1.9997581927326434e-05, "loss": 0.0306, "step": 100330 }, { "epoch": 121.03741701870851, "grad_norm": 3.8739168643951416, "learning_rate": 1.999758168607164e-05, "loss": 0.0296, "step": 100340 }, { "epoch": 121.04948702474351, "grad_norm": 3.1351661682128906, "learning_rate": 1.9997581444816846e-05, "loss": 0.0303, "step": 100350 }, { "epoch": 121.06155703077852, "grad_norm": 3.486982583999634, "learning_rate": 1.9997581203562052e-05, "loss": 0.0313, "step": 100360 }, { "epoch": 121.07362703681352, "grad_norm": 3.303779363632202, "learning_rate": 1.999758096230726e-05, "loss": 0.0302, "step": 100370 }, { "epoch": 121.08569704284852, "grad_norm": 3.979918956756592, "learning_rate": 1.999758072105246e-05, "loss": 0.0347, "step": 100380 }, { "epoch": 121.09776704888353, "grad_norm": 3.42975115776062, "learning_rate": 1.9997580479797668e-05, "loss": 0.032, "step": 100390 }, { "epoch": 121.10983705491853, "grad_norm": 3.868229866027832, "learning_rate": 1.9997580238542874e-05, "loss": 0.0343, "step": 100400 }, { "epoch": 121.12190706095353, "grad_norm": 3.8177759647369385, "learning_rate": 1.999757999728808e-05, "loss": 0.0331, "step": 100410 }, { "epoch": 121.13397706698854, "grad_norm": 3.5683751106262207, "learning_rate": 1.9997579756033286e-05, "loss": 0.0339, "step": 100420 }, { "epoch": 121.14604707302354, "grad_norm": 3.2446744441986084, "learning_rate": 1.9997579514778493e-05, "loss": 0.0322, "step": 100430 }, { "epoch": 121.15811707905854, "grad_norm": 3.878678321838379, "learning_rate": 1.99975792735237e-05, "loss": 0.033, "step": 100440 }, { "epoch": 121.17018708509354, "grad_norm": 3.842479705810547, 
"learning_rate": 1.9997579032268905e-05, "loss": 0.035, "step": 100450 }, { "epoch": 121.18225709112855, "grad_norm": 3.7656521797180176, "learning_rate": 1.999757879101411e-05, "loss": 0.0336, "step": 100460 }, { "epoch": 121.19432709716355, "grad_norm": 3.341787815093994, "learning_rate": 1.9997578549759317e-05, "loss": 0.0326, "step": 100470 }, { "epoch": 121.20639710319855, "grad_norm": 3.3202099800109863, "learning_rate": 1.9997578308504524e-05, "loss": 0.0347, "step": 100480 }, { "epoch": 121.21846710923356, "grad_norm": 3.94917368888855, "learning_rate": 1.999757806724973e-05, "loss": 0.0344, "step": 100490 }, { "epoch": 121.23053711526856, "grad_norm": 3.8929977416992188, "learning_rate": 1.9997577825994936e-05, "loss": 0.0319, "step": 100500 }, { "epoch": 121.23053711526856, "eval_loss": 13.125794410705566, "eval_runtime": 8.1432, "eval_samples_per_second": 85.593, "eval_steps_per_second": 10.807, "step": 100500 }, { "epoch": 121.24260712130356, "grad_norm": 3.715175151824951, "learning_rate": 1.9997577584740142e-05, "loss": 0.0354, "step": 100510 }, { "epoch": 121.25467712733857, "grad_norm": 3.966130256652832, "learning_rate": 1.999757734348535e-05, "loss": 0.0347, "step": 100520 }, { "epoch": 121.26674713337357, "grad_norm": 3.7094414234161377, "learning_rate": 1.9997577102230555e-05, "loss": 0.034, "step": 100530 }, { "epoch": 121.27881713940857, "grad_norm": 4.57342529296875, "learning_rate": 1.999757686097576e-05, "loss": 0.0372, "step": 100540 }, { "epoch": 121.29088714544358, "grad_norm": 3.82431960105896, "learning_rate": 1.9997576619720967e-05, "loss": 0.0375, "step": 100550 }, { "epoch": 121.30295715147858, "grad_norm": 4.15370512008667, "learning_rate": 1.9997576378466173e-05, "loss": 0.0365, "step": 100560 }, { "epoch": 121.31502715751358, "grad_norm": 3.799741506576538, "learning_rate": 1.999757613721138e-05, "loss": 0.0361, "step": 100570 }, { "epoch": 121.32709716354859, "grad_norm": 3.6344640254974365, "learning_rate": 
1.9997575895956586e-05, "loss": 0.0352, "step": 100580 }, { "epoch": 121.33916716958359, "grad_norm": 4.332074165344238, "learning_rate": 1.9997575654701792e-05, "loss": 0.0381, "step": 100590 }, { "epoch": 121.35123717561859, "grad_norm": 3.987273693084717, "learning_rate": 1.9997575413446998e-05, "loss": 0.0388, "step": 100600 }, { "epoch": 121.3633071816536, "grad_norm": 3.6102523803710938, "learning_rate": 1.9997575172192204e-05, "loss": 0.0375, "step": 100610 }, { "epoch": 121.3753771876886, "grad_norm": 3.2528929710388184, "learning_rate": 1.999757493093741e-05, "loss": 0.0374, "step": 100620 }, { "epoch": 121.3874471937236, "grad_norm": 3.7028939723968506, "learning_rate": 1.9997574689682613e-05, "loss": 0.0381, "step": 100630 }, { "epoch": 121.3995171997586, "grad_norm": 4.78230619430542, "learning_rate": 1.999757444842782e-05, "loss": 0.0408, "step": 100640 }, { "epoch": 121.41158720579361, "grad_norm": 4.213773250579834, "learning_rate": 1.9997574207173026e-05, "loss": 0.0381, "step": 100650 }, { "epoch": 121.42365721182861, "grad_norm": 4.022919654846191, "learning_rate": 1.9997573965918232e-05, "loss": 0.0375, "step": 100660 }, { "epoch": 121.43572721786362, "grad_norm": 3.651129961013794, "learning_rate": 1.9997573724663438e-05, "loss": 0.0373, "step": 100670 }, { "epoch": 121.44779722389862, "grad_norm": 3.6195762157440186, "learning_rate": 1.9997573483408645e-05, "loss": 0.0367, "step": 100680 }, { "epoch": 121.45986722993362, "grad_norm": 3.8749568462371826, "learning_rate": 1.999757324215385e-05, "loss": 0.0358, "step": 100690 }, { "epoch": 121.47193723596862, "grad_norm": 4.274978160858154, "learning_rate": 1.9997573000899057e-05, "loss": 0.0364, "step": 100700 }, { "epoch": 121.48400724200363, "grad_norm": 4.177975177764893, "learning_rate": 1.9997572759644263e-05, "loss": 0.0395, "step": 100710 }, { "epoch": 121.49607724803863, "grad_norm": 4.118191242218018, "learning_rate": 1.999757251838947e-05, "loss": 0.0387, "step": 100720 }, { "epoch": 
121.50814725407362, "grad_norm": 4.24370813369751, "learning_rate": 1.9997572277134676e-05, "loss": 0.0383, "step": 100730 }, { "epoch": 121.52021726010862, "grad_norm": 3.7583694458007812, "learning_rate": 1.9997572035879882e-05, "loss": 0.0389, "step": 100740 }, { "epoch": 121.53228726614363, "grad_norm": 3.9437265396118164, "learning_rate": 1.9997571794625088e-05, "loss": 0.0385, "step": 100750 }, { "epoch": 121.54435727217863, "grad_norm": 3.931945323944092, "learning_rate": 1.9997571553370294e-05, "loss": 0.0377, "step": 100760 }, { "epoch": 121.55642727821363, "grad_norm": 4.056408405303955, "learning_rate": 1.99975713121155e-05, "loss": 0.0391, "step": 100770 }, { "epoch": 121.56849728424864, "grad_norm": 4.626657009124756, "learning_rate": 1.9997571070860707e-05, "loss": 0.0387, "step": 100780 }, { "epoch": 121.58056729028364, "grad_norm": 4.259347438812256, "learning_rate": 1.9997570829605913e-05, "loss": 0.0399, "step": 100790 }, { "epoch": 121.59263729631864, "grad_norm": 3.999370574951172, "learning_rate": 1.999757058835112e-05, "loss": 0.0399, "step": 100800 }, { "epoch": 121.60470730235365, "grad_norm": 4.062256336212158, "learning_rate": 1.9997570347096325e-05, "loss": 0.0403, "step": 100810 }, { "epoch": 121.61677730838865, "grad_norm": 3.98039174079895, "learning_rate": 1.999757010584153e-05, "loss": 0.0391, "step": 100820 }, { "epoch": 121.62884731442365, "grad_norm": 3.7703113555908203, "learning_rate": 1.9997569864586738e-05, "loss": 0.0396, "step": 100830 }, { "epoch": 121.64091732045866, "grad_norm": 3.587609052658081, "learning_rate": 1.9997569623331944e-05, "loss": 0.0394, "step": 100840 }, { "epoch": 121.65298732649366, "grad_norm": 3.821660041809082, "learning_rate": 1.999756938207715e-05, "loss": 0.0391, "step": 100850 }, { "epoch": 121.66505733252866, "grad_norm": 3.8933181762695312, "learning_rate": 1.9997569140822356e-05, "loss": 0.0393, "step": 100860 }, { "epoch": 121.67712733856366, "grad_norm": 4.10672664642334, "learning_rate": 
1.9997568899567563e-05, "loss": 0.0401, "step": 100870 }, { "epoch": 121.68919734459867, "grad_norm": 3.9357833862304688, "learning_rate": 1.999756865831277e-05, "loss": 0.0403, "step": 100880 }, { "epoch": 121.70126735063367, "grad_norm": 3.7023792266845703, "learning_rate": 1.9997568417057975e-05, "loss": 0.0384, "step": 100890 }, { "epoch": 121.71333735666867, "grad_norm": 4.177844524383545, "learning_rate": 1.999756817580318e-05, "loss": 0.0392, "step": 100900 }, { "epoch": 121.72540736270368, "grad_norm": 3.9383747577667236, "learning_rate": 1.9997567934548387e-05, "loss": 0.0397, "step": 100910 }, { "epoch": 121.73747736873868, "grad_norm": 3.9387104511260986, "learning_rate": 1.9997567693293594e-05, "loss": 0.0398, "step": 100920 }, { "epoch": 121.74954737477368, "grad_norm": 4.273258209228516, "learning_rate": 1.99975674520388e-05, "loss": 0.0406, "step": 100930 }, { "epoch": 121.76161738080869, "grad_norm": 4.0443596839904785, "learning_rate": 1.9997567210784006e-05, "loss": 0.0409, "step": 100940 }, { "epoch": 121.77368738684369, "grad_norm": 3.9121475219726562, "learning_rate": 1.9997566969529212e-05, "loss": 0.0419, "step": 100950 }, { "epoch": 121.7857573928787, "grad_norm": 4.360174655914307, "learning_rate": 1.999756672827442e-05, "loss": 0.0402, "step": 100960 }, { "epoch": 121.7978273989137, "grad_norm": 4.603154182434082, "learning_rate": 1.9997566487019625e-05, "loss": 0.0417, "step": 100970 }, { "epoch": 121.8098974049487, "grad_norm": 3.8451709747314453, "learning_rate": 1.999756624576483e-05, "loss": 0.0405, "step": 100980 }, { "epoch": 121.8219674109837, "grad_norm": 3.923220634460449, "learning_rate": 1.9997566004510037e-05, "loss": 0.0395, "step": 100990 }, { "epoch": 121.8340374170187, "grad_norm": 4.4893388748168945, "learning_rate": 1.9997565763255243e-05, "loss": 0.0422, "step": 101000 }, { "epoch": 121.8340374170187, "eval_loss": 13.15639877319336, "eval_runtime": 8.1261, "eval_samples_per_second": 85.773, "eval_steps_per_second": 
10.829, "step": 101000 }, { "epoch": 121.84610742305371, "grad_norm": 4.10550594329834, "learning_rate": 1.999756552200045e-05, "loss": 0.0428, "step": 101010 }, { "epoch": 121.85817742908871, "grad_norm": 4.198608875274658, "learning_rate": 1.9997565280745656e-05, "loss": 0.0406, "step": 101020 }, { "epoch": 121.87024743512372, "grad_norm": 3.790095329284668, "learning_rate": 1.9997565039490862e-05, "loss": 0.0415, "step": 101030 }, { "epoch": 121.88231744115872, "grad_norm": 4.417845249176025, "learning_rate": 1.9997564798236065e-05, "loss": 0.0415, "step": 101040 }, { "epoch": 121.89438744719372, "grad_norm": 3.99086594581604, "learning_rate": 1.999756455698127e-05, "loss": 0.039, "step": 101050 }, { "epoch": 121.90645745322873, "grad_norm": 3.84238338470459, "learning_rate": 1.9997564315726477e-05, "loss": 0.0421, "step": 101060 }, { "epoch": 121.91852745926373, "grad_norm": 3.943364381790161, "learning_rate": 1.9997564074471684e-05, "loss": 0.0381, "step": 101070 }, { "epoch": 121.93059746529873, "grad_norm": 4.037533283233643, "learning_rate": 1.999756383321689e-05, "loss": 0.0387, "step": 101080 }, { "epoch": 121.94266747133373, "grad_norm": 3.8901150226593018, "learning_rate": 1.9997563591962096e-05, "loss": 0.0415, "step": 101090 }, { "epoch": 121.95473747736874, "grad_norm": 4.40377950668335, "learning_rate": 1.9997563350707302e-05, "loss": 0.0395, "step": 101100 }, { "epoch": 121.96680748340374, "grad_norm": 4.432181358337402, "learning_rate": 1.999756310945251e-05, "loss": 0.0415, "step": 101110 }, { "epoch": 121.97887748943874, "grad_norm": 4.620677471160889, "learning_rate": 1.9997562868197715e-05, "loss": 0.0426, "step": 101120 }, { "epoch": 121.99094749547375, "grad_norm": 4.681186199188232, "learning_rate": 1.999756262694292e-05, "loss": 0.0442, "step": 101130 }, { "epoch": 122.002414001207, "grad_norm": 3.919302225112915, "learning_rate": 1.9997562385688127e-05, "loss": 0.038, "step": 101140 }, { "epoch": 122.014484007242, "grad_norm": 
3.4205539226531982, "learning_rate": 1.9997562144433333e-05, "loss": 0.0299, "step": 101150 }, { "epoch": 122.026554013277, "grad_norm": 3.6942203044891357, "learning_rate": 1.999756190317854e-05, "loss": 0.0285, "step": 101160 }, { "epoch": 122.03862401931201, "grad_norm": 3.391953706741333, "learning_rate": 1.9997561661923746e-05, "loss": 0.0314, "step": 101170 }, { "epoch": 122.05069402534701, "grad_norm": 3.9859471321105957, "learning_rate": 1.9997561420668952e-05, "loss": 0.0329, "step": 101180 }, { "epoch": 122.06276403138202, "grad_norm": 3.2587437629699707, "learning_rate": 1.9997561179414158e-05, "loss": 0.0317, "step": 101190 }, { "epoch": 122.07483403741702, "grad_norm": 3.4373104572296143, "learning_rate": 1.9997560938159364e-05, "loss": 0.0306, "step": 101200 }, { "epoch": 122.08690404345202, "grad_norm": 3.313448905944824, "learning_rate": 1.999756069690457e-05, "loss": 0.0334, "step": 101210 }, { "epoch": 122.09897404948703, "grad_norm": 3.2866783142089844, "learning_rate": 1.9997560455649777e-05, "loss": 0.0324, "step": 101220 }, { "epoch": 122.11104405552203, "grad_norm": 3.371242046356201, "learning_rate": 1.9997560214394983e-05, "loss": 0.032, "step": 101230 }, { "epoch": 122.12311406155703, "grad_norm": 3.7673583030700684, "learning_rate": 1.999755997314019e-05, "loss": 0.0333, "step": 101240 }, { "epoch": 122.13518406759204, "grad_norm": 3.7306764125823975, "learning_rate": 1.9997559731885395e-05, "loss": 0.032, "step": 101250 }, { "epoch": 122.14725407362704, "grad_norm": 3.537174940109253, "learning_rate": 1.99975594906306e-05, "loss": 0.0352, "step": 101260 }, { "epoch": 122.15932407966204, "grad_norm": 3.973825454711914, "learning_rate": 1.9997559249375808e-05, "loss": 0.034, "step": 101270 }, { "epoch": 122.17139408569705, "grad_norm": 3.4969537258148193, "learning_rate": 1.9997559008121014e-05, "loss": 0.0343, "step": 101280 }, { "epoch": 122.18346409173205, "grad_norm": 3.9146339893341064, "learning_rate": 1.9997558766866217e-05, "loss": 
0.0331, "step": 101290 }, { "epoch": 122.19553409776705, "grad_norm": 3.8656487464904785, "learning_rate": 1.9997558525611423e-05, "loss": 0.0342, "step": 101300 }, { "epoch": 122.20760410380205, "grad_norm": 3.5343334674835205, "learning_rate": 1.999755828435663e-05, "loss": 0.034, "step": 101310 }, { "epoch": 122.21967410983706, "grad_norm": 4.228699207305908, "learning_rate": 1.9997558043101836e-05, "loss": 0.034, "step": 101320 }, { "epoch": 122.23174411587206, "grad_norm": 3.6222822666168213, "learning_rate": 1.9997557801847042e-05, "loss": 0.0333, "step": 101330 }, { "epoch": 122.24381412190706, "grad_norm": 3.4758081436157227, "learning_rate": 1.9997557560592248e-05, "loss": 0.035, "step": 101340 }, { "epoch": 122.25588412794207, "grad_norm": 3.5741989612579346, "learning_rate": 1.9997557319337454e-05, "loss": 0.0344, "step": 101350 }, { "epoch": 122.26795413397707, "grad_norm": 3.821990489959717, "learning_rate": 1.999755707808266e-05, "loss": 0.0357, "step": 101360 }, { "epoch": 122.28002414001207, "grad_norm": 4.111660957336426, "learning_rate": 1.9997556836827867e-05, "loss": 0.0359, "step": 101370 }, { "epoch": 122.29209414604708, "grad_norm": 3.7051024436950684, "learning_rate": 1.9997556595573073e-05, "loss": 0.0367, "step": 101380 }, { "epoch": 122.30416415208208, "grad_norm": 3.1741645336151123, "learning_rate": 1.9997556354318282e-05, "loss": 0.0346, "step": 101390 }, { "epoch": 122.31623415811708, "grad_norm": 3.802342653274536, "learning_rate": 1.999755611306349e-05, "loss": 0.0363, "step": 101400 }, { "epoch": 122.32830416415209, "grad_norm": 4.069873332977295, "learning_rate": 1.9997555871808695e-05, "loss": 0.035, "step": 101410 }, { "epoch": 122.34037417018709, "grad_norm": 3.9225687980651855, "learning_rate": 1.99975556305539e-05, "loss": 0.0355, "step": 101420 }, { "epoch": 122.3524441762221, "grad_norm": 4.052384376525879, "learning_rate": 1.9997555389299107e-05, "loss": 0.035, "step": 101430 }, { "epoch": 122.3645141822571, "grad_norm": 
4.384099960327148, "learning_rate": 1.9997555148044313e-05, "loss": 0.0377, "step": 101440 }, { "epoch": 122.3765841882921, "grad_norm": 3.9954562187194824, "learning_rate": 1.999755490678952e-05, "loss": 0.0377, "step": 101450 }, { "epoch": 122.3886541943271, "grad_norm": 4.172965049743652, "learning_rate": 1.9997554665534723e-05, "loss": 0.0376, "step": 101460 }, { "epoch": 122.4007242003621, "grad_norm": 4.227593421936035, "learning_rate": 1.999755442427993e-05, "loss": 0.0391, "step": 101470 }, { "epoch": 122.41279420639711, "grad_norm": 4.184483051300049, "learning_rate": 1.9997554183025135e-05, "loss": 0.0385, "step": 101480 }, { "epoch": 122.42486421243211, "grad_norm": 3.6761600971221924, "learning_rate": 1.999755394177034e-05, "loss": 0.038, "step": 101490 }, { "epoch": 122.43693421846712, "grad_norm": 4.229344367980957, "learning_rate": 1.9997553700515547e-05, "loss": 0.0364, "step": 101500 }, { "epoch": 122.43693421846712, "eval_loss": 13.157330513000488, "eval_runtime": 8.1399, "eval_samples_per_second": 85.628, "eval_steps_per_second": 10.811, "step": 101500 }, { "epoch": 122.44900422450212, "grad_norm": 3.7412827014923096, "learning_rate": 1.9997553459260754e-05, "loss": 0.037, "step": 101510 }, { "epoch": 122.46107423053712, "grad_norm": 3.797206401824951, "learning_rate": 1.999755321800596e-05, "loss": 0.0363, "step": 101520 }, { "epoch": 122.47314423657213, "grad_norm": 3.7478015422821045, "learning_rate": 1.9997552976751166e-05, "loss": 0.0378, "step": 101530 }, { "epoch": 122.48521424260713, "grad_norm": 4.354527950286865, "learning_rate": 1.9997552735496372e-05, "loss": 0.0366, "step": 101540 }, { "epoch": 122.49728424864213, "grad_norm": 3.85866641998291, "learning_rate": 1.999755249424158e-05, "loss": 0.0391, "step": 101550 }, { "epoch": 122.50935425467712, "grad_norm": 4.01302433013916, "learning_rate": 1.9997552252986785e-05, "loss": 0.0385, "step": 101560 }, { "epoch": 122.52142426071212, "grad_norm": 3.571206569671631, "learning_rate": 
1.999755201173199e-05, "loss": 0.0365, "step": 101570 }, { "epoch": 122.53349426674713, "grad_norm": 4.254507064819336, "learning_rate": 1.9997551770477197e-05, "loss": 0.0373, "step": 101580 }, { "epoch": 122.54556427278213, "grad_norm": 3.6187782287597656, "learning_rate": 1.9997551529222403e-05, "loss": 0.0381, "step": 101590 }, { "epoch": 122.55763427881713, "grad_norm": 4.11676025390625, "learning_rate": 1.999755128796761e-05, "loss": 0.0377, "step": 101600 }, { "epoch": 122.56970428485214, "grad_norm": 3.80855655670166, "learning_rate": 1.9997551046712816e-05, "loss": 0.0369, "step": 101610 }, { "epoch": 122.58177429088714, "grad_norm": 4.253232002258301, "learning_rate": 1.9997550805458022e-05, "loss": 0.0394, "step": 101620 }, { "epoch": 122.59384429692214, "grad_norm": 3.761000394821167, "learning_rate": 1.9997550564203228e-05, "loss": 0.0376, "step": 101630 }, { "epoch": 122.60591430295715, "grad_norm": 4.164488315582275, "learning_rate": 1.9997550322948434e-05, "loss": 0.0401, "step": 101640 }, { "epoch": 122.61798430899215, "grad_norm": 3.943253517150879, "learning_rate": 1.999755008169364e-05, "loss": 0.0393, "step": 101650 }, { "epoch": 122.63005431502715, "grad_norm": 4.1845574378967285, "learning_rate": 1.9997549840438847e-05, "loss": 0.0393, "step": 101660 }, { "epoch": 122.64212432106216, "grad_norm": 4.120703220367432, "learning_rate": 1.9997549599184053e-05, "loss": 0.0384, "step": 101670 }, { "epoch": 122.65419432709716, "grad_norm": 4.108944892883301, "learning_rate": 1.999754935792926e-05, "loss": 0.0387, "step": 101680 }, { "epoch": 122.66626433313216, "grad_norm": 3.890263557434082, "learning_rate": 1.9997549116674465e-05, "loss": 0.0392, "step": 101690 }, { "epoch": 122.67833433916717, "grad_norm": 3.8378007411956787, "learning_rate": 1.9997548875419672e-05, "loss": 0.0385, "step": 101700 }, { "epoch": 122.69040434520217, "grad_norm": 4.495421886444092, "learning_rate": 1.9997548634164875e-05, "loss": 0.0396, "step": 101710 }, { "epoch": 
122.70247435123717, "grad_norm": 4.255934715270996, "learning_rate": 1.999754839291008e-05, "loss": 0.0416, "step": 101720 }, { "epoch": 122.71454435727217, "grad_norm": 3.817821979522705, "learning_rate": 1.9997548151655287e-05, "loss": 0.0393, "step": 101730 }, { "epoch": 122.72661436330718, "grad_norm": 4.12547492980957, "learning_rate": 1.9997547910400493e-05, "loss": 0.0396, "step": 101740 }, { "epoch": 122.73868436934218, "grad_norm": 4.17122745513916, "learning_rate": 1.99975476691457e-05, "loss": 0.039, "step": 101750 }, { "epoch": 122.75075437537718, "grad_norm": 4.121119022369385, "learning_rate": 1.9997547427890906e-05, "loss": 0.0391, "step": 101760 }, { "epoch": 122.76282438141219, "grad_norm": 4.123166084289551, "learning_rate": 1.9997547186636112e-05, "loss": 0.0384, "step": 101770 }, { "epoch": 122.77489438744719, "grad_norm": 4.217471599578857, "learning_rate": 1.9997546945381318e-05, "loss": 0.0404, "step": 101780 }, { "epoch": 122.7869643934822, "grad_norm": 3.976078748703003, "learning_rate": 1.9997546704126524e-05, "loss": 0.0416, "step": 101790 }, { "epoch": 122.7990343995172, "grad_norm": 3.706962823867798, "learning_rate": 1.999754646287173e-05, "loss": 0.0422, "step": 101800 }, { "epoch": 122.8111044055522, "grad_norm": 4.33259916305542, "learning_rate": 1.9997546221616937e-05, "loss": 0.0418, "step": 101810 }, { "epoch": 122.8231744115872, "grad_norm": 4.274216651916504, "learning_rate": 1.9997545980362143e-05, "loss": 0.0395, "step": 101820 }, { "epoch": 122.8352444176222, "grad_norm": 3.9903154373168945, "learning_rate": 1.999754573910735e-05, "loss": 0.0427, "step": 101830 }, { "epoch": 122.84731442365721, "grad_norm": 4.5307841300964355, "learning_rate": 1.9997545497852555e-05, "loss": 0.0403, "step": 101840 }, { "epoch": 122.85938442969221, "grad_norm": 4.113959789276123, "learning_rate": 1.999754525659776e-05, "loss": 0.0434, "step": 101850 }, { "epoch": 122.87145443572722, "grad_norm": 4.842182159423828, "learning_rate": 
1.9997545015342968e-05, "loss": 0.0434, "step": 101860 }, { "epoch": 122.88352444176222, "grad_norm": 3.9237380027770996, "learning_rate": 1.9997544774088174e-05, "loss": 0.0406, "step": 101870 }, { "epoch": 122.89559444779722, "grad_norm": 4.262727737426758, "learning_rate": 1.999754453283338e-05, "loss": 0.0404, "step": 101880 }, { "epoch": 122.90766445383223, "grad_norm": 4.214314937591553, "learning_rate": 1.9997544291578586e-05, "loss": 0.043, "step": 101890 }, { "epoch": 122.91973445986723, "grad_norm": 4.430696964263916, "learning_rate": 1.9997544050323793e-05, "loss": 0.0426, "step": 101900 }, { "epoch": 122.93180446590223, "grad_norm": 4.627969741821289, "learning_rate": 1.9997543809069e-05, "loss": 0.0418, "step": 101910 }, { "epoch": 122.94387447193724, "grad_norm": 4.145615577697754, "learning_rate": 1.9997543567814205e-05, "loss": 0.0407, "step": 101920 }, { "epoch": 122.95594447797224, "grad_norm": 4.42965841293335, "learning_rate": 1.999754332655941e-05, "loss": 0.0428, "step": 101930 }, { "epoch": 122.96801448400724, "grad_norm": 4.064378261566162, "learning_rate": 1.9997543085304617e-05, "loss": 0.0429, "step": 101940 }, { "epoch": 122.98008449004224, "grad_norm": 4.004546642303467, "learning_rate": 1.9997542844049824e-05, "loss": 0.0412, "step": 101950 }, { "epoch": 122.99215449607725, "grad_norm": 3.977198839187622, "learning_rate": 1.999754260279503e-05, "loss": 0.0413, "step": 101960 }, { "epoch": 123.0036210018105, "grad_norm": 2.937232255935669, "learning_rate": 1.9997542361540236e-05, "loss": 0.0363, "step": 101970 }, { "epoch": 123.0156910078455, "grad_norm": 3.817650556564331, "learning_rate": 1.9997542120285442e-05, "loss": 0.0261, "step": 101980 }, { "epoch": 123.02776101388051, "grad_norm": 4.197822570800781, "learning_rate": 1.999754187903065e-05, "loss": 0.0301, "step": 101990 }, { "epoch": 123.03983101991551, "grad_norm": 3.1217339038848877, "learning_rate": 1.9997541637775855e-05, "loss": 0.0301, "step": 102000 }, { "epoch": 
123.03983101991551, "eval_loss": 13.164206504821777, "eval_runtime": 8.1312, "eval_samples_per_second": 85.719, "eval_steps_per_second": 10.822, "step": 102000 }, { "epoch": 123.05190102595051, "grad_norm": 3.6603076457977295, "learning_rate": 1.999754139652106e-05, "loss": 0.0297, "step": 102010 }, { "epoch": 123.06397103198552, "grad_norm": 3.30346417427063, "learning_rate": 1.9997541155266267e-05, "loss": 0.0301, "step": 102020 }, { "epoch": 123.07604103802052, "grad_norm": 3.439556360244751, "learning_rate": 1.9997540914011473e-05, "loss": 0.0324, "step": 102030 }, { "epoch": 123.08811104405552, "grad_norm": 3.5417075157165527, "learning_rate": 1.999754067275668e-05, "loss": 0.0321, "step": 102040 }, { "epoch": 123.10018105009053, "grad_norm": 3.8078901767730713, "learning_rate": 1.9997540431501886e-05, "loss": 0.035, "step": 102050 }, { "epoch": 123.11225105612553, "grad_norm": 3.823148250579834, "learning_rate": 1.9997540190247092e-05, "loss": 0.0347, "step": 102060 }, { "epoch": 123.12432106216053, "grad_norm": 3.5086586475372314, "learning_rate": 1.9997539948992298e-05, "loss": 0.0318, "step": 102070 }, { "epoch": 123.13639106819554, "grad_norm": 3.7224771976470947, "learning_rate": 1.9997539707737505e-05, "loss": 0.0319, "step": 102080 }, { "epoch": 123.14846107423054, "grad_norm": 3.992947816848755, "learning_rate": 1.999753946648271e-05, "loss": 0.0334, "step": 102090 }, { "epoch": 123.16053108026554, "grad_norm": 4.043177127838135, "learning_rate": 1.9997539225227917e-05, "loss": 0.0321, "step": 102100 }, { "epoch": 123.17260108630055, "grad_norm": 3.6772382259368896, "learning_rate": 1.9997538983973123e-05, "loss": 0.0326, "step": 102110 }, { "epoch": 123.18467109233555, "grad_norm": 3.8963708877563477, "learning_rate": 1.9997538742718326e-05, "loss": 0.0327, "step": 102120 }, { "epoch": 123.19674109837055, "grad_norm": 3.665192127227783, "learning_rate": 1.9997538501463532e-05, "loss": 0.0347, "step": 102130 }, { "epoch": 123.20881110440556, 
"grad_norm": 3.5913777351379395, "learning_rate": 1.999753826020874e-05, "loss": 0.0332, "step": 102140 }, { "epoch": 123.22088111044056, "grad_norm": 3.7735283374786377, "learning_rate": 1.9997538018953945e-05, "loss": 0.0349, "step": 102150 }, { "epoch": 123.23295111647556, "grad_norm": 3.994901180267334, "learning_rate": 1.999753777769915e-05, "loss": 0.0361, "step": 102160 }, { "epoch": 123.24502112251056, "grad_norm": 3.7591354846954346, "learning_rate": 1.9997537536444357e-05, "loss": 0.034, "step": 102170 }, { "epoch": 123.25709112854557, "grad_norm": 4.132763385772705, "learning_rate": 1.9997537295189563e-05, "loss": 0.0363, "step": 102180 }, { "epoch": 123.26916113458057, "grad_norm": 4.040220260620117, "learning_rate": 1.999753705393477e-05, "loss": 0.037, "step": 102190 }, { "epoch": 123.28123114061557, "grad_norm": 3.8573896884918213, "learning_rate": 1.9997536812679976e-05, "loss": 0.0364, "step": 102200 }, { "epoch": 123.29330114665058, "grad_norm": 4.305671691894531, "learning_rate": 1.9997536571425182e-05, "loss": 0.0362, "step": 102210 }, { "epoch": 123.30537115268558, "grad_norm": 3.3169994354248047, "learning_rate": 1.9997536330170388e-05, "loss": 0.0362, "step": 102220 }, { "epoch": 123.31744115872058, "grad_norm": 3.9364113807678223, "learning_rate": 1.9997536088915594e-05, "loss": 0.0359, "step": 102230 }, { "epoch": 123.32951116475559, "grad_norm": 3.4720492362976074, "learning_rate": 1.99975358476608e-05, "loss": 0.0356, "step": 102240 }, { "epoch": 123.34158117079059, "grad_norm": 3.875925064086914, "learning_rate": 1.9997535606406007e-05, "loss": 0.0358, "step": 102250 }, { "epoch": 123.3536511768256, "grad_norm": 4.194551944732666, "learning_rate": 1.9997535365151213e-05, "loss": 0.0362, "step": 102260 }, { "epoch": 123.3657211828606, "grad_norm": 3.463158369064331, "learning_rate": 1.999753512389642e-05, "loss": 0.035, "step": 102270 }, { "epoch": 123.3777911888956, "grad_norm": 3.365001678466797, "learning_rate": 1.9997534882641625e-05, 
"loss": 0.035, "step": 102280 }, { "epoch": 123.3898611949306, "grad_norm": 3.6067698001861572, "learning_rate": 1.999753464138683e-05, "loss": 0.0366, "step": 102290 }, { "epoch": 123.4019312009656, "grad_norm": 3.8629279136657715, "learning_rate": 1.9997534400132038e-05, "loss": 0.038, "step": 102300 }, { "epoch": 123.41400120700061, "grad_norm": 4.072794437408447, "learning_rate": 1.9997534158877244e-05, "loss": 0.0359, "step": 102310 }, { "epoch": 123.42607121303561, "grad_norm": 3.93298077583313, "learning_rate": 1.999753391762245e-05, "loss": 0.0365, "step": 102320 }, { "epoch": 123.43814121907062, "grad_norm": 4.134716510772705, "learning_rate": 1.9997533676367656e-05, "loss": 0.0388, "step": 102330 }, { "epoch": 123.45021122510562, "grad_norm": 3.689547061920166, "learning_rate": 1.9997533435112863e-05, "loss": 0.0364, "step": 102340 }, { "epoch": 123.46228123114062, "grad_norm": 4.015503406524658, "learning_rate": 1.999753319385807e-05, "loss": 0.0383, "step": 102350 }, { "epoch": 123.47435123717563, "grad_norm": 4.102064609527588, "learning_rate": 1.9997532952603275e-05, "loss": 0.0374, "step": 102360 }, { "epoch": 123.48642124321063, "grad_norm": 4.314011096954346, "learning_rate": 1.9997532711348478e-05, "loss": 0.0393, "step": 102370 }, { "epoch": 123.49849124924563, "grad_norm": 4.086897373199463, "learning_rate": 1.9997532470093684e-05, "loss": 0.0383, "step": 102380 }, { "epoch": 123.51056125528062, "grad_norm": 3.6827926635742188, "learning_rate": 1.999753222883889e-05, "loss": 0.0386, "step": 102390 }, { "epoch": 123.52263126131562, "grad_norm": 4.25535249710083, "learning_rate": 1.9997531987584097e-05, "loss": 0.0379, "step": 102400 }, { "epoch": 123.53470126735063, "grad_norm": 3.9171767234802246, "learning_rate": 1.9997531746329303e-05, "loss": 0.0365, "step": 102410 }, { "epoch": 123.54677127338563, "grad_norm": 3.697216510772705, "learning_rate": 1.999753150507451e-05, "loss": 0.037, "step": 102420 }, { "epoch": 123.55884127942063, 
"grad_norm": 3.9460537433624268, "learning_rate": 1.9997531263819715e-05, "loss": 0.0387, "step": 102430 }, { "epoch": 123.57091128545564, "grad_norm": 5.017760276794434, "learning_rate": 1.999753102256492e-05, "loss": 0.0382, "step": 102440 }, { "epoch": 123.58298129149064, "grad_norm": 4.030055522918701, "learning_rate": 1.9997530781310128e-05, "loss": 0.039, "step": 102450 }, { "epoch": 123.59505129752564, "grad_norm": 4.237151622772217, "learning_rate": 1.9997530540055334e-05, "loss": 0.0376, "step": 102460 }, { "epoch": 123.60712130356065, "grad_norm": 4.097663402557373, "learning_rate": 1.9997530298800544e-05, "loss": 0.0394, "step": 102470 }, { "epoch": 123.61919130959565, "grad_norm": 4.236487865447998, "learning_rate": 1.999753005754575e-05, "loss": 0.0371, "step": 102480 }, { "epoch": 123.63126131563065, "grad_norm": 4.3772735595703125, "learning_rate": 1.9997529816290956e-05, "loss": 0.0387, "step": 102490 }, { "epoch": 123.64333132166566, "grad_norm": 4.073724746704102, "learning_rate": 1.9997529575036162e-05, "loss": 0.0398, "step": 102500 }, { "epoch": 123.64333132166566, "eval_loss": 13.16679573059082, "eval_runtime": 8.1959, "eval_samples_per_second": 85.043, "eval_steps_per_second": 10.737, "step": 102500 }, { "epoch": 123.65540132770066, "grad_norm": 4.105683326721191, "learning_rate": 1.999752933378137e-05, "loss": 0.0389, "step": 102510 }, { "epoch": 123.66747133373566, "grad_norm": 3.7437989711761475, "learning_rate": 1.9997529092526575e-05, "loss": 0.0394, "step": 102520 }, { "epoch": 123.67954133977067, "grad_norm": 3.9749526977539062, "learning_rate": 1.999752885127178e-05, "loss": 0.0381, "step": 102530 }, { "epoch": 123.69161134580567, "grad_norm": 3.878948450088501, "learning_rate": 1.9997528610016984e-05, "loss": 0.0385, "step": 102540 }, { "epoch": 123.70368135184067, "grad_norm": 3.838670253753662, "learning_rate": 1.999752836876219e-05, "loss": 0.0377, "step": 102550 }, { "epoch": 123.71575135787567, "grad_norm": 4.026249408721924, 
"learning_rate": 1.9997528127507396e-05, "loss": 0.0389, "step": 102560 }, { "epoch": 123.72782136391068, "grad_norm": 4.15952730178833, "learning_rate": 1.9997527886252602e-05, "loss": 0.04, "step": 102570 }, { "epoch": 123.73989136994568, "grad_norm": 3.9499430656433105, "learning_rate": 1.999752764499781e-05, "loss": 0.0389, "step": 102580 }, { "epoch": 123.75196137598068, "grad_norm": 4.704310417175293, "learning_rate": 1.9997527403743015e-05, "loss": 0.0408, "step": 102590 }, { "epoch": 123.76403138201569, "grad_norm": 3.6821653842926025, "learning_rate": 1.999752716248822e-05, "loss": 0.0412, "step": 102600 }, { "epoch": 123.77610138805069, "grad_norm": 4.34420919418335, "learning_rate": 1.9997526921233427e-05, "loss": 0.0386, "step": 102610 }, { "epoch": 123.7881713940857, "grad_norm": 3.9705400466918945, "learning_rate": 1.9997526679978633e-05, "loss": 0.0398, "step": 102620 }, { "epoch": 123.8002414001207, "grad_norm": 4.216616153717041, "learning_rate": 1.999752643872384e-05, "loss": 0.0387, "step": 102630 }, { "epoch": 123.8123114061557, "grad_norm": 4.175386905670166, "learning_rate": 1.9997526197469046e-05, "loss": 0.0392, "step": 102640 }, { "epoch": 123.8243814121907, "grad_norm": 4.389620780944824, "learning_rate": 1.9997525956214252e-05, "loss": 0.041, "step": 102650 }, { "epoch": 123.8364514182257, "grad_norm": 4.030927658081055, "learning_rate": 1.9997525714959458e-05, "loss": 0.0425, "step": 102660 }, { "epoch": 123.84852142426071, "grad_norm": 4.431715965270996, "learning_rate": 1.9997525473704664e-05, "loss": 0.0423, "step": 102670 }, { "epoch": 123.86059143029571, "grad_norm": 3.7407736778259277, "learning_rate": 1.999752523244987e-05, "loss": 0.0401, "step": 102680 }, { "epoch": 123.87266143633072, "grad_norm": 4.05575704574585, "learning_rate": 1.9997524991195077e-05, "loss": 0.0414, "step": 102690 }, { "epoch": 123.88473144236572, "grad_norm": 3.9999494552612305, "learning_rate": 1.9997524749940283e-05, "loss": 0.041, "step": 102700 }, { 
"epoch": 123.89680144840072, "grad_norm": 3.8680131435394287, "learning_rate": 1.999752450868549e-05, "loss": 0.0416, "step": 102710 }, { "epoch": 123.90887145443573, "grad_norm": 4.02586030960083, "learning_rate": 1.9997524267430696e-05, "loss": 0.0412, "step": 102720 }, { "epoch": 123.92094146047073, "grad_norm": 4.022602558135986, "learning_rate": 1.9997524026175902e-05, "loss": 0.041, "step": 102730 }, { "epoch": 123.93301146650573, "grad_norm": 3.510498046875, "learning_rate": 1.9997523784921108e-05, "loss": 0.0389, "step": 102740 }, { "epoch": 123.94508147254074, "grad_norm": 4.005276679992676, "learning_rate": 1.9997523543666314e-05, "loss": 0.0431, "step": 102750 }, { "epoch": 123.95715147857574, "grad_norm": 4.515440940856934, "learning_rate": 1.999752330241152e-05, "loss": 0.0436, "step": 102760 }, { "epoch": 123.96922148461074, "grad_norm": 4.073414325714111, "learning_rate": 1.9997523061156727e-05, "loss": 0.0408, "step": 102770 }, { "epoch": 123.98129149064575, "grad_norm": 4.155215263366699, "learning_rate": 1.9997522819901933e-05, "loss": 0.0395, "step": 102780 }, { "epoch": 123.99336149668075, "grad_norm": 4.519393444061279, "learning_rate": 1.9997522578647136e-05, "loss": 0.0425, "step": 102790 }, { "epoch": 124.004828002414, "grad_norm": 3.5928051471710205, "learning_rate": 1.9997522337392342e-05, "loss": 0.0369, "step": 102800 }, { "epoch": 124.016898008449, "grad_norm": 3.0540478229522705, "learning_rate": 1.9997522096137548e-05, "loss": 0.0281, "step": 102810 }, { "epoch": 124.02896801448401, "grad_norm": 3.477036237716675, "learning_rate": 1.9997521854882754e-05, "loss": 0.0291, "step": 102820 }, { "epoch": 124.04103802051901, "grad_norm": 3.4144608974456787, "learning_rate": 1.999752161362796e-05, "loss": 0.0305, "step": 102830 }, { "epoch": 124.05310802655401, "grad_norm": 4.085505962371826, "learning_rate": 1.9997521372373167e-05, "loss": 0.0302, "step": 102840 }, { "epoch": 124.06517803258902, "grad_norm": 3.642261505126953, 
"learning_rate": 1.9997521131118373e-05, "loss": 0.0308, "step": 102850 }, { "epoch": 124.07724803862402, "grad_norm": 3.6408352851867676, "learning_rate": 1.999752088986358e-05, "loss": 0.0322, "step": 102860 }, { "epoch": 124.08931804465902, "grad_norm": 3.421506643295288, "learning_rate": 1.9997520648608785e-05, "loss": 0.0303, "step": 102870 }, { "epoch": 124.10138805069403, "grad_norm": 3.7480757236480713, "learning_rate": 1.999752040735399e-05, "loss": 0.0305, "step": 102880 }, { "epoch": 124.11345805672903, "grad_norm": 3.722200632095337, "learning_rate": 1.9997520166099198e-05, "loss": 0.0336, "step": 102890 }, { "epoch": 124.12552806276403, "grad_norm": 3.365433931350708, "learning_rate": 1.9997519924844404e-05, "loss": 0.0325, "step": 102900 }, { "epoch": 124.13759806879904, "grad_norm": 3.4822170734405518, "learning_rate": 1.999751968358961e-05, "loss": 0.0334, "step": 102910 }, { "epoch": 124.14966807483404, "grad_norm": 3.8089261054992676, "learning_rate": 1.9997519442334816e-05, "loss": 0.0336, "step": 102920 }, { "epoch": 124.16173808086904, "grad_norm": 3.7902934551239014, "learning_rate": 1.9997519201080023e-05, "loss": 0.0341, "step": 102930 }, { "epoch": 124.17380808690405, "grad_norm": 3.49137282371521, "learning_rate": 1.999751895982523e-05, "loss": 0.0325, "step": 102940 }, { "epoch": 124.18587809293905, "grad_norm": 4.057884216308594, "learning_rate": 1.9997518718570435e-05, "loss": 0.0343, "step": 102950 }, { "epoch": 124.19794809897405, "grad_norm": 3.385340452194214, "learning_rate": 1.999751847731564e-05, "loss": 0.0357, "step": 102960 }, { "epoch": 124.21001810500906, "grad_norm": 3.9398372173309326, "learning_rate": 1.9997518236060848e-05, "loss": 0.0351, "step": 102970 }, { "epoch": 124.22208811104406, "grad_norm": 3.440150499343872, "learning_rate": 1.9997517994806054e-05, "loss": 0.0341, "step": 102980 }, { "epoch": 124.23415811707906, "grad_norm": 3.7039713859558105, "learning_rate": 1.999751775355126e-05, "loss": 0.0344, "step": 
102990 }, { "epoch": 124.24622812311407, "grad_norm": 4.004123210906982, "learning_rate": 1.9997517512296466e-05, "loss": 0.0347, "step": 103000 }, { "epoch": 124.24622812311407, "eval_loss": 13.1647310256958, "eval_runtime": 8.1828, "eval_samples_per_second": 85.179, "eval_steps_per_second": 10.754, "step": 103000 }, { "epoch": 124.25829812914907, "grad_norm": 3.558043956756592, "learning_rate": 1.9997517271041672e-05, "loss": 0.0345, "step": 103010 }, { "epoch": 124.27036813518407, "grad_norm": 3.579047441482544, "learning_rate": 1.999751702978688e-05, "loss": 0.0336, "step": 103020 }, { "epoch": 124.28243814121907, "grad_norm": 3.835944890975952, "learning_rate": 1.9997516788532085e-05, "loss": 0.0356, "step": 103030 }, { "epoch": 124.29450814725408, "grad_norm": 4.093378067016602, "learning_rate": 1.999751654727729e-05, "loss": 0.035, "step": 103040 }, { "epoch": 124.30657815328908, "grad_norm": 3.7436766624450684, "learning_rate": 1.9997516306022497e-05, "loss": 0.0359, "step": 103050 }, { "epoch": 124.31864815932408, "grad_norm": 3.76882004737854, "learning_rate": 1.9997516064767703e-05, "loss": 0.0344, "step": 103060 }, { "epoch": 124.33071816535909, "grad_norm": 4.285013675689697, "learning_rate": 1.999751582351291e-05, "loss": 0.0378, "step": 103070 }, { "epoch": 124.34278817139409, "grad_norm": 3.551450490951538, "learning_rate": 1.9997515582258116e-05, "loss": 0.0347, "step": 103080 }, { "epoch": 124.3548581774291, "grad_norm": 4.503753185272217, "learning_rate": 1.9997515341003322e-05, "loss": 0.0372, "step": 103090 }, { "epoch": 124.3669281834641, "grad_norm": 4.043615818023682, "learning_rate": 1.999751509974853e-05, "loss": 0.0343, "step": 103100 }, { "epoch": 124.3789981894991, "grad_norm": 4.462504863739014, "learning_rate": 1.9997514858493735e-05, "loss": 0.0377, "step": 103110 }, { "epoch": 124.3910681955341, "grad_norm": 3.9452004432678223, "learning_rate": 1.999751461723894e-05, "loss": 0.0359, "step": 103120 }, { "epoch": 124.4031382015691, 
"grad_norm": 4.01383113861084, "learning_rate": 1.9997514375984147e-05, "loss": 0.0368, "step": 103130 }, { "epoch": 124.41520820760411, "grad_norm": 4.048867225646973, "learning_rate": 1.9997514134729353e-05, "loss": 0.0391, "step": 103140 }, { "epoch": 124.42727821363911, "grad_norm": 4.181109428405762, "learning_rate": 1.999751389347456e-05, "loss": 0.0374, "step": 103150 }, { "epoch": 124.43934821967412, "grad_norm": 3.8125014305114746, "learning_rate": 1.9997513652219766e-05, "loss": 0.0371, "step": 103160 }, { "epoch": 124.45141822570912, "grad_norm": 4.081457614898682, "learning_rate": 1.9997513410964972e-05, "loss": 0.0379, "step": 103170 }, { "epoch": 124.46348823174412, "grad_norm": 3.7811481952667236, "learning_rate": 1.9997513169710178e-05, "loss": 0.0385, "step": 103180 }, { "epoch": 124.47555823777913, "grad_norm": 4.586080551147461, "learning_rate": 1.9997512928455384e-05, "loss": 0.0408, "step": 103190 }, { "epoch": 124.48762824381413, "grad_norm": 3.996260404586792, "learning_rate": 1.9997512687200587e-05, "loss": 0.0376, "step": 103200 }, { "epoch": 124.49969824984913, "grad_norm": 3.843972682952881, "learning_rate": 1.9997512445945793e-05, "loss": 0.0384, "step": 103210 }, { "epoch": 124.51176825588412, "grad_norm": 4.0296454429626465, "learning_rate": 1.9997512204691e-05, "loss": 0.0372, "step": 103220 }, { "epoch": 124.52383826191912, "grad_norm": 3.4682552814483643, "learning_rate": 1.9997511963436206e-05, "loss": 0.0386, "step": 103230 }, { "epoch": 124.53590826795413, "grad_norm": 4.041471481323242, "learning_rate": 1.9997511722181412e-05, "loss": 0.038, "step": 103240 }, { "epoch": 124.54797827398913, "grad_norm": 3.645902633666992, "learning_rate": 1.9997511480926618e-05, "loss": 0.0373, "step": 103250 }, { "epoch": 124.56004828002413, "grad_norm": 4.275269031524658, "learning_rate": 1.9997511239671824e-05, "loss": 0.0385, "step": 103260 }, { "epoch": 124.57211828605914, "grad_norm": 4.164779186248779, "learning_rate": 
1.999751099841703e-05, "loss": 0.0369, "step": 103270 }, { "epoch": 124.58418829209414, "grad_norm": 3.7674455642700195, "learning_rate": 1.9997510757162237e-05, "loss": 0.0393, "step": 103280 }, { "epoch": 124.59625829812914, "grad_norm": 4.242009162902832, "learning_rate": 1.9997510515907443e-05, "loss": 0.0402, "step": 103290 }, { "epoch": 124.60832830416415, "grad_norm": 3.545309066772461, "learning_rate": 1.999751027465265e-05, "loss": 0.0375, "step": 103300 }, { "epoch": 124.62039831019915, "grad_norm": 3.48175311088562, "learning_rate": 1.9997510033397855e-05, "loss": 0.0381, "step": 103310 }, { "epoch": 124.63246831623415, "grad_norm": 3.742319345474243, "learning_rate": 1.999750979214306e-05, "loss": 0.0369, "step": 103320 }, { "epoch": 124.64453832226916, "grad_norm": 4.081806182861328, "learning_rate": 1.9997509550888268e-05, "loss": 0.0393, "step": 103330 }, { "epoch": 124.65660832830416, "grad_norm": 4.013361930847168, "learning_rate": 1.9997509309633474e-05, "loss": 0.0381, "step": 103340 }, { "epoch": 124.66867833433916, "grad_norm": 3.922020435333252, "learning_rate": 1.999750906837868e-05, "loss": 0.0386, "step": 103350 }, { "epoch": 124.68074834037417, "grad_norm": 3.7669358253479004, "learning_rate": 1.9997508827123887e-05, "loss": 0.0381, "step": 103360 }, { "epoch": 124.69281834640917, "grad_norm": 4.01035737991333, "learning_rate": 1.9997508585869093e-05, "loss": 0.0395, "step": 103370 }, { "epoch": 124.70488835244417, "grad_norm": 4.278314113616943, "learning_rate": 1.99975083446143e-05, "loss": 0.0397, "step": 103380 }, { "epoch": 124.71695835847918, "grad_norm": 3.818532943725586, "learning_rate": 1.9997508103359505e-05, "loss": 0.0395, "step": 103390 }, { "epoch": 124.72902836451418, "grad_norm": 4.324321269989014, "learning_rate": 1.999750786210471e-05, "loss": 0.0396, "step": 103400 }, { "epoch": 124.74109837054918, "grad_norm": 4.003323554992676, "learning_rate": 1.9997507620849918e-05, "loss": 0.0392, "step": 103410 }, { "epoch": 
124.75316837658418, "grad_norm": 4.025503158569336, "learning_rate": 1.9997507379595124e-05, "loss": 0.0393, "step": 103420 }, { "epoch": 124.76523838261919, "grad_norm": 3.754833221435547, "learning_rate": 1.999750713834033e-05, "loss": 0.0407, "step": 103430 }, { "epoch": 124.77730838865419, "grad_norm": 4.369052886962891, "learning_rate": 1.9997506897085536e-05, "loss": 0.0383, "step": 103440 }, { "epoch": 124.7893783946892, "grad_norm": 3.8547112941741943, "learning_rate": 1.999750665583074e-05, "loss": 0.0401, "step": 103450 }, { "epoch": 124.8014484007242, "grad_norm": 4.0167555809021, "learning_rate": 1.9997506414575945e-05, "loss": 0.0404, "step": 103460 }, { "epoch": 124.8135184067592, "grad_norm": 3.9183051586151123, "learning_rate": 1.999750617332115e-05, "loss": 0.0395, "step": 103470 }, { "epoch": 124.8255884127942, "grad_norm": 4.220966339111328, "learning_rate": 1.9997505932066358e-05, "loss": 0.0394, "step": 103480 }, { "epoch": 124.83765841882921, "grad_norm": 3.8615245819091797, "learning_rate": 1.9997505690811564e-05, "loss": 0.0386, "step": 103490 }, { "epoch": 124.84972842486421, "grad_norm": 3.97182035446167, "learning_rate": 1.999750544955677e-05, "loss": 0.039, "step": 103500 }, { "epoch": 124.84972842486421, "eval_loss": 13.185165405273438, "eval_runtime": 8.1601, "eval_samples_per_second": 85.416, "eval_steps_per_second": 10.784, "step": 103500 }, { "epoch": 124.86179843089921, "grad_norm": 4.455431938171387, "learning_rate": 1.9997505208301976e-05, "loss": 0.0407, "step": 103510 }, { "epoch": 124.87386843693422, "grad_norm": 4.2678985595703125, "learning_rate": 1.9997504967047183e-05, "loss": 0.0408, "step": 103520 }, { "epoch": 124.88593844296922, "grad_norm": 3.9323272705078125, "learning_rate": 1.999750472579239e-05, "loss": 0.0408, "step": 103530 }, { "epoch": 124.89800844900422, "grad_norm": 3.9307010173797607, "learning_rate": 1.9997504484537595e-05, "loss": 0.0401, "step": 103540 }, { "epoch": 124.91007845503923, "grad_norm": 
4.056146144866943, "learning_rate": 1.9997504243282805e-05, "loss": 0.0417, "step": 103550 }, { "epoch": 124.92214846107423, "grad_norm": 4.1162190437316895, "learning_rate": 1.999750400202801e-05, "loss": 0.0419, "step": 103560 }, { "epoch": 124.93421846710923, "grad_norm": 4.389387607574463, "learning_rate": 1.9997503760773217e-05, "loss": 0.0409, "step": 103570 }, { "epoch": 124.94628847314424, "grad_norm": 4.124205589294434, "learning_rate": 1.9997503519518423e-05, "loss": 0.0405, "step": 103580 }, { "epoch": 124.95835847917924, "grad_norm": 4.611246109008789, "learning_rate": 1.999750327826363e-05, "loss": 0.0386, "step": 103590 }, { "epoch": 124.97042848521424, "grad_norm": 4.043389320373535, "learning_rate": 1.9997503037008836e-05, "loss": 0.0401, "step": 103600 }, { "epoch": 124.98249849124925, "grad_norm": 4.58873176574707, "learning_rate": 1.999750279575404e-05, "loss": 0.0416, "step": 103610 }, { "epoch": 124.99456849728425, "grad_norm": 3.9591500759124756, "learning_rate": 1.9997502554499245e-05, "loss": 0.0426, "step": 103620 }, { "epoch": 125.0060350030175, "grad_norm": 3.075655698776245, "learning_rate": 1.999750231324445e-05, "loss": 0.0343, "step": 103630 }, { "epoch": 125.0181050090525, "grad_norm": 3.500788927078247, "learning_rate": 1.9997502071989657e-05, "loss": 0.0264, "step": 103640 }, { "epoch": 125.03017501508751, "grad_norm": 3.466682195663452, "learning_rate": 1.9997501830734863e-05, "loss": 0.0277, "step": 103650 }, { "epoch": 125.04224502112251, "grad_norm": 3.8258554935455322, "learning_rate": 1.999750158948007e-05, "loss": 0.0282, "step": 103660 }, { "epoch": 125.05431502715751, "grad_norm": 3.321725845336914, "learning_rate": 1.9997501348225276e-05, "loss": 0.0306, "step": 103670 }, { "epoch": 125.06638503319252, "grad_norm": 3.556785821914673, "learning_rate": 1.9997501106970482e-05, "loss": 0.0304, "step": 103680 }, { "epoch": 125.07845503922752, "grad_norm": 3.6526293754577637, "learning_rate": 1.9997500865715688e-05, "loss": 
0.0305, "step": 103690 }, { "epoch": 125.09052504526252, "grad_norm": 3.2142586708068848, "learning_rate": 1.9997500624460894e-05, "loss": 0.03, "step": 103700 }, { "epoch": 125.10259505129753, "grad_norm": 3.330766439437866, "learning_rate": 1.99975003832061e-05, "loss": 0.0314, "step": 103710 }, { "epoch": 125.11466505733253, "grad_norm": 3.951737403869629, "learning_rate": 1.9997500141951307e-05, "loss": 0.0323, "step": 103720 }, { "epoch": 125.12673506336753, "grad_norm": 3.8092775344848633, "learning_rate": 1.9997499900696513e-05, "loss": 0.0316, "step": 103730 }, { "epoch": 125.13880506940254, "grad_norm": 3.494060754776001, "learning_rate": 1.999749965944172e-05, "loss": 0.0316, "step": 103740 }, { "epoch": 125.15087507543754, "grad_norm": 3.218435764312744, "learning_rate": 1.9997499418186926e-05, "loss": 0.0306, "step": 103750 }, { "epoch": 125.16294508147254, "grad_norm": 3.675804615020752, "learning_rate": 1.9997499176932132e-05, "loss": 0.0331, "step": 103760 }, { "epoch": 125.17501508750755, "grad_norm": 3.6283106803894043, "learning_rate": 1.9997498935677338e-05, "loss": 0.0328, "step": 103770 }, { "epoch": 125.18708509354255, "grad_norm": 3.4406158924102783, "learning_rate": 1.9997498694422544e-05, "loss": 0.0334, "step": 103780 }, { "epoch": 125.19915509957755, "grad_norm": 3.6505942344665527, "learning_rate": 1.999749845316775e-05, "loss": 0.0346, "step": 103790 }, { "epoch": 125.21122510561256, "grad_norm": 3.5701987743377686, "learning_rate": 1.9997498211912957e-05, "loss": 0.0334, "step": 103800 }, { "epoch": 125.22329511164756, "grad_norm": 4.145625114440918, "learning_rate": 1.9997497970658163e-05, "loss": 0.0343, "step": 103810 }, { "epoch": 125.23536511768256, "grad_norm": 3.9378061294555664, "learning_rate": 1.999749772940337e-05, "loss": 0.0341, "step": 103820 }, { "epoch": 125.24743512371757, "grad_norm": 3.891249179840088, "learning_rate": 1.9997497488148575e-05, "loss": 0.0354, "step": 103830 }, { "epoch": 125.25950512975257, 
"grad_norm": 3.6267378330230713, "learning_rate": 1.999749724689378e-05, "loss": 0.0327, "step": 103840 }, { "epoch": 125.27157513578757, "grad_norm": 3.816889524459839, "learning_rate": 1.9997497005638988e-05, "loss": 0.0354, "step": 103850 }, { "epoch": 125.28364514182257, "grad_norm": 3.796379804611206, "learning_rate": 1.999749676438419e-05, "loss": 0.0358, "step": 103860 }, { "epoch": 125.29571514785758, "grad_norm": 3.2019879817962646, "learning_rate": 1.9997496523129397e-05, "loss": 0.0353, "step": 103870 }, { "epoch": 125.30778515389258, "grad_norm": 3.634916305541992, "learning_rate": 1.9997496281874603e-05, "loss": 0.0361, "step": 103880 }, { "epoch": 125.31985515992758, "grad_norm": 4.128862380981445, "learning_rate": 1.999749604061981e-05, "loss": 0.0372, "step": 103890 }, { "epoch": 125.33192516596259, "grad_norm": 3.8202691078186035, "learning_rate": 1.9997495799365015e-05, "loss": 0.0368, "step": 103900 }, { "epoch": 125.34399517199759, "grad_norm": 3.668783187866211, "learning_rate": 1.999749555811022e-05, "loss": 0.0374, "step": 103910 }, { "epoch": 125.3560651780326, "grad_norm": 3.9208972454071045, "learning_rate": 1.9997495316855428e-05, "loss": 0.0372, "step": 103920 }, { "epoch": 125.3681351840676, "grad_norm": 3.6763997077941895, "learning_rate": 1.9997495075600634e-05, "loss": 0.0363, "step": 103930 }, { "epoch": 125.3802051901026, "grad_norm": 3.764601707458496, "learning_rate": 1.999749483434584e-05, "loss": 0.0359, "step": 103940 }, { "epoch": 125.3922751961376, "grad_norm": 3.7482054233551025, "learning_rate": 1.9997494593091046e-05, "loss": 0.0364, "step": 103950 }, { "epoch": 125.4043452021726, "grad_norm": 4.102211952209473, "learning_rate": 1.9997494351836253e-05, "loss": 0.0354, "step": 103960 }, { "epoch": 125.41641520820761, "grad_norm": 3.9627153873443604, "learning_rate": 1.999749411058146e-05, "loss": 0.0367, "step": 103970 }, { "epoch": 125.42848521424261, "grad_norm": 3.676480770111084, "learning_rate": 
1.9997493869326665e-05, "loss": 0.0361, "step": 103980 }, { "epoch": 125.44055522027762, "grad_norm": 3.771580934524536, "learning_rate": 1.999749362807187e-05, "loss": 0.036, "step": 103990 }, { "epoch": 125.45262522631262, "grad_norm": 3.6746535301208496, "learning_rate": 1.9997493386817078e-05, "loss": 0.0345, "step": 104000 }, { "epoch": 125.45262522631262, "eval_loss": 13.189966201782227, "eval_runtime": 8.1265, "eval_samples_per_second": 85.769, "eval_steps_per_second": 10.829, "step": 104000 }, { "epoch": 125.46469523234762, "grad_norm": 3.944791316986084, "learning_rate": 1.9997493145562284e-05, "loss": 0.0381, "step": 104010 }, { "epoch": 125.47676523838263, "grad_norm": 3.971637487411499, "learning_rate": 1.999749290430749e-05, "loss": 0.0371, "step": 104020 }, { "epoch": 125.48883524441763, "grad_norm": 4.465236663818359, "learning_rate": 1.9997492663052696e-05, "loss": 0.0376, "step": 104030 }, { "epoch": 125.50090525045263, "grad_norm": 3.8445892333984375, "learning_rate": 1.9997492421797902e-05, "loss": 0.0346, "step": 104040 }, { "epoch": 125.51297525648762, "grad_norm": 3.699817657470703, "learning_rate": 1.999749218054311e-05, "loss": 0.0365, "step": 104050 }, { "epoch": 125.52504526252262, "grad_norm": 4.141017436981201, "learning_rate": 1.9997491939288315e-05, "loss": 0.0361, "step": 104060 }, { "epoch": 125.53711526855763, "grad_norm": 4.111244201660156, "learning_rate": 1.999749169803352e-05, "loss": 0.039, "step": 104070 }, { "epoch": 125.54918527459263, "grad_norm": 3.809488296508789, "learning_rate": 1.9997491456778727e-05, "loss": 0.0365, "step": 104080 }, { "epoch": 125.56125528062763, "grad_norm": 3.817265510559082, "learning_rate": 1.9997491215523933e-05, "loss": 0.0374, "step": 104090 }, { "epoch": 125.57332528666264, "grad_norm": 3.901932954788208, "learning_rate": 1.999749097426914e-05, "loss": 0.0357, "step": 104100 }, { "epoch": 125.58539529269764, "grad_norm": 3.7357370853424072, "learning_rate": 1.9997490733014346e-05, "loss": 
0.0362, "step": 104110 }, { "epoch": 125.59746529873264, "grad_norm": 3.879915475845337, "learning_rate": 1.9997490491759552e-05, "loss": 0.0403, "step": 104120 }, { "epoch": 125.60953530476765, "grad_norm": 3.805753469467163, "learning_rate": 1.999749025050476e-05, "loss": 0.0371, "step": 104130 }, { "epoch": 125.62160531080265, "grad_norm": 4.06612491607666, "learning_rate": 1.9997490009249965e-05, "loss": 0.038, "step": 104140 }, { "epoch": 125.63367531683765, "grad_norm": 4.338367938995361, "learning_rate": 1.999748976799517e-05, "loss": 0.0386, "step": 104150 }, { "epoch": 125.64574532287266, "grad_norm": 3.8300282955169678, "learning_rate": 1.9997489526740377e-05, "loss": 0.0377, "step": 104160 }, { "epoch": 125.65781532890766, "grad_norm": 4.268022537231445, "learning_rate": 1.9997489285485583e-05, "loss": 0.0395, "step": 104170 }, { "epoch": 125.66988533494266, "grad_norm": 3.7832179069519043, "learning_rate": 1.999748904423079e-05, "loss": 0.0397, "step": 104180 }, { "epoch": 125.68195534097767, "grad_norm": 3.947859287261963, "learning_rate": 1.9997488802975996e-05, "loss": 0.0387, "step": 104190 }, { "epoch": 125.69402534701267, "grad_norm": 3.7658870220184326, "learning_rate": 1.9997488561721202e-05, "loss": 0.0372, "step": 104200 }, { "epoch": 125.70609535304767, "grad_norm": 3.8779358863830566, "learning_rate": 1.9997488320466408e-05, "loss": 0.0397, "step": 104210 }, { "epoch": 125.71816535908268, "grad_norm": 4.121956825256348, "learning_rate": 1.9997488079211614e-05, "loss": 0.0367, "step": 104220 }, { "epoch": 125.73023536511768, "grad_norm": 3.824507713317871, "learning_rate": 1.999748783795682e-05, "loss": 0.0391, "step": 104230 }, { "epoch": 125.74230537115268, "grad_norm": 3.7720906734466553, "learning_rate": 1.9997487596702027e-05, "loss": 0.037, "step": 104240 }, { "epoch": 125.75437537718769, "grad_norm": 4.197772026062012, "learning_rate": 1.9997487355447233e-05, "loss": 0.0412, "step": 104250 }, { "epoch": 125.76644538322269, "grad_norm": 
4.30076789855957, "learning_rate": 1.999748711419244e-05, "loss": 0.0409, "step": 104260 }, { "epoch": 125.77851538925769, "grad_norm": 5.023313999176025, "learning_rate": 1.9997486872937645e-05, "loss": 0.0405, "step": 104270 }, { "epoch": 125.7905853952927, "grad_norm": 4.090775966644287, "learning_rate": 1.9997486631682848e-05, "loss": 0.0414, "step": 104280 }, { "epoch": 125.8026554013277, "grad_norm": 3.8327083587646484, "learning_rate": 1.9997486390428054e-05, "loss": 0.0385, "step": 104290 }, { "epoch": 125.8147254073627, "grad_norm": 4.344112873077393, "learning_rate": 1.999748614917326e-05, "loss": 0.0378, "step": 104300 }, { "epoch": 125.8267954133977, "grad_norm": 3.8940696716308594, "learning_rate": 1.9997485907918467e-05, "loss": 0.0384, "step": 104310 }, { "epoch": 125.83886541943271, "grad_norm": 3.5944204330444336, "learning_rate": 1.9997485666663673e-05, "loss": 0.0394, "step": 104320 }, { "epoch": 125.85093542546771, "grad_norm": 4.191755294799805, "learning_rate": 1.999748542540888e-05, "loss": 0.0387, "step": 104330 }, { "epoch": 125.86300543150271, "grad_norm": 5.048124313354492, "learning_rate": 1.9997485184154085e-05, "loss": 0.041, "step": 104340 }, { "epoch": 125.87507543753772, "grad_norm": 4.013645648956299, "learning_rate": 1.999748494289929e-05, "loss": 0.0397, "step": 104350 }, { "epoch": 125.88714544357272, "grad_norm": 4.20139741897583, "learning_rate": 1.9997484701644498e-05, "loss": 0.0414, "step": 104360 }, { "epoch": 125.89921544960772, "grad_norm": 4.062135696411133, "learning_rate": 1.9997484460389704e-05, "loss": 0.0402, "step": 104370 }, { "epoch": 125.91128545564273, "grad_norm": 3.67556095123291, "learning_rate": 1.999748421913491e-05, "loss": 0.041, "step": 104380 }, { "epoch": 125.92335546167773, "grad_norm": 4.473588943481445, "learning_rate": 1.9997483977880117e-05, "loss": 0.0419, "step": 104390 }, { "epoch": 125.93542546771273, "grad_norm": 4.172473907470703, "learning_rate": 1.9997483736625323e-05, "loss": 0.0417, 
"step": 104400 }, { "epoch": 125.94749547374774, "grad_norm": 4.326968193054199, "learning_rate": 1.999748349537053e-05, "loss": 0.0397, "step": 104410 }, { "epoch": 125.95956547978274, "grad_norm": 4.261809825897217, "learning_rate": 1.9997483254115735e-05, "loss": 0.0423, "step": 104420 }, { "epoch": 125.97163548581774, "grad_norm": 4.20654296875, "learning_rate": 1.999748301286094e-05, "loss": 0.0429, "step": 104430 }, { "epoch": 125.98370549185275, "grad_norm": 3.9357926845550537, "learning_rate": 1.9997482771606148e-05, "loss": 0.0414, "step": 104440 }, { "epoch": 125.99577549788775, "grad_norm": 4.596401214599609, "learning_rate": 1.9997482530351354e-05, "loss": 0.0399, "step": 104450 }, { "epoch": 126.007242003621, "grad_norm": 3.353881597518921, "learning_rate": 1.999748228909656e-05, "loss": 0.0327, "step": 104460 }, { "epoch": 126.019312009656, "grad_norm": 3.0558712482452393, "learning_rate": 1.9997482047841766e-05, "loss": 0.0267, "step": 104470 }, { "epoch": 126.03138201569101, "grad_norm": 3.4446475505828857, "learning_rate": 1.9997481806586972e-05, "loss": 0.0289, "step": 104480 }, { "epoch": 126.04345202172601, "grad_norm": 3.923973798751831, "learning_rate": 1.999748156533218e-05, "loss": 0.0288, "step": 104490 }, { "epoch": 126.05552202776101, "grad_norm": 3.651850938796997, "learning_rate": 1.9997481324077385e-05, "loss": 0.0287, "step": 104500 }, { "epoch": 126.05552202776101, "eval_loss": 13.182999610900879, "eval_runtime": 8.1289, "eval_samples_per_second": 85.743, "eval_steps_per_second": 10.826, "step": 104500 }, { "epoch": 126.06759203379602, "grad_norm": 3.3089828491210938, "learning_rate": 1.999748108282259e-05, "loss": 0.0299, "step": 104510 }, { "epoch": 126.07966203983102, "grad_norm": 3.5375494956970215, "learning_rate": 1.9997480841567797e-05, "loss": 0.0303, "step": 104520 }, { "epoch": 126.09173204586602, "grad_norm": 3.4817450046539307, "learning_rate": 1.9997480600313e-05, "loss": 0.031, "step": 104530 }, { "epoch": 
126.10380205190103, "grad_norm": 3.353086233139038, "learning_rate": 1.9997480359058206e-05, "loss": 0.0307, "step": 104540 }, { "epoch": 126.11587205793603, "grad_norm": 3.7334554195404053, "learning_rate": 1.9997480117803413e-05, "loss": 0.0327, "step": 104550 }, { "epoch": 126.12794206397103, "grad_norm": 3.6732373237609863, "learning_rate": 1.999747987654862e-05, "loss": 0.0302, "step": 104560 }, { "epoch": 126.14001207000604, "grad_norm": 3.6277310848236084, "learning_rate": 1.9997479635293825e-05, "loss": 0.0337, "step": 104570 }, { "epoch": 126.15208207604104, "grad_norm": 3.622469663619995, "learning_rate": 1.999747939403903e-05, "loss": 0.0337, "step": 104580 }, { "epoch": 126.16415208207604, "grad_norm": 3.3168420791625977, "learning_rate": 1.9997479152784237e-05, "loss": 0.0323, "step": 104590 }, { "epoch": 126.17622208811105, "grad_norm": 3.646611452102661, "learning_rate": 1.9997478911529444e-05, "loss": 0.0319, "step": 104600 }, { "epoch": 126.18829209414605, "grad_norm": 3.0494391918182373, "learning_rate": 1.999747867027465e-05, "loss": 0.0352, "step": 104610 }, { "epoch": 126.20036210018105, "grad_norm": 4.066513538360596, "learning_rate": 1.9997478429019856e-05, "loss": 0.0335, "step": 104620 }, { "epoch": 126.21243210621606, "grad_norm": 3.639935255050659, "learning_rate": 1.9997478187765066e-05, "loss": 0.0329, "step": 104630 }, { "epoch": 126.22450211225106, "grad_norm": 3.9471070766448975, "learning_rate": 1.9997477946510272e-05, "loss": 0.0329, "step": 104640 }, { "epoch": 126.23657211828606, "grad_norm": 4.060919284820557, "learning_rate": 1.9997477705255478e-05, "loss": 0.0338, "step": 104650 }, { "epoch": 126.24864212432107, "grad_norm": 3.744008779525757, "learning_rate": 1.9997477464000684e-05, "loss": 0.0351, "step": 104660 }, { "epoch": 126.26071213035607, "grad_norm": 3.4979753494262695, "learning_rate": 1.999747722274589e-05, "loss": 0.033, "step": 104670 }, { "epoch": 126.27278213639107, "grad_norm": 4.703860282897949, 
"learning_rate": 1.9997476981491097e-05, "loss": 0.0358, "step": 104680 }, { "epoch": 126.28485214242608, "grad_norm": 3.967235565185547, "learning_rate": 1.99974767402363e-05, "loss": 0.0355, "step": 104690 }, { "epoch": 126.29692214846108, "grad_norm": 3.574798583984375, "learning_rate": 1.9997476498981506e-05, "loss": 0.0342, "step": 104700 }, { "epoch": 126.30899215449608, "grad_norm": 3.4133968353271484, "learning_rate": 1.9997476257726712e-05, "loss": 0.0331, "step": 104710 }, { "epoch": 126.32106216053108, "grad_norm": 3.4074573516845703, "learning_rate": 1.9997476016471918e-05, "loss": 0.0324, "step": 104720 }, { "epoch": 126.33313216656609, "grad_norm": 3.708089828491211, "learning_rate": 1.9997475775217124e-05, "loss": 0.0358, "step": 104730 }, { "epoch": 126.34520217260109, "grad_norm": 3.764387607574463, "learning_rate": 1.999747553396233e-05, "loss": 0.0344, "step": 104740 }, { "epoch": 126.3572721786361, "grad_norm": 3.8714656829833984, "learning_rate": 1.9997475292707537e-05, "loss": 0.0368, "step": 104750 }, { "epoch": 126.3693421846711, "grad_norm": 3.451746702194214, "learning_rate": 1.9997475051452743e-05, "loss": 0.0345, "step": 104760 }, { "epoch": 126.3814121907061, "grad_norm": 4.281095027923584, "learning_rate": 1.999747481019795e-05, "loss": 0.0368, "step": 104770 }, { "epoch": 126.3934821967411, "grad_norm": 3.852691650390625, "learning_rate": 1.9997474568943156e-05, "loss": 0.0351, "step": 104780 }, { "epoch": 126.40555220277611, "grad_norm": 3.9938442707061768, "learning_rate": 1.9997474327688362e-05, "loss": 0.0356, "step": 104790 }, { "epoch": 126.41762220881111, "grad_norm": 3.8555550575256348, "learning_rate": 1.9997474086433568e-05, "loss": 0.0372, "step": 104800 }, { "epoch": 126.42969221484611, "grad_norm": 3.5392584800720215, "learning_rate": 1.9997473845178774e-05, "loss": 0.0358, "step": 104810 }, { "epoch": 126.44176222088112, "grad_norm": 3.7072653770446777, "learning_rate": 1.999747360392398e-05, "loss": 0.0379, "step": 
104820 }, { "epoch": 126.45383222691612, "grad_norm": 3.5372352600097656, "learning_rate": 1.9997473362669187e-05, "loss": 0.036, "step": 104830 }, { "epoch": 126.46590223295112, "grad_norm": 3.7765326499938965, "learning_rate": 1.9997473121414393e-05, "loss": 0.0362, "step": 104840 }, { "epoch": 126.47797223898613, "grad_norm": 4.009402751922607, "learning_rate": 1.99974728801596e-05, "loss": 0.035, "step": 104850 }, { "epoch": 126.49004224502113, "grad_norm": 4.1846923828125, "learning_rate": 1.9997472638904805e-05, "loss": 0.0397, "step": 104860 }, { "epoch": 126.50211225105613, "grad_norm": 4.068490982055664, "learning_rate": 1.999747239765001e-05, "loss": 0.0363, "step": 104870 }, { "epoch": 126.51418225709112, "grad_norm": 4.076079368591309, "learning_rate": 1.9997472156395218e-05, "loss": 0.0361, "step": 104880 }, { "epoch": 126.52625226312612, "grad_norm": 3.958242177963257, "learning_rate": 1.9997471915140424e-05, "loss": 0.0379, "step": 104890 }, { "epoch": 126.53832226916113, "grad_norm": 3.8066561222076416, "learning_rate": 1.999747167388563e-05, "loss": 0.0347, "step": 104900 }, { "epoch": 126.55039227519613, "grad_norm": 3.6274867057800293, "learning_rate": 1.9997471432630836e-05, "loss": 0.0374, "step": 104910 }, { "epoch": 126.56246228123113, "grad_norm": 3.8393394947052, "learning_rate": 1.9997471191376043e-05, "loss": 0.0367, "step": 104920 }, { "epoch": 126.57453228726614, "grad_norm": 3.977912664413452, "learning_rate": 1.999747095012125e-05, "loss": 0.0376, "step": 104930 }, { "epoch": 126.58660229330114, "grad_norm": 4.259322166442871, "learning_rate": 1.999747070886645e-05, "loss": 0.037, "step": 104940 }, { "epoch": 126.59867229933614, "grad_norm": 3.7173173427581787, "learning_rate": 1.9997470467611658e-05, "loss": 0.0362, "step": 104950 }, { "epoch": 126.61074230537115, "grad_norm": 3.946216344833374, "learning_rate": 1.9997470226356864e-05, "loss": 0.0379, "step": 104960 }, { "epoch": 126.62281231140615, "grad_norm": 3.9892373085021973, 
"learning_rate": 1.999746998510207e-05, "loss": 0.0382, "step": 104970 }, { "epoch": 126.63488231744115, "grad_norm": 4.143880844116211, "learning_rate": 1.9997469743847276e-05, "loss": 0.0365, "step": 104980 }, { "epoch": 126.64695232347616, "grad_norm": 3.8074231147766113, "learning_rate": 1.9997469502592483e-05, "loss": 0.0385, "step": 104990 }, { "epoch": 126.65902232951116, "grad_norm": 3.722472906112671, "learning_rate": 1.999746926133769e-05, "loss": 0.0371, "step": 105000 }, { "epoch": 126.65902232951116, "eval_loss": 13.21947956085205, "eval_runtime": 8.1175, "eval_samples_per_second": 85.864, "eval_steps_per_second": 10.841, "step": 105000 }, { "epoch": 126.67109233554616, "grad_norm": 4.224952697753906, "learning_rate": 1.9997469020082895e-05, "loss": 0.0394, "step": 105010 }, { "epoch": 126.68316234158117, "grad_norm": 3.9150497913360596, "learning_rate": 1.99974687788281e-05, "loss": 0.0403, "step": 105020 }, { "epoch": 126.69523234761617, "grad_norm": 3.7687366008758545, "learning_rate": 1.9997468537573308e-05, "loss": 0.0384, "step": 105030 }, { "epoch": 126.70730235365117, "grad_norm": 3.987532377243042, "learning_rate": 1.9997468296318514e-05, "loss": 0.0374, "step": 105040 }, { "epoch": 126.71937235968618, "grad_norm": 4.154472351074219, "learning_rate": 1.999746805506372e-05, "loss": 0.0389, "step": 105050 }, { "epoch": 126.73144236572118, "grad_norm": 4.2290496826171875, "learning_rate": 1.9997467813808926e-05, "loss": 0.0387, "step": 105060 }, { "epoch": 126.74351237175618, "grad_norm": 4.326128959655762, "learning_rate": 1.9997467572554132e-05, "loss": 0.0395, "step": 105070 }, { "epoch": 126.75558237779119, "grad_norm": 3.789950370788574, "learning_rate": 1.999746733129934e-05, "loss": 0.0382, "step": 105080 }, { "epoch": 126.76765238382619, "grad_norm": 4.501706600189209, "learning_rate": 1.9997467090044545e-05, "loss": 0.0402, "step": 105090 }, { "epoch": 126.77972238986119, "grad_norm": 4.048575401306152, "learning_rate": 
1.999746684878975e-05, "loss": 0.0415, "step": 105100 }, { "epoch": 126.7917923958962, "grad_norm": 4.165543079376221, "learning_rate": 1.9997466607534957e-05, "loss": 0.0396, "step": 105110 }, { "epoch": 126.8038624019312, "grad_norm": 3.874908208847046, "learning_rate": 1.9997466366280163e-05, "loss": 0.0402, "step": 105120 }, { "epoch": 126.8159324079662, "grad_norm": 4.1441969871521, "learning_rate": 1.999746612502537e-05, "loss": 0.0386, "step": 105130 }, { "epoch": 126.8280024140012, "grad_norm": 3.9879775047302246, "learning_rate": 1.9997465883770576e-05, "loss": 0.0396, "step": 105140 }, { "epoch": 126.84007242003621, "grad_norm": 3.947923421859741, "learning_rate": 1.9997465642515782e-05, "loss": 0.0418, "step": 105150 }, { "epoch": 126.85214242607121, "grad_norm": 4.055355072021484, "learning_rate": 1.999746540126099e-05, "loss": 0.0377, "step": 105160 }, { "epoch": 126.86421243210621, "grad_norm": 4.3548264503479, "learning_rate": 1.9997465160006195e-05, "loss": 0.0393, "step": 105170 }, { "epoch": 126.87628243814122, "grad_norm": 4.028517723083496, "learning_rate": 1.99974649187514e-05, "loss": 0.0392, "step": 105180 }, { "epoch": 126.88835244417622, "grad_norm": 4.059386253356934, "learning_rate": 1.9997464677496607e-05, "loss": 0.039, "step": 105190 }, { "epoch": 126.90042245021122, "grad_norm": 3.9101450443267822, "learning_rate": 1.9997464436241813e-05, "loss": 0.04, "step": 105200 }, { "epoch": 126.91249245624623, "grad_norm": 3.743612051010132, "learning_rate": 1.999746419498702e-05, "loss": 0.0394, "step": 105210 }, { "epoch": 126.92456246228123, "grad_norm": 4.057277202606201, "learning_rate": 1.9997463953732226e-05, "loss": 0.0407, "step": 105220 }, { "epoch": 126.93663246831623, "grad_norm": 3.900531530380249, "learning_rate": 1.9997463712477432e-05, "loss": 0.041, "step": 105230 }, { "epoch": 126.94870247435124, "grad_norm": 3.9367220401763916, "learning_rate": 1.9997463471222638e-05, "loss": 0.0383, "step": 105240 }, { "epoch": 
126.96077248038624, "grad_norm": 4.242306709289551, "learning_rate": 1.9997463229967844e-05, "loss": 0.0405, "step": 105250 }, { "epoch": 126.97284248642124, "grad_norm": 4.652480125427246, "learning_rate": 1.999746298871305e-05, "loss": 0.0416, "step": 105260 }, { "epoch": 126.98491249245625, "grad_norm": 3.804253339767456, "learning_rate": 1.9997462747458257e-05, "loss": 0.0391, "step": 105270 }, { "epoch": 126.99698249849125, "grad_norm": 4.102553844451904, "learning_rate": 1.9997462506203463e-05, "loss": 0.0428, "step": 105280 }, { "epoch": 127.0084490042245, "grad_norm": 3.0250258445739746, "learning_rate": 1.999746226494867e-05, "loss": 0.0307, "step": 105290 }, { "epoch": 127.0205190102595, "grad_norm": 3.5491318702697754, "learning_rate": 1.9997462023693875e-05, "loss": 0.0256, "step": 105300 }, { "epoch": 127.03258901629451, "grad_norm": 4.374000072479248, "learning_rate": 1.999746178243908e-05, "loss": 0.0304, "step": 105310 }, { "epoch": 127.04465902232951, "grad_norm": 3.6984031200408936, "learning_rate": 1.9997461541184288e-05, "loss": 0.0308, "step": 105320 }, { "epoch": 127.05672902836451, "grad_norm": 3.209815740585327, "learning_rate": 1.9997461299929494e-05, "loss": 0.0276, "step": 105330 }, { "epoch": 127.06879903439952, "grad_norm": 3.791944742202759, "learning_rate": 1.99974610586747e-05, "loss": 0.03, "step": 105340 }, { "epoch": 127.08086904043452, "grad_norm": 3.526400089263916, "learning_rate": 1.9997460817419903e-05, "loss": 0.029, "step": 105350 }, { "epoch": 127.09293904646952, "grad_norm": 3.609755516052246, "learning_rate": 1.999746057616511e-05, "loss": 0.031, "step": 105360 }, { "epoch": 127.10500905250453, "grad_norm": 3.4131033420562744, "learning_rate": 1.9997460334910315e-05, "loss": 0.0292, "step": 105370 }, { "epoch": 127.11707905853953, "grad_norm": 3.3170454502105713, "learning_rate": 1.9997460093655522e-05, "loss": 0.031, "step": 105380 }, { "epoch": 127.12914906457453, "grad_norm": 3.448610544204712, "learning_rate": 
1.9997459852400728e-05, "loss": 0.0323, "step": 105390 }, { "epoch": 127.14121907060954, "grad_norm": 3.8906188011169434, "learning_rate": 1.9997459611145934e-05, "loss": 0.0336, "step": 105400 }, { "epoch": 127.15328907664454, "grad_norm": 3.6119017601013184, "learning_rate": 1.999745936989114e-05, "loss": 0.0339, "step": 105410 }, { "epoch": 127.16535908267954, "grad_norm": 3.7609543800354004, "learning_rate": 1.9997459128636347e-05, "loss": 0.0318, "step": 105420 }, { "epoch": 127.17742908871455, "grad_norm": 4.020812511444092, "learning_rate": 1.9997458887381553e-05, "loss": 0.035, "step": 105430 }, { "epoch": 127.18949909474955, "grad_norm": 3.3947672843933105, "learning_rate": 1.999745864612676e-05, "loss": 0.0347, "step": 105440 }, { "epoch": 127.20156910078455, "grad_norm": 3.7990665435791016, "learning_rate": 1.9997458404871965e-05, "loss": 0.0331, "step": 105450 }, { "epoch": 127.21363910681956, "grad_norm": 3.602625846862793, "learning_rate": 1.999745816361717e-05, "loss": 0.0313, "step": 105460 }, { "epoch": 127.22570911285456, "grad_norm": 3.822829246520996, "learning_rate": 1.9997457922362378e-05, "loss": 0.034, "step": 105470 }, { "epoch": 127.23777911888956, "grad_norm": 3.785083770751953, "learning_rate": 1.9997457681107584e-05, "loss": 0.0331, "step": 105480 }, { "epoch": 127.24984912492457, "grad_norm": 3.821129322052002, "learning_rate": 1.999745743985279e-05, "loss": 0.0344, "step": 105490 }, { "epoch": 127.26191913095957, "grad_norm": 3.510563373565674, "learning_rate": 1.9997457198597996e-05, "loss": 0.0354, "step": 105500 }, { "epoch": 127.26191913095957, "eval_loss": 13.223978996276855, "eval_runtime": 8.1288, "eval_samples_per_second": 85.744, "eval_steps_per_second": 10.826, "step": 105500 }, { "epoch": 127.27398913699457, "grad_norm": 3.346865177154541, "learning_rate": 1.9997456957343203e-05, "loss": 0.0337, "step": 105510 }, { "epoch": 127.28605914302958, "grad_norm": 3.9146413803100586, "learning_rate": 1.999745671608841e-05, "loss": 
0.0358, "step": 105520 }, { "epoch": 127.29812914906458, "grad_norm": 3.8317794799804688, "learning_rate": 1.9997456474833615e-05, "loss": 0.0358, "step": 105530 }, { "epoch": 127.31019915509958, "grad_norm": 3.60233211517334, "learning_rate": 1.999745623357882e-05, "loss": 0.0355, "step": 105540 }, { "epoch": 127.32226916113459, "grad_norm": 3.406189441680908, "learning_rate": 1.9997455992324027e-05, "loss": 0.0355, "step": 105550 }, { "epoch": 127.33433916716959, "grad_norm": 4.200047969818115, "learning_rate": 1.9997455751069234e-05, "loss": 0.0342, "step": 105560 }, { "epoch": 127.34640917320459, "grad_norm": 3.85675311088562, "learning_rate": 1.999745550981444e-05, "loss": 0.0361, "step": 105570 }, { "epoch": 127.3584791792396, "grad_norm": 4.04126501083374, "learning_rate": 1.9997455268559646e-05, "loss": 0.0353, "step": 105580 }, { "epoch": 127.3705491852746, "grad_norm": 3.7039997577667236, "learning_rate": 1.9997455027304852e-05, "loss": 0.0343, "step": 105590 }, { "epoch": 127.3826191913096, "grad_norm": 3.8251214027404785, "learning_rate": 1.9997454786050055e-05, "loss": 0.0364, "step": 105600 }, { "epoch": 127.3946891973446, "grad_norm": 3.8841609954833984, "learning_rate": 1.999745454479526e-05, "loss": 0.0346, "step": 105610 }, { "epoch": 127.40675920337961, "grad_norm": 3.8714566230773926, "learning_rate": 1.9997454303540467e-05, "loss": 0.0353, "step": 105620 }, { "epoch": 127.41882920941461, "grad_norm": 3.794137954711914, "learning_rate": 1.9997454062285674e-05, "loss": 0.0355, "step": 105630 }, { "epoch": 127.43089921544961, "grad_norm": 4.129099369049072, "learning_rate": 1.999745382103088e-05, "loss": 0.0374, "step": 105640 }, { "epoch": 127.44296922148462, "grad_norm": 3.6607141494750977, "learning_rate": 1.9997453579776086e-05, "loss": 0.0343, "step": 105650 }, { "epoch": 127.45503922751962, "grad_norm": 3.6196882724761963, "learning_rate": 1.9997453338521292e-05, "loss": 0.0369, "step": 105660 }, { "epoch": 127.46710923355462, "grad_norm": 
3.6427321434020996, "learning_rate": 1.99974530972665e-05, "loss": 0.0363, "step": 105670 }, { "epoch": 127.47917923958963, "grad_norm": 3.656543254852295, "learning_rate": 1.9997452856011705e-05, "loss": 0.0367, "step": 105680 }, { "epoch": 127.49124924562463, "grad_norm": 4.192199230194092, "learning_rate": 1.999745261475691e-05, "loss": 0.0381, "step": 105690 }, { "epoch": 127.50331925165963, "grad_norm": 3.6178412437438965, "learning_rate": 1.9997452373502117e-05, "loss": 0.0348, "step": 105700 }, { "epoch": 127.51538925769462, "grad_norm": 4.216136932373047, "learning_rate": 1.9997452132247327e-05, "loss": 0.0374, "step": 105710 }, { "epoch": 127.52745926372963, "grad_norm": 3.447960138320923, "learning_rate": 1.9997451890992533e-05, "loss": 0.0362, "step": 105720 }, { "epoch": 127.53952926976463, "grad_norm": 3.756300926208496, "learning_rate": 1.999745164973774e-05, "loss": 0.0376, "step": 105730 }, { "epoch": 127.55159927579963, "grad_norm": 3.7379696369171143, "learning_rate": 1.9997451408482945e-05, "loss": 0.0373, "step": 105740 }, { "epoch": 127.56366928183463, "grad_norm": 3.9144177436828613, "learning_rate": 1.999745116722815e-05, "loss": 0.038, "step": 105750 }, { "epoch": 127.57573928786964, "grad_norm": 3.5486063957214355, "learning_rate": 1.9997450925973358e-05, "loss": 0.0367, "step": 105760 }, { "epoch": 127.58780929390464, "grad_norm": 4.312173843383789, "learning_rate": 1.999745068471856e-05, "loss": 0.0375, "step": 105770 }, { "epoch": 127.59987929993964, "grad_norm": 3.784712791442871, "learning_rate": 1.9997450443463767e-05, "loss": 0.0381, "step": 105780 }, { "epoch": 127.61194930597465, "grad_norm": 3.524336099624634, "learning_rate": 1.9997450202208973e-05, "loss": 0.0373, "step": 105790 }, { "epoch": 127.62401931200965, "grad_norm": 3.8359439373016357, "learning_rate": 1.999744996095418e-05, "loss": 0.037, "step": 105800 }, { "epoch": 127.63608931804465, "grad_norm": 4.409093856811523, "learning_rate": 1.9997449719699386e-05, "loss": 
0.0365, "step": 105810 }, { "epoch": 127.64815932407966, "grad_norm": 4.060808181762695, "learning_rate": 1.9997449478444592e-05, "loss": 0.0393, "step": 105820 }, { "epoch": 127.66022933011466, "grad_norm": 3.982482671737671, "learning_rate": 1.9997449237189798e-05, "loss": 0.0374, "step": 105830 }, { "epoch": 127.67229933614966, "grad_norm": 3.729369878768921, "learning_rate": 1.9997448995935004e-05, "loss": 0.0381, "step": 105840 }, { "epoch": 127.68436934218467, "grad_norm": 4.45756196975708, "learning_rate": 1.999744875468021e-05, "loss": 0.038, "step": 105850 }, { "epoch": 127.69643934821967, "grad_norm": 3.808323383331299, "learning_rate": 1.9997448513425417e-05, "loss": 0.039, "step": 105860 }, { "epoch": 127.70850935425467, "grad_norm": 4.124446868896484, "learning_rate": 1.9997448272170623e-05, "loss": 0.0384, "step": 105870 }, { "epoch": 127.72057936028968, "grad_norm": 4.425503253936768, "learning_rate": 1.999744803091583e-05, "loss": 0.0375, "step": 105880 }, { "epoch": 127.73264936632468, "grad_norm": 4.146828651428223, "learning_rate": 1.9997447789661035e-05, "loss": 0.0403, "step": 105890 }, { "epoch": 127.74471937235968, "grad_norm": 4.19843864440918, "learning_rate": 1.999744754840624e-05, "loss": 0.0396, "step": 105900 }, { "epoch": 127.75678937839469, "grad_norm": 4.270370960235596, "learning_rate": 1.9997447307151448e-05, "loss": 0.0398, "step": 105910 }, { "epoch": 127.76885938442969, "grad_norm": 3.872523546218872, "learning_rate": 1.9997447065896654e-05, "loss": 0.0401, "step": 105920 }, { "epoch": 127.78092939046469, "grad_norm": 4.402710914611816, "learning_rate": 1.999744682464186e-05, "loss": 0.0385, "step": 105930 }, { "epoch": 127.7929993964997, "grad_norm": 3.928279399871826, "learning_rate": 1.9997446583387066e-05, "loss": 0.0369, "step": 105940 }, { "epoch": 127.8050694025347, "grad_norm": 3.9713425636291504, "learning_rate": 1.9997446342132273e-05, "loss": 0.0354, "step": 105950 }, { "epoch": 127.8171394085697, "grad_norm": 
4.562175750732422, "learning_rate": 1.999744610087748e-05, "loss": 0.0402, "step": 105960 }, { "epoch": 127.8292094146047, "grad_norm": 3.877537965774536, "learning_rate": 1.9997445859622685e-05, "loss": 0.0385, "step": 105970 }, { "epoch": 127.84127942063971, "grad_norm": 3.816357135772705, "learning_rate": 1.999744561836789e-05, "loss": 0.0385, "step": 105980 }, { "epoch": 127.85334942667471, "grad_norm": 3.941638946533203, "learning_rate": 1.9997445377113097e-05, "loss": 0.0389, "step": 105990 }, { "epoch": 127.86541943270971, "grad_norm": 3.8010356426239014, "learning_rate": 1.9997445135858304e-05, "loss": 0.039, "step": 106000 }, { "epoch": 127.86541943270971, "eval_loss": 13.229677200317383, "eval_runtime": 8.1296, "eval_samples_per_second": 85.736, "eval_steps_per_second": 10.825, "step": 106000 }, { "epoch": 127.87748943874472, "grad_norm": 4.090133190155029, "learning_rate": 1.999744489460351e-05, "loss": 0.0382, "step": 106010 }, { "epoch": 127.88955944477972, "grad_norm": 3.8262948989868164, "learning_rate": 1.9997444653348713e-05, "loss": 0.0384, "step": 106020 }, { "epoch": 127.90162945081472, "grad_norm": 4.081496715545654, "learning_rate": 1.999744441209392e-05, "loss": 0.04, "step": 106030 }, { "epoch": 127.91369945684973, "grad_norm": 4.322754859924316, "learning_rate": 1.9997444170839125e-05, "loss": 0.0379, "step": 106040 }, { "epoch": 127.92576946288473, "grad_norm": 3.6590793132781982, "learning_rate": 1.999744392958433e-05, "loss": 0.0389, "step": 106050 }, { "epoch": 127.93783946891973, "grad_norm": 4.1588358879089355, "learning_rate": 1.9997443688329538e-05, "loss": 0.0408, "step": 106060 }, { "epoch": 127.94990947495474, "grad_norm": 4.257883071899414, "learning_rate": 1.9997443447074744e-05, "loss": 0.0395, "step": 106070 }, { "epoch": 127.96197948098974, "grad_norm": 3.779195785522461, "learning_rate": 1.999744320581995e-05, "loss": 0.0394, "step": 106080 }, { "epoch": 127.97404948702474, "grad_norm": 3.8614537715911865, "learning_rate": 
1.9997442964565156e-05, "loss": 0.039, "step": 106090 }, { "epoch": 127.98611949305975, "grad_norm": 4.235400199890137, "learning_rate": 1.9997442723310362e-05, "loss": 0.0403, "step": 106100 }, { "epoch": 127.99818949909475, "grad_norm": 4.120976448059082, "learning_rate": 1.999744248205557e-05, "loss": 0.0416, "step": 106110 }, { "epoch": 128.009656004828, "grad_norm": 3.620523691177368, "learning_rate": 1.9997442240800775e-05, "loss": 0.032, "step": 106120 }, { "epoch": 128.021726010863, "grad_norm": 3.1977269649505615, "learning_rate": 1.999744199954598e-05, "loss": 0.0275, "step": 106130 }, { "epoch": 128.033796016898, "grad_norm": 3.407205104827881, "learning_rate": 1.9997441758291187e-05, "loss": 0.0289, "step": 106140 }, { "epoch": 128.045866022933, "grad_norm": 3.762573719024658, "learning_rate": 1.9997441517036394e-05, "loss": 0.0335, "step": 106150 }, { "epoch": 128.05793602896802, "grad_norm": 3.764331102371216, "learning_rate": 1.99974412757816e-05, "loss": 0.0304, "step": 106160 }, { "epoch": 128.07000603500302, "grad_norm": 3.817950963973999, "learning_rate": 1.9997441034526806e-05, "loss": 0.0319, "step": 106170 }, { "epoch": 128.08207604103802, "grad_norm": 3.542853593826294, "learning_rate": 1.9997440793272012e-05, "loss": 0.0289, "step": 106180 }, { "epoch": 128.09414604707302, "grad_norm": 3.3725476264953613, "learning_rate": 1.999744055201722e-05, "loss": 0.0325, "step": 106190 }, { "epoch": 128.10621605310803, "grad_norm": 3.902345657348633, "learning_rate": 1.9997440310762425e-05, "loss": 0.0327, "step": 106200 }, { "epoch": 128.11828605914303, "grad_norm": 3.6051785945892334, "learning_rate": 1.999744006950763e-05, "loss": 0.0314, "step": 106210 }, { "epoch": 128.13035606517803, "grad_norm": 3.7055273056030273, "learning_rate": 1.9997439828252837e-05, "loss": 0.0331, "step": 106220 }, { "epoch": 128.14242607121304, "grad_norm": 3.563387393951416, "learning_rate": 1.9997439586998043e-05, "loss": 0.03, "step": 106230 }, { "epoch": 
128.15449607724804, "grad_norm": 3.741122007369995, "learning_rate": 1.999743934574325e-05, "loss": 0.0312, "step": 106240 }, { "epoch": 128.16656608328304, "grad_norm": 3.476752281188965, "learning_rate": 1.9997439104488456e-05, "loss": 0.0309, "step": 106250 }, { "epoch": 128.17863608931805, "grad_norm": 3.247579574584961, "learning_rate": 1.9997438863233662e-05, "loss": 0.0329, "step": 106260 }, { "epoch": 128.19070609535305, "grad_norm": 3.73722243309021, "learning_rate": 1.9997438621978868e-05, "loss": 0.0356, "step": 106270 }, { "epoch": 128.20277610138805, "grad_norm": 3.7438788414001465, "learning_rate": 1.9997438380724074e-05, "loss": 0.0343, "step": 106280 }, { "epoch": 128.21484610742306, "grad_norm": 3.574286937713623, "learning_rate": 1.999743813946928e-05, "loss": 0.0358, "step": 106290 }, { "epoch": 128.22691611345806, "grad_norm": 4.060514450073242, "learning_rate": 1.9997437898214487e-05, "loss": 0.0349, "step": 106300 }, { "epoch": 128.23898611949306, "grad_norm": 2.920351028442383, "learning_rate": 1.9997437656959693e-05, "loss": 0.0337, "step": 106310 }, { "epoch": 128.25105612552807, "grad_norm": 3.8006718158721924, "learning_rate": 1.99974374157049e-05, "loss": 0.0332, "step": 106320 }, { "epoch": 128.26312613156307, "grad_norm": 3.6441032886505127, "learning_rate": 1.9997437174450105e-05, "loss": 0.034, "step": 106330 }, { "epoch": 128.27519613759807, "grad_norm": 3.642228364944458, "learning_rate": 1.999743693319531e-05, "loss": 0.0327, "step": 106340 }, { "epoch": 128.28726614363308, "grad_norm": 3.6598548889160156, "learning_rate": 1.9997436691940518e-05, "loss": 0.035, "step": 106350 }, { "epoch": 128.29933614966808, "grad_norm": 3.7866432666778564, "learning_rate": 1.9997436450685724e-05, "loss": 0.0338, "step": 106360 }, { "epoch": 128.31140615570308, "grad_norm": 3.6374824047088623, "learning_rate": 1.999743620943093e-05, "loss": 0.0346, "step": 106370 }, { "epoch": 128.32347616173809, "grad_norm": 4.031858444213867, "learning_rate": 
1.9997435968176136e-05, "loss": 0.0362, "step": 106380 }, { "epoch": 128.3355461677731, "grad_norm": 3.6017353534698486, "learning_rate": 1.9997435726921343e-05, "loss": 0.0334, "step": 106390 }, { "epoch": 128.3476161738081, "grad_norm": 3.225700616836548, "learning_rate": 1.999743548566655e-05, "loss": 0.035, "step": 106400 }, { "epoch": 128.3596861798431, "grad_norm": 3.8732264041900635, "learning_rate": 1.9997435244411755e-05, "loss": 0.0348, "step": 106410 }, { "epoch": 128.3717561858781, "grad_norm": 3.3421921730041504, "learning_rate": 1.999743500315696e-05, "loss": 0.034, "step": 106420 }, { "epoch": 128.3838261919131, "grad_norm": 3.706026554107666, "learning_rate": 1.9997434761902164e-05, "loss": 0.0345, "step": 106430 }, { "epoch": 128.3958961979481, "grad_norm": 3.6915977001190186, "learning_rate": 1.999743452064737e-05, "loss": 0.0337, "step": 106440 }, { "epoch": 128.4079662039831, "grad_norm": 3.8054733276367188, "learning_rate": 1.9997434279392577e-05, "loss": 0.0346, "step": 106450 }, { "epoch": 128.4200362100181, "grad_norm": 3.8203234672546387, "learning_rate": 1.9997434038137783e-05, "loss": 0.0355, "step": 106460 }, { "epoch": 128.4321062160531, "grad_norm": 3.44744610786438, "learning_rate": 1.999743379688299e-05, "loss": 0.0344, "step": 106470 }, { "epoch": 128.44417622208812, "grad_norm": 3.570030689239502, "learning_rate": 1.9997433555628195e-05, "loss": 0.0365, "step": 106480 }, { "epoch": 128.45624622812312, "grad_norm": 4.437401294708252, "learning_rate": 1.99974333143734e-05, "loss": 0.0381, "step": 106490 }, { "epoch": 128.46831623415812, "grad_norm": 4.100881576538086, "learning_rate": 1.9997433073118608e-05, "loss": 0.0378, "step": 106500 }, { "epoch": 128.46831623415812, "eval_loss": 13.23456859588623, "eval_runtime": 8.1259, "eval_samples_per_second": 85.775, "eval_steps_per_second": 10.83, "step": 106500 }, { "epoch": 128.48038624019313, "grad_norm": 3.8640248775482178, "learning_rate": 1.9997432831863814e-05, "loss": 0.0349, 
"step": 106510 }, { "epoch": 128.49245624622813, "grad_norm": 3.9478368759155273, "learning_rate": 1.999743259060902e-05, "loss": 0.0378, "step": 106520 }, { "epoch": 128.50452625226313, "grad_norm": 3.9457192420959473, "learning_rate": 1.9997432349354226e-05, "loss": 0.0374, "step": 106530 }, { "epoch": 128.51659625829814, "grad_norm": 3.6464881896972656, "learning_rate": 1.9997432108099433e-05, "loss": 0.0346, "step": 106540 }, { "epoch": 128.52866626433314, "grad_norm": 3.870703935623169, "learning_rate": 1.999743186684464e-05, "loss": 0.0366, "step": 106550 }, { "epoch": 128.54073627036814, "grad_norm": 3.721479654312134, "learning_rate": 1.9997431625589845e-05, "loss": 0.0355, "step": 106560 }, { "epoch": 128.55280627640315, "grad_norm": 3.969998598098755, "learning_rate": 1.999743138433505e-05, "loss": 0.0381, "step": 106570 }, { "epoch": 128.56487628243815, "grad_norm": 4.056046009063721, "learning_rate": 1.9997431143080257e-05, "loss": 0.0371, "step": 106580 }, { "epoch": 128.57694628847315, "grad_norm": 4.044219970703125, "learning_rate": 1.9997430901825464e-05, "loss": 0.0365, "step": 106590 }, { "epoch": 128.58901629450816, "grad_norm": 3.796543598175049, "learning_rate": 1.999743066057067e-05, "loss": 0.0384, "step": 106600 }, { "epoch": 128.60108630054316, "grad_norm": 3.595862865447998, "learning_rate": 1.9997430419315876e-05, "loss": 0.0378, "step": 106610 }, { "epoch": 128.61315630657816, "grad_norm": 3.9051852226257324, "learning_rate": 1.9997430178061082e-05, "loss": 0.0377, "step": 106620 }, { "epoch": 128.62522631261317, "grad_norm": 3.9597086906433105, "learning_rate": 1.999742993680629e-05, "loss": 0.036, "step": 106630 }, { "epoch": 128.63729631864817, "grad_norm": 3.707063674926758, "learning_rate": 1.9997429695551495e-05, "loss": 0.037, "step": 106640 }, { "epoch": 128.64936632468317, "grad_norm": 4.12360954284668, "learning_rate": 1.99974294542967e-05, "loss": 0.0397, "step": 106650 }, { "epoch": 128.66143633071817, "grad_norm": 
3.9174416065216064, "learning_rate": 1.9997429213041907e-05, "loss": 0.038, "step": 106660 }, { "epoch": 128.67350633675318, "grad_norm": 4.100393772125244, "learning_rate": 1.9997428971787113e-05, "loss": 0.0374, "step": 106670 }, { "epoch": 128.68557634278818, "grad_norm": 4.19284200668335, "learning_rate": 1.9997428730532316e-05, "loss": 0.039, "step": 106680 }, { "epoch": 128.69764634882318, "grad_norm": 4.037303447723389, "learning_rate": 1.9997428489277522e-05, "loss": 0.0377, "step": 106690 }, { "epoch": 128.7097163548582, "grad_norm": 4.183250904083252, "learning_rate": 1.999742824802273e-05, "loss": 0.038, "step": 106700 }, { "epoch": 128.7217863608932, "grad_norm": 4.008931636810303, "learning_rate": 1.9997428006767935e-05, "loss": 0.0376, "step": 106710 }, { "epoch": 128.7338563669282, "grad_norm": 3.6734819412231445, "learning_rate": 1.999742776551314e-05, "loss": 0.0386, "step": 106720 }, { "epoch": 128.7459263729632, "grad_norm": 3.894526720046997, "learning_rate": 1.9997427524258347e-05, "loss": 0.0386, "step": 106730 }, { "epoch": 128.7579963789982, "grad_norm": 3.884938955307007, "learning_rate": 1.9997427283003553e-05, "loss": 0.0369, "step": 106740 }, { "epoch": 128.7700663850332, "grad_norm": 4.091691017150879, "learning_rate": 1.999742704174876e-05, "loss": 0.0396, "step": 106750 }, { "epoch": 128.7821363910682, "grad_norm": 3.714036464691162, "learning_rate": 1.9997426800493966e-05, "loss": 0.0394, "step": 106760 }, { "epoch": 128.7942063971032, "grad_norm": 4.0579681396484375, "learning_rate": 1.9997426559239172e-05, "loss": 0.037, "step": 106770 }, { "epoch": 128.8062764031382, "grad_norm": 4.176459312438965, "learning_rate": 1.999742631798438e-05, "loss": 0.039, "step": 106780 }, { "epoch": 128.81834640917322, "grad_norm": 4.3413825035095215, "learning_rate": 1.9997426076729588e-05, "loss": 0.0384, "step": 106790 }, { "epoch": 128.83041641520822, "grad_norm": 3.9501218795776367, "learning_rate": 1.9997425835474794e-05, "loss": 0.0382, 
"step": 106800 }, { "epoch": 128.84248642124322, "grad_norm": 4.447125434875488, "learning_rate": 1.999742559422e-05, "loss": 0.0405, "step": 106810 }, { "epoch": 128.85455642727823, "grad_norm": 4.084684371948242, "learning_rate": 1.9997425352965207e-05, "loss": 0.0387, "step": 106820 }, { "epoch": 128.86662643331323, "grad_norm": 4.201877593994141, "learning_rate": 1.9997425111710413e-05, "loss": 0.0388, "step": 106830 }, { "epoch": 128.87869643934823, "grad_norm": 3.638167381286621, "learning_rate": 1.999742487045562e-05, "loss": 0.0401, "step": 106840 }, { "epoch": 128.89076644538324, "grad_norm": 3.956766366958618, "learning_rate": 1.9997424629200822e-05, "loss": 0.0405, "step": 106850 }, { "epoch": 128.90283645141824, "grad_norm": 4.225254058837891, "learning_rate": 1.9997424387946028e-05, "loss": 0.0392, "step": 106860 }, { "epoch": 128.91490645745324, "grad_norm": 4.1307291984558105, "learning_rate": 1.9997424146691234e-05, "loss": 0.0394, "step": 106870 }, { "epoch": 128.92697646348824, "grad_norm": 4.3373894691467285, "learning_rate": 1.999742390543644e-05, "loss": 0.0356, "step": 106880 }, { "epoch": 128.93904646952325, "grad_norm": 3.985267400741577, "learning_rate": 1.9997423664181647e-05, "loss": 0.04, "step": 106890 }, { "epoch": 128.95111647555825, "grad_norm": 4.230645179748535, "learning_rate": 1.9997423422926853e-05, "loss": 0.0391, "step": 106900 }, { "epoch": 128.96318648159325, "grad_norm": 4.642493724822998, "learning_rate": 1.999742318167206e-05, "loss": 0.0391, "step": 106910 }, { "epoch": 128.97525648762826, "grad_norm": 3.906426191329956, "learning_rate": 1.9997422940417265e-05, "loss": 0.0392, "step": 106920 }, { "epoch": 128.98732649366326, "grad_norm": 4.309357166290283, "learning_rate": 1.999742269916247e-05, "loss": 0.0403, "step": 106930 }, { "epoch": 128.99939649969826, "grad_norm": 3.759989023208618, "learning_rate": 1.9997422457907678e-05, "loss": 0.041, "step": 106940 }, { "epoch": 129.0108630054315, "grad_norm": 
3.2814714908599854, "learning_rate": 1.9997422216652884e-05, "loss": 0.0267, "step": 106950 }, { "epoch": 129.0229330114665, "grad_norm": 3.6060404777526855, "learning_rate": 1.999742197539809e-05, "loss": 0.0258, "step": 106960 }, { "epoch": 129.0350030175015, "grad_norm": 3.785395383834839, "learning_rate": 1.9997421734143296e-05, "loss": 0.0285, "step": 106970 }, { "epoch": 129.0470730235365, "grad_norm": 3.425898551940918, "learning_rate": 1.9997421492888503e-05, "loss": 0.0286, "step": 106980 }, { "epoch": 129.05914302957152, "grad_norm": 3.587158441543579, "learning_rate": 1.999742125163371e-05, "loss": 0.0297, "step": 106990 }, { "epoch": 129.07121303560652, "grad_norm": 3.011753797531128, "learning_rate": 1.9997421010378915e-05, "loss": 0.029, "step": 107000 }, { "epoch": 129.07121303560652, "eval_loss": 13.236371994018555, "eval_runtime": 8.1174, "eval_samples_per_second": 85.865, "eval_steps_per_second": 10.841, "step": 107000 }, { "epoch": 129.08328304164152, "grad_norm": 3.7427377700805664, "learning_rate": 1.999742076912412e-05, "loss": 0.0294, "step": 107010 }, { "epoch": 129.09535304767653, "grad_norm": 3.4153354167938232, "learning_rate": 1.9997420527869327e-05, "loss": 0.0312, "step": 107020 }, { "epoch": 129.10742305371153, "grad_norm": 3.25234055519104, "learning_rate": 1.9997420286614534e-05, "loss": 0.0301, "step": 107030 }, { "epoch": 129.11949305974653, "grad_norm": 3.848102331161499, "learning_rate": 1.999742004535974e-05, "loss": 0.0331, "step": 107040 }, { "epoch": 129.13156306578153, "grad_norm": 3.1963613033294678, "learning_rate": 1.9997419804104946e-05, "loss": 0.0326, "step": 107050 }, { "epoch": 129.14363307181654, "grad_norm": 3.6754705905914307, "learning_rate": 1.9997419562850152e-05, "loss": 0.0333, "step": 107060 }, { "epoch": 129.15570307785154, "grad_norm": 3.500598907470703, "learning_rate": 1.999741932159536e-05, "loss": 0.0317, "step": 107070 }, { "epoch": 129.16777308388654, "grad_norm": 3.430286407470703, "learning_rate": 
1.9997419080340565e-05, "loss": 0.0322, "step": 107080 }, { "epoch": 129.17984308992155, "grad_norm": 3.54921555519104, "learning_rate": 1.999741883908577e-05, "loss": 0.0347, "step": 107090 }, { "epoch": 129.19191309595655, "grad_norm": 3.688532590866089, "learning_rate": 1.9997418597830974e-05, "loss": 0.0326, "step": 107100 }, { "epoch": 129.20398310199155, "grad_norm": 3.4601101875305176, "learning_rate": 1.999741835657618e-05, "loss": 0.0314, "step": 107110 }, { "epoch": 129.21605310802656, "grad_norm": 3.482274293899536, "learning_rate": 1.9997418115321386e-05, "loss": 0.0324, "step": 107120 }, { "epoch": 129.22812311406156, "grad_norm": 3.6427154541015625, "learning_rate": 1.9997417874066592e-05, "loss": 0.0334, "step": 107130 }, { "epoch": 129.24019312009656, "grad_norm": 3.335888385772705, "learning_rate": 1.99974176328118e-05, "loss": 0.0309, "step": 107140 }, { "epoch": 129.25226312613157, "grad_norm": 3.345783233642578, "learning_rate": 1.9997417391557005e-05, "loss": 0.0345, "step": 107150 }, { "epoch": 129.26433313216657, "grad_norm": 3.61519455909729, "learning_rate": 1.999741715030221e-05, "loss": 0.0356, "step": 107160 }, { "epoch": 129.27640313820157, "grad_norm": 3.6770248413085938, "learning_rate": 1.9997416909047417e-05, "loss": 0.0336, "step": 107170 }, { "epoch": 129.28847314423658, "grad_norm": 4.015854835510254, "learning_rate": 1.9997416667792624e-05, "loss": 0.0347, "step": 107180 }, { "epoch": 129.30054315027158, "grad_norm": 3.597231388092041, "learning_rate": 1.999741642653783e-05, "loss": 0.034, "step": 107190 }, { "epoch": 129.31261315630658, "grad_norm": 3.601647138595581, "learning_rate": 1.9997416185283036e-05, "loss": 0.0334, "step": 107200 }, { "epoch": 129.32468316234159, "grad_norm": 4.24143123626709, "learning_rate": 1.9997415944028242e-05, "loss": 0.0343, "step": 107210 }, { "epoch": 129.3367531683766, "grad_norm": 4.344263076782227, "learning_rate": 1.999741570277345e-05, "loss": 0.034, "step": 107220 }, { "epoch": 
129.3488231744116, "grad_norm": 3.783318281173706, "learning_rate": 1.9997415461518655e-05, "loss": 0.033, "step": 107230 }, { "epoch": 129.3608931804466, "grad_norm": 4.104169845581055, "learning_rate": 1.999741522026386e-05, "loss": 0.0381, "step": 107240 }, { "epoch": 129.3729631864816, "grad_norm": 3.6000993251800537, "learning_rate": 1.9997414979009067e-05, "loss": 0.0352, "step": 107250 }, { "epoch": 129.3850331925166, "grad_norm": 3.425969362258911, "learning_rate": 1.9997414737754273e-05, "loss": 0.0368, "step": 107260 }, { "epoch": 129.3971031985516, "grad_norm": 3.742663860321045, "learning_rate": 1.999741449649948e-05, "loss": 0.0358, "step": 107270 }, { "epoch": 129.4091732045866, "grad_norm": 3.6623952388763428, "learning_rate": 1.9997414255244686e-05, "loss": 0.0354, "step": 107280 }, { "epoch": 129.4212432106216, "grad_norm": 3.4578757286071777, "learning_rate": 1.9997414013989892e-05, "loss": 0.0346, "step": 107290 }, { "epoch": 129.43331321665661, "grad_norm": 3.938404083251953, "learning_rate": 1.9997413772735098e-05, "loss": 0.0368, "step": 107300 }, { "epoch": 129.44538322269162, "grad_norm": 4.017608165740967, "learning_rate": 1.9997413531480304e-05, "loss": 0.0361, "step": 107310 }, { "epoch": 129.45745322872662, "grad_norm": 3.9934496879577637, "learning_rate": 1.999741329022551e-05, "loss": 0.0362, "step": 107320 }, { "epoch": 129.46952323476162, "grad_norm": 3.7229220867156982, "learning_rate": 1.9997413048970717e-05, "loss": 0.0342, "step": 107330 }, { "epoch": 129.48159324079663, "grad_norm": 3.741126298904419, "learning_rate": 1.9997412807715923e-05, "loss": 0.0363, "step": 107340 }, { "epoch": 129.49366324683163, "grad_norm": 3.950167655944824, "learning_rate": 1.999741256646113e-05, "loss": 0.036, "step": 107350 }, { "epoch": 129.50573325286663, "grad_norm": 3.9353268146514893, "learning_rate": 1.9997412325206335e-05, "loss": 0.0355, "step": 107360 }, { "epoch": 129.51780325890164, "grad_norm": 3.9526939392089844, "learning_rate": 
1.999741208395154e-05, "loss": 0.0357, "step": 107370 }, { "epoch": 129.52987326493664, "grad_norm": 3.9711170196533203, "learning_rate": 1.9997411842696748e-05, "loss": 0.0382, "step": 107380 }, { "epoch": 129.54194327097164, "grad_norm": 4.100218772888184, "learning_rate": 1.9997411601441954e-05, "loss": 0.0377, "step": 107390 }, { "epoch": 129.55401327700665, "grad_norm": 3.79697847366333, "learning_rate": 1.999741136018716e-05, "loss": 0.0362, "step": 107400 }, { "epoch": 129.56608328304165, "grad_norm": 3.8655059337615967, "learning_rate": 1.9997411118932366e-05, "loss": 0.0369, "step": 107410 }, { "epoch": 129.57815328907665, "grad_norm": 3.9783642292022705, "learning_rate": 1.9997410877677573e-05, "loss": 0.0372, "step": 107420 }, { "epoch": 129.59022329511166, "grad_norm": 3.8470208644866943, "learning_rate": 1.999741063642278e-05, "loss": 0.0374, "step": 107430 }, { "epoch": 129.60229330114666, "grad_norm": 3.8407697677612305, "learning_rate": 1.9997410395167985e-05, "loss": 0.0361, "step": 107440 }, { "epoch": 129.61436330718166, "grad_norm": 4.018423557281494, "learning_rate": 1.999741015391319e-05, "loss": 0.0371, "step": 107450 }, { "epoch": 129.62643331321667, "grad_norm": 3.9905571937561035, "learning_rate": 1.9997409912658398e-05, "loss": 0.0365, "step": 107460 }, { "epoch": 129.63850331925167, "grad_norm": 3.878638982772827, "learning_rate": 1.9997409671403604e-05, "loss": 0.036, "step": 107470 }, { "epoch": 129.65057332528667, "grad_norm": 3.697707176208496, "learning_rate": 1.999740943014881e-05, "loss": 0.0359, "step": 107480 }, { "epoch": 129.66264333132168, "grad_norm": 4.044388771057129, "learning_rate": 1.9997409188894016e-05, "loss": 0.0364, "step": 107490 }, { "epoch": 129.67471333735668, "grad_norm": 4.09474515914917, "learning_rate": 1.9997408947639222e-05, "loss": 0.0368, "step": 107500 }, { "epoch": 129.67471333735668, "eval_loss": 13.252025604248047, "eval_runtime": 8.1285, "eval_samples_per_second": 85.748, "eval_steps_per_second": 
10.826, "step": 107500 }, { "epoch": 129.68678334339168, "grad_norm": 4.384340763092041, "learning_rate": 1.9997408706384425e-05, "loss": 0.0371, "step": 107510 }, { "epoch": 129.69885334942668, "grad_norm": 3.8686556816101074, "learning_rate": 1.999740846512963e-05, "loss": 0.0384, "step": 107520 }, { "epoch": 129.7109233554617, "grad_norm": 3.8198459148406982, "learning_rate": 1.9997408223874838e-05, "loss": 0.0353, "step": 107530 }, { "epoch": 129.7229933614967, "grad_norm": 3.7862517833709717, "learning_rate": 1.9997407982620044e-05, "loss": 0.0377, "step": 107540 }, { "epoch": 129.7350633675317, "grad_norm": 3.8046321868896484, "learning_rate": 1.999740774136525e-05, "loss": 0.0375, "step": 107550 }, { "epoch": 129.7471333735667, "grad_norm": 4.281036376953125, "learning_rate": 1.9997407500110456e-05, "loss": 0.0391, "step": 107560 }, { "epoch": 129.7592033796017, "grad_norm": 3.8367669582366943, "learning_rate": 1.9997407258855663e-05, "loss": 0.0375, "step": 107570 }, { "epoch": 129.7712733856367, "grad_norm": 3.73404598236084, "learning_rate": 1.999740701760087e-05, "loss": 0.0391, "step": 107580 }, { "epoch": 129.7833433916717, "grad_norm": 4.661542892456055, "learning_rate": 1.9997406776346075e-05, "loss": 0.0386, "step": 107590 }, { "epoch": 129.7954133977067, "grad_norm": 3.9552383422851562, "learning_rate": 1.999740653509128e-05, "loss": 0.0395, "step": 107600 }, { "epoch": 129.8074834037417, "grad_norm": 3.9945762157440186, "learning_rate": 1.9997406293836487e-05, "loss": 0.0397, "step": 107610 }, { "epoch": 129.81955340977672, "grad_norm": 3.9479572772979736, "learning_rate": 1.9997406052581694e-05, "loss": 0.0384, "step": 107620 }, { "epoch": 129.83162341581172, "grad_norm": 4.162992477416992, "learning_rate": 1.99974058113269e-05, "loss": 0.0382, "step": 107630 }, { "epoch": 129.84369342184672, "grad_norm": 4.316402912139893, "learning_rate": 1.9997405570072106e-05, "loss": 0.0403, "step": 107640 }, { "epoch": 129.85576342788173, "grad_norm": 
4.072427272796631, "learning_rate": 1.9997405328817312e-05, "loss": 0.0393, "step": 107650 }, { "epoch": 129.86783343391673, "grad_norm": 4.118744850158691, "learning_rate": 1.999740508756252e-05, "loss": 0.0386, "step": 107660 }, { "epoch": 129.87990343995173, "grad_norm": 3.6508002281188965, "learning_rate": 1.9997404846307725e-05, "loss": 0.0393, "step": 107670 }, { "epoch": 129.89197344598674, "grad_norm": 3.975681781768799, "learning_rate": 1.999740460505293e-05, "loss": 0.0385, "step": 107680 }, { "epoch": 129.90404345202174, "grad_norm": 4.713472366333008, "learning_rate": 1.9997404363798137e-05, "loss": 0.0391, "step": 107690 }, { "epoch": 129.91611345805674, "grad_norm": 3.941526412963867, "learning_rate": 1.9997404122543343e-05, "loss": 0.0398, "step": 107700 }, { "epoch": 129.92818346409175, "grad_norm": 4.228131294250488, "learning_rate": 1.999740388128855e-05, "loss": 0.038, "step": 107710 }, { "epoch": 129.94025347012675, "grad_norm": 4.221652030944824, "learning_rate": 1.9997403640033756e-05, "loss": 0.0394, "step": 107720 }, { "epoch": 129.95232347616175, "grad_norm": 3.8783764839172363, "learning_rate": 1.9997403398778962e-05, "loss": 0.0388, "step": 107730 }, { "epoch": 129.96439348219675, "grad_norm": 4.486849784851074, "learning_rate": 1.9997403157524168e-05, "loss": 0.0386, "step": 107740 }, { "epoch": 129.97646348823176, "grad_norm": 4.267953395843506, "learning_rate": 1.9997402916269374e-05, "loss": 0.0398, "step": 107750 }, { "epoch": 129.98853349426676, "grad_norm": 4.202721118927002, "learning_rate": 1.9997402675014577e-05, "loss": 0.0396, "step": 107760 }, { "epoch": 130.0, "grad_norm": 7.609777927398682, "learning_rate": 1.9997402433759783e-05, "loss": 0.039, "step": 107770 }, { "epoch": 130.012070006035, "grad_norm": 2.982539653778076, "learning_rate": 1.999740219250499e-05, "loss": 0.0258, "step": 107780 }, { "epoch": 130.02414001207, "grad_norm": 3.176417589187622, "learning_rate": 1.9997401951250196e-05, "loss": 0.0296, "step": 
107790 }, { "epoch": 130.036210018105, "grad_norm": 3.443838596343994, "learning_rate": 1.9997401709995402e-05, "loss": 0.0263, "step": 107800 }, { "epoch": 130.04828002414, "grad_norm": 2.797913074493408, "learning_rate": 1.999740146874061e-05, "loss": 0.0294, "step": 107810 }, { "epoch": 130.06035003017502, "grad_norm": 3.402472734451294, "learning_rate": 1.9997401227485815e-05, "loss": 0.0283, "step": 107820 }, { "epoch": 130.07242003621002, "grad_norm": 3.4818596839904785, "learning_rate": 1.999740098623102e-05, "loss": 0.0298, "step": 107830 }, { "epoch": 130.08449004224502, "grad_norm": 3.1154701709747314, "learning_rate": 1.9997400744976227e-05, "loss": 0.0308, "step": 107840 }, { "epoch": 130.09656004828003, "grad_norm": 3.1439332962036133, "learning_rate": 1.9997400503721433e-05, "loss": 0.0318, "step": 107850 }, { "epoch": 130.10863005431503, "grad_norm": 3.7378127574920654, "learning_rate": 1.999740026246664e-05, "loss": 0.0321, "step": 107860 }, { "epoch": 130.12070006035003, "grad_norm": 3.8136777877807617, "learning_rate": 1.999740002121185e-05, "loss": 0.033, "step": 107870 }, { "epoch": 130.13277006638504, "grad_norm": 3.5278913974761963, "learning_rate": 1.9997399779957055e-05, "loss": 0.029, "step": 107880 }, { "epoch": 130.14484007242004, "grad_norm": 3.4084291458129883, "learning_rate": 1.999739953870226e-05, "loss": 0.0311, "step": 107890 }, { "epoch": 130.15691007845504, "grad_norm": 3.227289915084839, "learning_rate": 1.9997399297447468e-05, "loss": 0.0293, "step": 107900 }, { "epoch": 130.16898008449004, "grad_norm": 3.623901844024658, "learning_rate": 1.9997399056192674e-05, "loss": 0.0316, "step": 107910 }, { "epoch": 130.18105009052505, "grad_norm": 3.830101728439331, "learning_rate": 1.9997398814937877e-05, "loss": 0.0321, "step": 107920 }, { "epoch": 130.19312009656005, "grad_norm": 3.9785711765289307, "learning_rate": 1.9997398573683083e-05, "loss": 0.0315, "step": 107930 }, { "epoch": 130.20519010259505, "grad_norm": 3.28955078125, 
"learning_rate": 1.999739833242829e-05, "loss": 0.0331, "step": 107940 }, { "epoch": 130.21726010863006, "grad_norm": 3.4740211963653564, "learning_rate": 1.9997398091173495e-05, "loss": 0.0346, "step": 107950 }, { "epoch": 130.22933011466506, "grad_norm": 4.07211971282959, "learning_rate": 1.99973978499187e-05, "loss": 0.0342, "step": 107960 }, { "epoch": 130.24140012070006, "grad_norm": 3.578465700149536, "learning_rate": 1.9997397608663908e-05, "loss": 0.0324, "step": 107970 }, { "epoch": 130.25347012673507, "grad_norm": 3.529881238937378, "learning_rate": 1.9997397367409114e-05, "loss": 0.0335, "step": 107980 }, { "epoch": 130.26554013277007, "grad_norm": 3.672250270843506, "learning_rate": 1.999739712615432e-05, "loss": 0.0327, "step": 107990 }, { "epoch": 130.27761013880507, "grad_norm": 4.085530757904053, "learning_rate": 1.9997396884899526e-05, "loss": 0.0357, "step": 108000 }, { "epoch": 130.27761013880507, "eval_loss": 13.26317310333252, "eval_runtime": 8.2017, "eval_samples_per_second": 84.983, "eval_steps_per_second": 10.73, "step": 108000 }, { "epoch": 130.28968014484008, "grad_norm": 3.400355815887451, "learning_rate": 1.9997396643644733e-05, "loss": 0.0346, "step": 108010 }, { "epoch": 130.30175015087508, "grad_norm": 4.004819393157959, "learning_rate": 1.999739640238994e-05, "loss": 0.0355, "step": 108020 }, { "epoch": 130.31382015691008, "grad_norm": 3.8067550659179688, "learning_rate": 1.9997396161135145e-05, "loss": 0.033, "step": 108030 }, { "epoch": 130.3258901629451, "grad_norm": 3.647030830383301, "learning_rate": 1.999739591988035e-05, "loss": 0.0331, "step": 108040 }, { "epoch": 130.3379601689801, "grad_norm": 3.589888334274292, "learning_rate": 1.9997395678625557e-05, "loss": 0.0337, "step": 108050 }, { "epoch": 130.3500301750151, "grad_norm": 3.7472074031829834, "learning_rate": 1.9997395437370764e-05, "loss": 0.0324, "step": 108060 }, { "epoch": 130.3621001810501, "grad_norm": 3.8670551776885986, "learning_rate": 1.999739519611597e-05, 
"loss": 0.0349, "step": 108070 }, { "epoch": 130.3741701870851, "grad_norm": 3.8986644744873047, "learning_rate": 1.9997394954861176e-05, "loss": 0.0329, "step": 108080 }, { "epoch": 130.3862401931201, "grad_norm": 3.8147683143615723, "learning_rate": 1.9997394713606382e-05, "loss": 0.0335, "step": 108090 }, { "epoch": 130.3983101991551, "grad_norm": 3.507509231567383, "learning_rate": 1.999739447235159e-05, "loss": 0.0337, "step": 108100 }, { "epoch": 130.4103802051901, "grad_norm": 3.4985556602478027, "learning_rate": 1.9997394231096795e-05, "loss": 0.0333, "step": 108110 }, { "epoch": 130.4224502112251, "grad_norm": 3.584829568862915, "learning_rate": 1.9997393989842e-05, "loss": 0.0347, "step": 108120 }, { "epoch": 130.43452021726011, "grad_norm": 3.579745292663574, "learning_rate": 1.9997393748587207e-05, "loss": 0.0351, "step": 108130 }, { "epoch": 130.44659022329512, "grad_norm": 4.554984092712402, "learning_rate": 1.9997393507332413e-05, "loss": 0.0377, "step": 108140 }, { "epoch": 130.45866022933012, "grad_norm": 3.7968926429748535, "learning_rate": 1.999739326607762e-05, "loss": 0.0343, "step": 108150 }, { "epoch": 130.47073023536512, "grad_norm": 3.9629416465759277, "learning_rate": 1.9997393024822826e-05, "loss": 0.0343, "step": 108160 }, { "epoch": 130.48280024140013, "grad_norm": 3.671898126602173, "learning_rate": 1.999739278356803e-05, "loss": 0.0359, "step": 108170 }, { "epoch": 130.49487024743513, "grad_norm": 3.747340440750122, "learning_rate": 1.9997392542313235e-05, "loss": 0.0356, "step": 108180 }, { "epoch": 130.50694025347013, "grad_norm": 3.996915102005005, "learning_rate": 1.999739230105844e-05, "loss": 0.0348, "step": 108190 }, { "epoch": 130.51901025950514, "grad_norm": 3.4382498264312744, "learning_rate": 1.9997392059803647e-05, "loss": 0.036, "step": 108200 }, { "epoch": 130.53108026554014, "grad_norm": 4.001558303833008, "learning_rate": 1.9997391818548854e-05, "loss": 0.0352, "step": 108210 }, { "epoch": 130.54315027157514, 
"grad_norm": 3.9629647731781006, "learning_rate": 1.999739157729406e-05, "loss": 0.0383, "step": 108220 }, { "epoch": 130.55522027761015, "grad_norm": 3.5993564128875732, "learning_rate": 1.9997391336039266e-05, "loss": 0.0371, "step": 108230 }, { "epoch": 130.56729028364515, "grad_norm": 3.6505486965179443, "learning_rate": 1.9997391094784472e-05, "loss": 0.0347, "step": 108240 }, { "epoch": 130.57936028968015, "grad_norm": 3.716597080230713, "learning_rate": 1.999739085352968e-05, "loss": 0.0367, "step": 108250 }, { "epoch": 130.59143029571516, "grad_norm": 4.339423656463623, "learning_rate": 1.9997390612274885e-05, "loss": 0.0368, "step": 108260 }, { "epoch": 130.60350030175016, "grad_norm": 4.244614601135254, "learning_rate": 1.999739037102009e-05, "loss": 0.0367, "step": 108270 }, { "epoch": 130.61557030778516, "grad_norm": 3.736776113510132, "learning_rate": 1.9997390129765297e-05, "loss": 0.0394, "step": 108280 }, { "epoch": 130.62764031382017, "grad_norm": 3.8346409797668457, "learning_rate": 1.9997389888510503e-05, "loss": 0.0351, "step": 108290 }, { "epoch": 130.63971031985517, "grad_norm": 3.835477590560913, "learning_rate": 1.999738964725571e-05, "loss": 0.0378, "step": 108300 }, { "epoch": 130.65178032589017, "grad_norm": 4.057408332824707, "learning_rate": 1.9997389406000916e-05, "loss": 0.038, "step": 108310 }, { "epoch": 130.66385033192518, "grad_norm": 3.804732322692871, "learning_rate": 1.9997389164746122e-05, "loss": 0.0374, "step": 108320 }, { "epoch": 130.67592033796018, "grad_norm": 3.8522374629974365, "learning_rate": 1.9997388923491328e-05, "loss": 0.0362, "step": 108330 }, { "epoch": 130.68799034399518, "grad_norm": 4.177990436553955, "learning_rate": 1.9997388682236534e-05, "loss": 0.0369, "step": 108340 }, { "epoch": 130.70006035003018, "grad_norm": 3.9673609733581543, "learning_rate": 1.999738844098174e-05, "loss": 0.0362, "step": 108350 }, { "epoch": 130.7121303560652, "grad_norm": 3.9087653160095215, "learning_rate": 
1.9997388199726947e-05, "loss": 0.0378, "step": 108360 }, { "epoch": 130.7242003621002, "grad_norm": 3.8522350788116455, "learning_rate": 1.9997387958472153e-05, "loss": 0.039, "step": 108370 }, { "epoch": 130.7362703681352, "grad_norm": 3.803393840789795, "learning_rate": 1.999738771721736e-05, "loss": 0.0378, "step": 108380 }, { "epoch": 130.7483403741702, "grad_norm": 3.6629114151000977, "learning_rate": 1.9997387475962565e-05, "loss": 0.038, "step": 108390 }, { "epoch": 130.7604103802052, "grad_norm": 3.637873649597168, "learning_rate": 1.999738723470777e-05, "loss": 0.0383, "step": 108400 }, { "epoch": 130.7724803862402, "grad_norm": 3.8497183322906494, "learning_rate": 1.9997386993452978e-05, "loss": 0.0374, "step": 108410 }, { "epoch": 130.7845503922752, "grad_norm": 3.633521795272827, "learning_rate": 1.9997386752198184e-05, "loss": 0.0376, "step": 108420 }, { "epoch": 130.7966203983102, "grad_norm": 4.18262243270874, "learning_rate": 1.999738651094339e-05, "loss": 0.0389, "step": 108430 }, { "epoch": 130.8086904043452, "grad_norm": 4.103598117828369, "learning_rate": 1.9997386269688597e-05, "loss": 0.0371, "step": 108440 }, { "epoch": 130.82076041038022, "grad_norm": 4.064061641693115, "learning_rate": 1.9997386028433803e-05, "loss": 0.0403, "step": 108450 }, { "epoch": 130.83283041641522, "grad_norm": 4.019898414611816, "learning_rate": 1.999738578717901e-05, "loss": 0.038, "step": 108460 }, { "epoch": 130.84490042245022, "grad_norm": 4.236594200134277, "learning_rate": 1.9997385545924215e-05, "loss": 0.0381, "step": 108470 }, { "epoch": 130.85697042848523, "grad_norm": 4.087411880493164, "learning_rate": 1.999738530466942e-05, "loss": 0.0378, "step": 108480 }, { "epoch": 130.86904043452023, "grad_norm": 3.917309284210205, "learning_rate": 1.9997385063414628e-05, "loss": 0.038, "step": 108490 }, { "epoch": 130.88111044055523, "grad_norm": 3.9263784885406494, "learning_rate": 1.9997384822159834e-05, "loss": 0.0385, "step": 108500 }, { "epoch": 
130.88111044055523, "eval_loss": 13.258132934570312, "eval_runtime": 8.2113, "eval_samples_per_second": 84.883, "eval_steps_per_second": 10.717, "step": 108500 }, { "epoch": 130.89318044659024, "grad_norm": 3.897325038909912, "learning_rate": 1.999738458090504e-05, "loss": 0.0376, "step": 108510 }, { "epoch": 130.90525045262524, "grad_norm": 4.685945987701416, "learning_rate": 1.9997384339650246e-05, "loss": 0.0399, "step": 108520 }, { "epoch": 130.91732045866024, "grad_norm": 3.872180461883545, "learning_rate": 1.9997384098395452e-05, "loss": 0.0379, "step": 108530 }, { "epoch": 130.92939046469525, "grad_norm": 4.1012959480285645, "learning_rate": 1.999738385714066e-05, "loss": 0.0398, "step": 108540 }, { "epoch": 130.94146047073025, "grad_norm": 4.337082386016846, "learning_rate": 1.9997383615885865e-05, "loss": 0.0396, "step": 108550 }, { "epoch": 130.95353047676525, "grad_norm": 3.868129253387451, "learning_rate": 1.999738337463107e-05, "loss": 0.0383, "step": 108560 }, { "epoch": 130.96560048280026, "grad_norm": 3.998293161392212, "learning_rate": 1.9997383133376277e-05, "loss": 0.0401, "step": 108570 }, { "epoch": 130.97767048883526, "grad_norm": 4.012851715087891, "learning_rate": 1.9997382892121484e-05, "loss": 0.0402, "step": 108580 }, { "epoch": 130.98974049487026, "grad_norm": 4.33420467376709, "learning_rate": 1.9997382650866686e-05, "loss": 0.039, "step": 108590 }, { "epoch": 131.0012070006035, "grad_norm": 3.276201009750366, "learning_rate": 1.9997382409611893e-05, "loss": 0.0379, "step": 108600 }, { "epoch": 131.0132770066385, "grad_norm": 3.3640246391296387, "learning_rate": 1.99973821683571e-05, "loss": 0.0247, "step": 108610 }, { "epoch": 131.0253470126735, "grad_norm": 3.4750680923461914, "learning_rate": 1.9997381927102305e-05, "loss": 0.028, "step": 108620 }, { "epoch": 131.0374170187085, "grad_norm": 3.3243026733398438, "learning_rate": 1.999738168584751e-05, "loss": 0.0274, "step": 108630 }, { "epoch": 131.0494870247435, "grad_norm": 
3.467484474182129, "learning_rate": 1.9997381444592717e-05, "loss": 0.0252, "step": 108640 }, { "epoch": 131.06155703077852, "grad_norm": 3.204939365386963, "learning_rate": 1.9997381203337924e-05, "loss": 0.03, "step": 108650 }, { "epoch": 131.07362703681352, "grad_norm": 3.3889219760894775, "learning_rate": 1.999738096208313e-05, "loss": 0.0295, "step": 108660 }, { "epoch": 131.08569704284852, "grad_norm": 3.5844686031341553, "learning_rate": 1.9997380720828336e-05, "loss": 0.0287, "step": 108670 }, { "epoch": 131.09776704888353, "grad_norm": 3.4173829555511475, "learning_rate": 1.9997380479573542e-05, "loss": 0.0314, "step": 108680 }, { "epoch": 131.10983705491853, "grad_norm": 3.4364426136016846, "learning_rate": 1.999738023831875e-05, "loss": 0.0309, "step": 108690 }, { "epoch": 131.12190706095353, "grad_norm": 4.34568452835083, "learning_rate": 1.9997379997063955e-05, "loss": 0.0319, "step": 108700 }, { "epoch": 131.13397706698854, "grad_norm": 3.582200288772583, "learning_rate": 1.999737975580916e-05, "loss": 0.0301, "step": 108710 }, { "epoch": 131.14604707302354, "grad_norm": 3.739755630493164, "learning_rate": 1.9997379514554367e-05, "loss": 0.0312, "step": 108720 }, { "epoch": 131.15811707905854, "grad_norm": 3.442918062210083, "learning_rate": 1.9997379273299573e-05, "loss": 0.0302, "step": 108730 }, { "epoch": 131.17018708509354, "grad_norm": 3.2498719692230225, "learning_rate": 1.999737903204478e-05, "loss": 0.0313, "step": 108740 }, { "epoch": 131.18225709112855, "grad_norm": 3.545823335647583, "learning_rate": 1.9997378790789986e-05, "loss": 0.0309, "step": 108750 }, { "epoch": 131.19432709716355, "grad_norm": 3.361091375350952, "learning_rate": 1.9997378549535192e-05, "loss": 0.0299, "step": 108760 }, { "epoch": 131.20639710319855, "grad_norm": 3.7895100116729736, "learning_rate": 1.9997378308280398e-05, "loss": 0.0317, "step": 108770 }, { "epoch": 131.21846710923356, "grad_norm": 3.556504487991333, "learning_rate": 1.9997378067025604e-05, "loss": 
0.0322, "step": 108780 }, { "epoch": 131.23053711526856, "grad_norm": 3.771742105484009, "learning_rate": 1.999737782577081e-05, "loss": 0.0338, "step": 108790 }, { "epoch": 131.24260712130356, "grad_norm": 3.5860495567321777, "learning_rate": 1.9997377584516017e-05, "loss": 0.0319, "step": 108800 }, { "epoch": 131.25467712733857, "grad_norm": 3.6079320907592773, "learning_rate": 1.9997377343261223e-05, "loss": 0.0329, "step": 108810 }, { "epoch": 131.26674713337357, "grad_norm": 3.544574737548828, "learning_rate": 1.999737710200643e-05, "loss": 0.0328, "step": 108820 }, { "epoch": 131.27881713940857, "grad_norm": 3.5877065658569336, "learning_rate": 1.9997376860751636e-05, "loss": 0.0338, "step": 108830 }, { "epoch": 131.29088714544358, "grad_norm": 3.7047626972198486, "learning_rate": 1.999737661949684e-05, "loss": 0.033, "step": 108840 }, { "epoch": 131.30295715147858, "grad_norm": 3.9753074645996094, "learning_rate": 1.9997376378242045e-05, "loss": 0.034, "step": 108850 }, { "epoch": 131.31502715751358, "grad_norm": 3.59648060798645, "learning_rate": 1.999737613698725e-05, "loss": 0.0337, "step": 108860 }, { "epoch": 131.3270971635486, "grad_norm": 3.5719897747039795, "learning_rate": 1.9997375895732457e-05, "loss": 0.0344, "step": 108870 }, { "epoch": 131.3391671695836, "grad_norm": 3.854295492172241, "learning_rate": 1.9997375654477663e-05, "loss": 0.0346, "step": 108880 }, { "epoch": 131.3512371756186, "grad_norm": 3.553936719894409, "learning_rate": 1.999737541322287e-05, "loss": 0.0338, "step": 108890 }, { "epoch": 131.3633071816536, "grad_norm": 3.8107001781463623, "learning_rate": 1.9997375171968076e-05, "loss": 0.0338, "step": 108900 }, { "epoch": 131.3753771876886, "grad_norm": 3.710285186767578, "learning_rate": 1.9997374930713282e-05, "loss": 0.0349, "step": 108910 }, { "epoch": 131.3874471937236, "grad_norm": 3.5956692695617676, "learning_rate": 1.9997374689458488e-05, "loss": 0.034, "step": 108920 }, { "epoch": 131.3995171997586, "grad_norm": 
3.8183021545410156, "learning_rate": 1.9997374448203694e-05, "loss": 0.0354, "step": 108930 }, { "epoch": 131.4115872057936, "grad_norm": 3.6761083602905273, "learning_rate": 1.99973742069489e-05, "loss": 0.0339, "step": 108940 }, { "epoch": 131.4236572118286, "grad_norm": 3.195909023284912, "learning_rate": 1.999737396569411e-05, "loss": 0.0347, "step": 108950 }, { "epoch": 131.43572721786362, "grad_norm": 3.7108161449432373, "learning_rate": 1.9997373724439316e-05, "loss": 0.0346, "step": 108960 }, { "epoch": 131.44779722389862, "grad_norm": 4.133672714233398, "learning_rate": 1.9997373483184523e-05, "loss": 0.0355, "step": 108970 }, { "epoch": 131.45986722993362, "grad_norm": 3.9275333881378174, "learning_rate": 1.999737324192973e-05, "loss": 0.0348, "step": 108980 }, { "epoch": 131.47193723596862, "grad_norm": 3.432812452316284, "learning_rate": 1.9997373000674935e-05, "loss": 0.0351, "step": 108990 }, { "epoch": 131.48400724200363, "grad_norm": 3.4484829902648926, "learning_rate": 1.9997372759420138e-05, "loss": 0.0336, "step": 109000 }, { "epoch": 131.48400724200363, "eval_loss": 13.273506164550781, "eval_runtime": 8.1868, "eval_samples_per_second": 85.137, "eval_steps_per_second": 10.749, "step": 109000 }, { "epoch": 131.49607724803863, "grad_norm": 3.384606122970581, "learning_rate": 1.9997372518165344e-05, "loss": 0.0342, "step": 109010 }, { "epoch": 131.50814725407363, "grad_norm": 4.099756717681885, "learning_rate": 1.999737227691055e-05, "loss": 0.0339, "step": 109020 }, { "epoch": 131.52021726010864, "grad_norm": 3.6966679096221924, "learning_rate": 1.9997372035655756e-05, "loss": 0.036, "step": 109030 }, { "epoch": 131.53228726614364, "grad_norm": 4.269016265869141, "learning_rate": 1.9997371794400963e-05, "loss": 0.0371, "step": 109040 }, { "epoch": 131.54435727217864, "grad_norm": 3.9228458404541016, "learning_rate": 1.999737155314617e-05, "loss": 0.0372, "step": 109050 }, { "epoch": 131.55642727821365, "grad_norm": 3.5916318893432617, 
"learning_rate": 1.9997371311891375e-05, "loss": 0.0361, "step": 109060 }, { "epoch": 131.56849728424865, "grad_norm": 3.7968990802764893, "learning_rate": 1.999737107063658e-05, "loss": 0.038, "step": 109070 }, { "epoch": 131.58056729028365, "grad_norm": 3.5696918964385986, "learning_rate": 1.9997370829381788e-05, "loss": 0.0344, "step": 109080 }, { "epoch": 131.59263729631866, "grad_norm": 4.0808844566345215, "learning_rate": 1.9997370588126994e-05, "loss": 0.0353, "step": 109090 }, { "epoch": 131.60470730235366, "grad_norm": 4.839911460876465, "learning_rate": 1.99973703468722e-05, "loss": 0.0386, "step": 109100 }, { "epoch": 131.61677730838866, "grad_norm": 3.957369565963745, "learning_rate": 1.9997370105617406e-05, "loss": 0.0378, "step": 109110 }, { "epoch": 131.62884731442367, "grad_norm": 3.9610989093780518, "learning_rate": 1.9997369864362612e-05, "loss": 0.0375, "step": 109120 }, { "epoch": 131.64091732045867, "grad_norm": 3.9394583702087402, "learning_rate": 1.999736962310782e-05, "loss": 0.0366, "step": 109130 }, { "epoch": 131.65298732649367, "grad_norm": 3.9396512508392334, "learning_rate": 1.9997369381853025e-05, "loss": 0.0373, "step": 109140 }, { "epoch": 131.66505733252868, "grad_norm": 3.9699559211730957, "learning_rate": 1.999736914059823e-05, "loss": 0.0367, "step": 109150 }, { "epoch": 131.67712733856368, "grad_norm": 3.8867475986480713, "learning_rate": 1.9997368899343437e-05, "loss": 0.0377, "step": 109160 }, { "epoch": 131.68919734459868, "grad_norm": 3.9247069358825684, "learning_rate": 1.9997368658088643e-05, "loss": 0.0339, "step": 109170 }, { "epoch": 131.70126735063369, "grad_norm": 3.647956132888794, "learning_rate": 1.999736841683385e-05, "loss": 0.0378, "step": 109180 }, { "epoch": 131.7133373566687, "grad_norm": 4.035648345947266, "learning_rate": 1.9997368175579056e-05, "loss": 0.0374, "step": 109190 }, { "epoch": 131.7254073627037, "grad_norm": 4.077027797698975, "learning_rate": 1.9997367934324262e-05, "loss": 0.0402, "step": 
109200 }, { "epoch": 131.7374773687387, "grad_norm": 3.491787910461426, "learning_rate": 1.999736769306947e-05, "loss": 0.0375, "step": 109210 }, { "epoch": 131.7495473747737, "grad_norm": 3.974942445755005, "learning_rate": 1.9997367451814675e-05, "loss": 0.0372, "step": 109220 }, { "epoch": 131.7616173808087, "grad_norm": 3.830821990966797, "learning_rate": 1.999736721055988e-05, "loss": 0.0384, "step": 109230 }, { "epoch": 131.7736873868437, "grad_norm": 3.7187981605529785, "learning_rate": 1.9997366969305087e-05, "loss": 0.0376, "step": 109240 }, { "epoch": 131.7857573928787, "grad_norm": 3.7685866355895996, "learning_rate": 1.999736672805029e-05, "loss": 0.039, "step": 109250 }, { "epoch": 131.7978273989137, "grad_norm": 4.146918296813965, "learning_rate": 1.9997366486795496e-05, "loss": 0.0389, "step": 109260 }, { "epoch": 131.8098974049487, "grad_norm": 4.103612899780273, "learning_rate": 1.9997366245540702e-05, "loss": 0.0369, "step": 109270 }, { "epoch": 131.82196741098372, "grad_norm": 4.35811185836792, "learning_rate": 1.999736600428591e-05, "loss": 0.0388, "step": 109280 }, { "epoch": 131.83403741701872, "grad_norm": 3.603442907333374, "learning_rate": 1.9997365763031115e-05, "loss": 0.0379, "step": 109290 }, { "epoch": 131.84610742305372, "grad_norm": 3.891677141189575, "learning_rate": 1.999736552177632e-05, "loss": 0.0387, "step": 109300 }, { "epoch": 131.85817742908873, "grad_norm": 4.338212490081787, "learning_rate": 1.9997365280521527e-05, "loss": 0.0384, "step": 109310 }, { "epoch": 131.87024743512373, "grad_norm": 3.894554376602173, "learning_rate": 1.9997365039266733e-05, "loss": 0.0377, "step": 109320 }, { "epoch": 131.88231744115873, "grad_norm": 3.9565398693084717, "learning_rate": 1.999736479801194e-05, "loss": 0.0386, "step": 109330 }, { "epoch": 131.89438744719374, "grad_norm": 3.7814247608184814, "learning_rate": 1.9997364556757146e-05, "loss": 0.0406, "step": 109340 }, { "epoch": 131.90645745322874, "grad_norm": 4.2023725509643555, 
"learning_rate": 1.9997364315502352e-05, "loss": 0.0382, "step": 109350 }, { "epoch": 131.91852745926374, "grad_norm": 4.0407795906066895, "learning_rate": 1.9997364074247558e-05, "loss": 0.0384, "step": 109360 }, { "epoch": 131.93059746529875, "grad_norm": 3.7852394580841064, "learning_rate": 1.9997363832992764e-05, "loss": 0.0394, "step": 109370 }, { "epoch": 131.94266747133375, "grad_norm": 4.0166239738464355, "learning_rate": 1.999736359173797e-05, "loss": 0.0377, "step": 109380 }, { "epoch": 131.95473747736875, "grad_norm": 3.514252185821533, "learning_rate": 1.9997363350483177e-05, "loss": 0.0396, "step": 109390 }, { "epoch": 131.96680748340376, "grad_norm": 4.231459617614746, "learning_rate": 1.9997363109228383e-05, "loss": 0.0398, "step": 109400 }, { "epoch": 131.97887748943876, "grad_norm": 4.3404765129089355, "learning_rate": 1.999736286797359e-05, "loss": 0.039, "step": 109410 }, { "epoch": 131.99094749547376, "grad_norm": 3.861656904220581, "learning_rate": 1.9997362626718795e-05, "loss": 0.0388, "step": 109420 }, { "epoch": 132.002414001207, "grad_norm": 3.2879738807678223, "learning_rate": 1.9997362385464e-05, "loss": 0.035, "step": 109430 }, { "epoch": 132.014484007242, "grad_norm": 3.0326650142669678, "learning_rate": 1.9997362144209208e-05, "loss": 0.0279, "step": 109440 }, { "epoch": 132.026554013277, "grad_norm": 3.198637008666992, "learning_rate": 1.9997361902954414e-05, "loss": 0.0269, "step": 109450 }, { "epoch": 132.038624019312, "grad_norm": 3.2320408821105957, "learning_rate": 1.999736166169962e-05, "loss": 0.0283, "step": 109460 }, { "epoch": 132.050694025347, "grad_norm": 3.135470151901245, "learning_rate": 1.9997361420444827e-05, "loss": 0.0281, "step": 109470 }, { "epoch": 132.06276403138202, "grad_norm": 3.213348150253296, "learning_rate": 1.9997361179190033e-05, "loss": 0.0269, "step": 109480 }, { "epoch": 132.07483403741702, "grad_norm": 3.062178373336792, "learning_rate": 1.999736093793524e-05, "loss": 0.0298, "step": 109490 }, { 
"epoch": 132.08690404345202, "grad_norm": 3.5205883979797363, "learning_rate": 1.9997360696680445e-05, "loss": 0.0298, "step": 109500 }, { "epoch": 132.08690404345202, "eval_loss": 13.264662742614746, "eval_runtime": 8.1741, "eval_samples_per_second": 85.269, "eval_steps_per_second": 10.766, "step": 109500 }, { "epoch": 132.09897404948703, "grad_norm": 3.9262948036193848, "learning_rate": 1.999736045542565e-05, "loss": 0.029, "step": 109510 }, { "epoch": 132.11104405552203, "grad_norm": 3.3077311515808105, "learning_rate": 1.9997360214170858e-05, "loss": 0.0306, "step": 109520 }, { "epoch": 132.12311406155703, "grad_norm": 3.502394437789917, "learning_rate": 1.9997359972916064e-05, "loss": 0.0299, "step": 109530 }, { "epoch": 132.13518406759204, "grad_norm": 3.783214807510376, "learning_rate": 1.999735973166127e-05, "loss": 0.0318, "step": 109540 }, { "epoch": 132.14725407362704, "grad_norm": 3.719148874282837, "learning_rate": 1.9997359490406476e-05, "loss": 0.031, "step": 109550 }, { "epoch": 132.15932407966204, "grad_norm": 3.320500135421753, "learning_rate": 1.9997359249151682e-05, "loss": 0.0316, "step": 109560 }, { "epoch": 132.17139408569705, "grad_norm": 3.4958431720733643, "learning_rate": 1.999735900789689e-05, "loss": 0.0304, "step": 109570 }, { "epoch": 132.18346409173205, "grad_norm": 4.049909591674805, "learning_rate": 1.9997358766642095e-05, "loss": 0.0333, "step": 109580 }, { "epoch": 132.19553409776705, "grad_norm": 3.6698243618011475, "learning_rate": 1.99973585253873e-05, "loss": 0.0329, "step": 109590 }, { "epoch": 132.20760410380205, "grad_norm": 3.79453182220459, "learning_rate": 1.9997358284132507e-05, "loss": 0.0328, "step": 109600 }, { "epoch": 132.21967410983706, "grad_norm": 3.5471067428588867, "learning_rate": 1.9997358042877714e-05, "loss": 0.0294, "step": 109610 }, { "epoch": 132.23174411587206, "grad_norm": 3.344979763031006, "learning_rate": 1.999735780162292e-05, "loss": 0.0314, "step": 109620 }, { "epoch": 132.24381412190706, 
"grad_norm": 3.5979135036468506, "learning_rate": 1.9997357560368126e-05, "loss": 0.0316, "step": 109630 }, { "epoch": 132.25588412794207, "grad_norm": 4.215336322784424, "learning_rate": 1.9997357319113332e-05, "loss": 0.0328, "step": 109640 }, { "epoch": 132.26795413397707, "grad_norm": 3.944526433944702, "learning_rate": 1.999735707785854e-05, "loss": 0.0331, "step": 109650 }, { "epoch": 132.28002414001207, "grad_norm": 3.487309217453003, "learning_rate": 1.9997356836603745e-05, "loss": 0.0328, "step": 109660 }, { "epoch": 132.29209414604708, "grad_norm": 4.050907135009766, "learning_rate": 1.9997356595348947e-05, "loss": 0.0342, "step": 109670 }, { "epoch": 132.30416415208208, "grad_norm": 3.9049007892608643, "learning_rate": 1.9997356354094154e-05, "loss": 0.0327, "step": 109680 }, { "epoch": 132.31623415811708, "grad_norm": 3.938476085662842, "learning_rate": 1.999735611283936e-05, "loss": 0.033, "step": 109690 }, { "epoch": 132.3283041641521, "grad_norm": 3.434608221054077, "learning_rate": 1.9997355871584566e-05, "loss": 0.0326, "step": 109700 }, { "epoch": 132.3403741701871, "grad_norm": 3.4020462036132812, "learning_rate": 1.9997355630329772e-05, "loss": 0.0334, "step": 109710 }, { "epoch": 132.3524441762221, "grad_norm": 3.591014862060547, "learning_rate": 1.999735538907498e-05, "loss": 0.033, "step": 109720 }, { "epoch": 132.3645141822571, "grad_norm": 3.4961533546447754, "learning_rate": 1.9997355147820185e-05, "loss": 0.0317, "step": 109730 }, { "epoch": 132.3765841882921, "grad_norm": 4.06326150894165, "learning_rate": 1.999735490656539e-05, "loss": 0.0334, "step": 109740 }, { "epoch": 132.3886541943271, "grad_norm": 3.763849973678589, "learning_rate": 1.9997354665310597e-05, "loss": 0.0339, "step": 109750 }, { "epoch": 132.4007242003621, "grad_norm": 3.7801926136016846, "learning_rate": 1.9997354424055803e-05, "loss": 0.0356, "step": 109760 }, { "epoch": 132.4127942063971, "grad_norm": 3.82584547996521, "learning_rate": 1.999735418280101e-05, 
"loss": 0.0355, "step": 109770 }, { "epoch": 132.4248642124321, "grad_norm": 3.874943494796753, "learning_rate": 1.9997353941546216e-05, "loss": 0.0333, "step": 109780 }, { "epoch": 132.43693421846712, "grad_norm": 4.207726001739502, "learning_rate": 1.9997353700291422e-05, "loss": 0.0359, "step": 109790 }, { "epoch": 132.44900422450212, "grad_norm": 3.691091299057007, "learning_rate": 1.9997353459036628e-05, "loss": 0.035, "step": 109800 }, { "epoch": 132.46107423053712, "grad_norm": 3.7592155933380127, "learning_rate": 1.9997353217781834e-05, "loss": 0.037, "step": 109810 }, { "epoch": 132.47314423657213, "grad_norm": 3.642047643661499, "learning_rate": 1.999735297652704e-05, "loss": 0.0339, "step": 109820 }, { "epoch": 132.48521424260713, "grad_norm": 3.8270976543426514, "learning_rate": 1.9997352735272247e-05, "loss": 0.034, "step": 109830 }, { "epoch": 132.49728424864213, "grad_norm": 3.432359457015991, "learning_rate": 1.9997352494017453e-05, "loss": 0.0354, "step": 109840 }, { "epoch": 132.50935425467713, "grad_norm": 3.9810853004455566, "learning_rate": 1.999735225276266e-05, "loss": 0.0353, "step": 109850 }, { "epoch": 132.52142426071214, "grad_norm": 3.576040744781494, "learning_rate": 1.9997352011507866e-05, "loss": 0.0353, "step": 109860 }, { "epoch": 132.53349426674714, "grad_norm": 3.93225359916687, "learning_rate": 1.9997351770253072e-05, "loss": 0.0354, "step": 109870 }, { "epoch": 132.54556427278214, "grad_norm": 4.065990924835205, "learning_rate": 1.9997351528998278e-05, "loss": 0.0374, "step": 109880 }, { "epoch": 132.55763427881715, "grad_norm": 3.3732147216796875, "learning_rate": 1.9997351287743484e-05, "loss": 0.0358, "step": 109890 }, { "epoch": 132.56970428485215, "grad_norm": 3.759777069091797, "learning_rate": 1.999735104648869e-05, "loss": 0.036, "step": 109900 }, { "epoch": 132.58177429088715, "grad_norm": 3.5673186779022217, "learning_rate": 1.9997350805233897e-05, "loss": 0.0363, "step": 109910 }, { "epoch": 132.59384429692216, 
"grad_norm": 3.5837206840515137, "learning_rate": 1.99973505639791e-05, "loss": 0.0361, "step": 109920 }, { "epoch": 132.60591430295716, "grad_norm": 3.779291868209839, "learning_rate": 1.9997350322724306e-05, "loss": 0.0372, "step": 109930 }, { "epoch": 132.61798430899216, "grad_norm": 3.9857470989227295, "learning_rate": 1.9997350081469512e-05, "loss": 0.0359, "step": 109940 }, { "epoch": 132.63005431502717, "grad_norm": 3.992445707321167, "learning_rate": 1.9997349840214718e-05, "loss": 0.0374, "step": 109950 }, { "epoch": 132.64212432106217, "grad_norm": 4.1102471351623535, "learning_rate": 1.9997349598959924e-05, "loss": 0.0355, "step": 109960 }, { "epoch": 132.65419432709717, "grad_norm": 3.9233591556549072, "learning_rate": 1.999734935770513e-05, "loss": 0.0358, "step": 109970 }, { "epoch": 132.66626433313218, "grad_norm": 3.5233895778656006, "learning_rate": 1.9997349116450337e-05, "loss": 0.0359, "step": 109980 }, { "epoch": 132.67833433916718, "grad_norm": 4.664888858795166, "learning_rate": 1.9997348875195543e-05, "loss": 0.0392, "step": 109990 }, { "epoch": 132.69040434520218, "grad_norm": 3.8901586532592773, "learning_rate": 1.999734863394075e-05, "loss": 0.0379, "step": 110000 }, { "epoch": 132.69040434520218, "eval_loss": 13.285630226135254, "eval_runtime": 8.1832, "eval_samples_per_second": 85.175, "eval_steps_per_second": 10.754, "step": 110000 }, { "epoch": 132.70247435123719, "grad_norm": 3.9432876110076904, "learning_rate": 1.9997348392685955e-05, "loss": 0.0367, "step": 110010 }, { "epoch": 132.7145443572722, "grad_norm": 3.8472707271575928, "learning_rate": 1.999734815143116e-05, "loss": 0.0379, "step": 110020 }, { "epoch": 132.7266143633072, "grad_norm": 3.9809796810150146, "learning_rate": 1.999734791017637e-05, "loss": 0.0357, "step": 110030 }, { "epoch": 132.7386843693422, "grad_norm": 3.817535400390625, "learning_rate": 1.9997347668921577e-05, "loss": 0.0364, "step": 110040 }, { "epoch": 132.7507543753772, "grad_norm": 3.917285919189453, 
"learning_rate": 1.9997347427666784e-05, "loss": 0.0383, "step": 110050 }, { "epoch": 132.7628243814122, "grad_norm": 4.544585227966309, "learning_rate": 1.999734718641199e-05, "loss": 0.0377, "step": 110060 }, { "epoch": 132.7748943874472, "grad_norm": 3.9584288597106934, "learning_rate": 1.9997346945157196e-05, "loss": 0.0364, "step": 110070 }, { "epoch": 132.7869643934822, "grad_norm": 3.8023810386657715, "learning_rate": 1.99973467039024e-05, "loss": 0.0378, "step": 110080 }, { "epoch": 132.7990343995172, "grad_norm": 4.039845943450928, "learning_rate": 1.9997346462647605e-05, "loss": 0.0356, "step": 110090 }, { "epoch": 132.81110440555221, "grad_norm": 4.021655082702637, "learning_rate": 1.999734622139281e-05, "loss": 0.0362, "step": 110100 }, { "epoch": 132.82317441158722, "grad_norm": 3.549312114715576, "learning_rate": 1.9997345980138018e-05, "loss": 0.0388, "step": 110110 }, { "epoch": 132.83524441762222, "grad_norm": 3.659240961074829, "learning_rate": 1.9997345738883224e-05, "loss": 0.0384, "step": 110120 }, { "epoch": 132.84731442365722, "grad_norm": 3.825103998184204, "learning_rate": 1.999734549762843e-05, "loss": 0.0387, "step": 110130 }, { "epoch": 132.85938442969223, "grad_norm": 3.8717129230499268, "learning_rate": 1.9997345256373636e-05, "loss": 0.038, "step": 110140 }, { "epoch": 132.87145443572723, "grad_norm": 3.7235710620880127, "learning_rate": 1.9997345015118842e-05, "loss": 0.0369, "step": 110150 }, { "epoch": 132.88352444176223, "grad_norm": 4.166902542114258, "learning_rate": 1.999734477386405e-05, "loss": 0.0375, "step": 110160 }, { "epoch": 132.89559444779724, "grad_norm": 3.80621600151062, "learning_rate": 1.9997344532609255e-05, "loss": 0.04, "step": 110170 }, { "epoch": 132.90766445383224, "grad_norm": 4.271759033203125, "learning_rate": 1.999734429135446e-05, "loss": 0.0389, "step": 110180 }, { "epoch": 132.91973445986724, "grad_norm": 3.7986183166503906, "learning_rate": 1.9997344050099667e-05, "loss": 0.0372, "step": 110190 }, { 
"epoch": 132.93180446590225, "grad_norm": 3.742349863052368, "learning_rate": 1.9997343808844873e-05, "loss": 0.0377, "step": 110200 }, { "epoch": 132.94387447193725, "grad_norm": 3.4748380184173584, "learning_rate": 1.999734356759008e-05, "loss": 0.0372, "step": 110210 }, { "epoch": 132.95594447797225, "grad_norm": 3.7461276054382324, "learning_rate": 1.9997343326335286e-05, "loss": 0.038, "step": 110220 }, { "epoch": 132.96801448400726, "grad_norm": 3.562201738357544, "learning_rate": 1.9997343085080492e-05, "loss": 0.0386, "step": 110230 }, { "epoch": 132.98008449004226, "grad_norm": 4.1073808670043945, "learning_rate": 1.99973428438257e-05, "loss": 0.041, "step": 110240 }, { "epoch": 132.99215449607726, "grad_norm": 3.9185633659362793, "learning_rate": 1.9997342602570905e-05, "loss": 0.0396, "step": 110250 }, { "epoch": 133.0036210018105, "grad_norm": 3.044025421142578, "learning_rate": 1.999734236131611e-05, "loss": 0.0353, "step": 110260 }, { "epoch": 133.0156910078455, "grad_norm": 3.328104019165039, "learning_rate": 1.9997342120061317e-05, "loss": 0.0246, "step": 110270 }, { "epoch": 133.0277610138805, "grad_norm": 3.5110549926757812, "learning_rate": 1.9997341878806523e-05, "loss": 0.0286, "step": 110280 }, { "epoch": 133.0398310199155, "grad_norm": 3.432509422302246, "learning_rate": 1.999734163755173e-05, "loss": 0.0269, "step": 110290 }, { "epoch": 133.0519010259505, "grad_norm": 3.4292893409729004, "learning_rate": 1.9997341396296936e-05, "loss": 0.0295, "step": 110300 }, { "epoch": 133.06397103198552, "grad_norm": 3.6343116760253906, "learning_rate": 1.9997341155042142e-05, "loss": 0.0317, "step": 110310 }, { "epoch": 133.07604103802052, "grad_norm": 3.744141101837158, "learning_rate": 1.9997340913787348e-05, "loss": 0.0306, "step": 110320 }, { "epoch": 133.08811104405552, "grad_norm": 2.84360671043396, "learning_rate": 1.999734067253255e-05, "loss": 0.0295, "step": 110330 }, { "epoch": 133.10018105009053, "grad_norm": 3.533329963684082, 
"learning_rate": 1.9997340431277757e-05, "loss": 0.03, "step": 110340 }, { "epoch": 133.11225105612553, "grad_norm": 3.5632247924804688, "learning_rate": 1.9997340190022963e-05, "loss": 0.0288, "step": 110350 }, { "epoch": 133.12432106216053, "grad_norm": 3.126340866088867, "learning_rate": 1.999733994876817e-05, "loss": 0.0303, "step": 110360 }, { "epoch": 133.13639106819554, "grad_norm": 3.8229939937591553, "learning_rate": 1.9997339707513376e-05, "loss": 0.0315, "step": 110370 }, { "epoch": 133.14846107423054, "grad_norm": 4.019691467285156, "learning_rate": 1.9997339466258582e-05, "loss": 0.0338, "step": 110380 }, { "epoch": 133.16053108026554, "grad_norm": 3.510550022125244, "learning_rate": 1.9997339225003788e-05, "loss": 0.0295, "step": 110390 }, { "epoch": 133.17260108630055, "grad_norm": 3.723007917404175, "learning_rate": 1.9997338983748994e-05, "loss": 0.0313, "step": 110400 }, { "epoch": 133.18467109233555, "grad_norm": 3.56760835647583, "learning_rate": 1.99973387424942e-05, "loss": 0.0318, "step": 110410 }, { "epoch": 133.19674109837055, "grad_norm": 3.220663070678711, "learning_rate": 1.9997338501239407e-05, "loss": 0.0319, "step": 110420 }, { "epoch": 133.20881110440556, "grad_norm": 3.7068610191345215, "learning_rate": 1.9997338259984613e-05, "loss": 0.0296, "step": 110430 }, { "epoch": 133.22088111044056, "grad_norm": 4.010056018829346, "learning_rate": 1.999733801872982e-05, "loss": 0.0328, "step": 110440 }, { "epoch": 133.23295111647556, "grad_norm": 4.163107872009277, "learning_rate": 1.9997337777475025e-05, "loss": 0.0308, "step": 110450 }, { "epoch": 133.24502112251056, "grad_norm": 3.121800661087036, "learning_rate": 1.9997337536220232e-05, "loss": 0.0333, "step": 110460 }, { "epoch": 133.25709112854557, "grad_norm": 3.369997024536133, "learning_rate": 1.9997337294965438e-05, "loss": 0.0314, "step": 110470 }, { "epoch": 133.26916113458057, "grad_norm": 3.0685877799987793, "learning_rate": 1.9997337053710644e-05, "loss": 0.0322, "step": 
110480 }, { "epoch": 133.28123114061557, "grad_norm": 3.5323376655578613, "learning_rate": 1.999733681245585e-05, "loss": 0.0313, "step": 110490 }, { "epoch": 133.29330114665058, "grad_norm": 3.559544086456299, "learning_rate": 1.9997336571201057e-05, "loss": 0.0325, "step": 110500 }, { "epoch": 133.29330114665058, "eval_loss": 13.298911094665527, "eval_runtime": 8.1492, "eval_samples_per_second": 85.53, "eval_steps_per_second": 10.799, "step": 110500 }, { "epoch": 133.30537115268558, "grad_norm": 3.2595458030700684, "learning_rate": 1.9997336329946263e-05, "loss": 0.0329, "step": 110510 }, { "epoch": 133.31744115872058, "grad_norm": 3.3060669898986816, "learning_rate": 1.999733608869147e-05, "loss": 0.0322, "step": 110520 }, { "epoch": 133.3295111647556, "grad_norm": 3.7432444095611572, "learning_rate": 1.9997335847436675e-05, "loss": 0.0328, "step": 110530 }, { "epoch": 133.3415811707906, "grad_norm": 4.239130020141602, "learning_rate": 1.999733560618188e-05, "loss": 0.0349, "step": 110540 }, { "epoch": 133.3536511768256, "grad_norm": 3.429046154022217, "learning_rate": 1.9997335364927088e-05, "loss": 0.0332, "step": 110550 }, { "epoch": 133.3657211828606, "grad_norm": 3.732522964477539, "learning_rate": 1.9997335123672294e-05, "loss": 0.0334, "step": 110560 }, { "epoch": 133.3777911888956, "grad_norm": 3.989677906036377, "learning_rate": 1.99973348824175e-05, "loss": 0.0327, "step": 110570 }, { "epoch": 133.3898611949306, "grad_norm": 3.5017964839935303, "learning_rate": 1.9997334641162706e-05, "loss": 0.0347, "step": 110580 }, { "epoch": 133.4019312009656, "grad_norm": 3.32092547416687, "learning_rate": 1.9997334399907912e-05, "loss": 0.0328, "step": 110590 }, { "epoch": 133.4140012070006, "grad_norm": 4.2251129150390625, "learning_rate": 1.999733415865312e-05, "loss": 0.0329, "step": 110600 }, { "epoch": 133.4260712130356, "grad_norm": 3.5074453353881836, "learning_rate": 1.9997333917398325e-05, "loss": 0.0323, "step": 110610 }, { "epoch": 133.43814121907062, 
"grad_norm": 3.6233725547790527, "learning_rate": 1.999733367614353e-05, "loss": 0.0352, "step": 110620 }, { "epoch": 133.45021122510562, "grad_norm": 3.5295701026916504, "learning_rate": 1.9997333434888737e-05, "loss": 0.0336, "step": 110630 }, { "epoch": 133.46228123114062, "grad_norm": 3.9190969467163086, "learning_rate": 1.9997333193633944e-05, "loss": 0.0345, "step": 110640 }, { "epoch": 133.47435123717563, "grad_norm": 4.001247406005859, "learning_rate": 1.999733295237915e-05, "loss": 0.035, "step": 110650 }, { "epoch": 133.48642124321063, "grad_norm": 4.138425827026367, "learning_rate": 1.9997332711124356e-05, "loss": 0.0347, "step": 110660 }, { "epoch": 133.49849124924563, "grad_norm": 3.9179956912994385, "learning_rate": 1.9997332469869562e-05, "loss": 0.0359, "step": 110670 }, { "epoch": 133.51056125528063, "grad_norm": 3.803826332092285, "learning_rate": 1.999733222861477e-05, "loss": 0.0345, "step": 110680 }, { "epoch": 133.52263126131564, "grad_norm": 3.6566662788391113, "learning_rate": 1.9997331987359975e-05, "loss": 0.035, "step": 110690 }, { "epoch": 133.53470126735064, "grad_norm": 4.073094367980957, "learning_rate": 1.999733174610518e-05, "loss": 0.0358, "step": 110700 }, { "epoch": 133.54677127338564, "grad_norm": 3.7104837894439697, "learning_rate": 1.9997331504850387e-05, "loss": 0.0351, "step": 110710 }, { "epoch": 133.55884127942065, "grad_norm": 3.795722246170044, "learning_rate": 1.9997331263595593e-05, "loss": 0.0354, "step": 110720 }, { "epoch": 133.57091128545565, "grad_norm": 4.060943126678467, "learning_rate": 1.99973310223408e-05, "loss": 0.0371, "step": 110730 }, { "epoch": 133.58298129149065, "grad_norm": 3.867842197418213, "learning_rate": 1.9997330781086002e-05, "loss": 0.0358, "step": 110740 }, { "epoch": 133.59505129752566, "grad_norm": 3.8117127418518066, "learning_rate": 1.999733053983121e-05, "loss": 0.036, "step": 110750 }, { "epoch": 133.60712130356066, "grad_norm": 3.539499044418335, "learning_rate": 
1.9997330298576415e-05, "loss": 0.0367, "step": 110760 }, { "epoch": 133.61919130959566, "grad_norm": 3.777909278869629, "learning_rate": 1.999733005732162e-05, "loss": 0.0352, "step": 110770 }, { "epoch": 133.63126131563067, "grad_norm": 3.737819194793701, "learning_rate": 1.9997329816066827e-05, "loss": 0.0354, "step": 110780 }, { "epoch": 133.64333132166567, "grad_norm": 3.8122715950012207, "learning_rate": 1.9997329574812033e-05, "loss": 0.0344, "step": 110790 }, { "epoch": 133.65540132770067, "grad_norm": 4.070217609405518, "learning_rate": 1.999732933355724e-05, "loss": 0.0361, "step": 110800 }, { "epoch": 133.66747133373568, "grad_norm": 3.844998359680176, "learning_rate": 1.9997329092302446e-05, "loss": 0.0362, "step": 110810 }, { "epoch": 133.67954133977068, "grad_norm": 3.7924904823303223, "learning_rate": 1.9997328851047652e-05, "loss": 0.0355, "step": 110820 }, { "epoch": 133.69161134580568, "grad_norm": 3.9023613929748535, "learning_rate": 1.9997328609792858e-05, "loss": 0.0359, "step": 110830 }, { "epoch": 133.7036813518407, "grad_norm": 3.440830945968628, "learning_rate": 1.9997328368538064e-05, "loss": 0.035, "step": 110840 }, { "epoch": 133.7157513578757, "grad_norm": 3.5951006412506104, "learning_rate": 1.999732812728327e-05, "loss": 0.0364, "step": 110850 }, { "epoch": 133.7278213639107, "grad_norm": 3.494546413421631, "learning_rate": 1.9997327886028477e-05, "loss": 0.0365, "step": 110860 }, { "epoch": 133.7398913699457, "grad_norm": 4.1499762535095215, "learning_rate": 1.9997327644773683e-05, "loss": 0.0354, "step": 110870 }, { "epoch": 133.7519613759807, "grad_norm": 3.8310790061950684, "learning_rate": 1.999732740351889e-05, "loss": 0.0357, "step": 110880 }, { "epoch": 133.7640313820157, "grad_norm": 4.031505584716797, "learning_rate": 1.9997327162264096e-05, "loss": 0.0367, "step": 110890 }, { "epoch": 133.7761013880507, "grad_norm": 3.8265089988708496, "learning_rate": 1.9997326921009302e-05, "loss": 0.0366, "step": 110900 }, { "epoch": 
133.7881713940857, "grad_norm": 3.9157638549804688, "learning_rate": 1.9997326679754508e-05, "loss": 0.0386, "step": 110910 }, { "epoch": 133.8002414001207, "grad_norm": 4.3965325355529785, "learning_rate": 1.9997326438499714e-05, "loss": 0.0402, "step": 110920 }, { "epoch": 133.81231140615571, "grad_norm": 3.4576637744903564, "learning_rate": 1.999732619724492e-05, "loss": 0.0377, "step": 110930 }, { "epoch": 133.82438141219072, "grad_norm": 4.089404106140137, "learning_rate": 1.9997325955990127e-05, "loss": 0.0374, "step": 110940 }, { "epoch": 133.83645141822572, "grad_norm": 3.965289354324341, "learning_rate": 1.9997325714735333e-05, "loss": 0.0386, "step": 110950 }, { "epoch": 133.84852142426072, "grad_norm": 3.690810203552246, "learning_rate": 1.999732547348054e-05, "loss": 0.0375, "step": 110960 }, { "epoch": 133.86059143029573, "grad_norm": 3.8265345096588135, "learning_rate": 1.9997325232225745e-05, "loss": 0.0382, "step": 110970 }, { "epoch": 133.87266143633073, "grad_norm": 4.398624420166016, "learning_rate": 1.999732499097095e-05, "loss": 0.0402, "step": 110980 }, { "epoch": 133.88473144236573, "grad_norm": 4.551454544067383, "learning_rate": 1.9997324749716154e-05, "loss": 0.0392, "step": 110990 }, { "epoch": 133.89680144840074, "grad_norm": 4.296910285949707, "learning_rate": 1.999732450846136e-05, "loss": 0.0385, "step": 111000 }, { "epoch": 133.89680144840074, "eval_loss": 13.279839515686035, "eval_runtime": 8.1873, "eval_samples_per_second": 85.132, "eval_steps_per_second": 10.748, "step": 111000 }, { "epoch": 133.90887145443574, "grad_norm": 3.6723530292510986, "learning_rate": 1.9997324267206567e-05, "loss": 0.0392, "step": 111010 }, { "epoch": 133.92094146047074, "grad_norm": 3.6463735103607178, "learning_rate": 1.9997324025951773e-05, "loss": 0.0388, "step": 111020 }, { "epoch": 133.93301146650575, "grad_norm": 3.908057689666748, "learning_rate": 1.999732378469698e-05, "loss": 0.037, "step": 111030 }, { "epoch": 133.94508147254075, "grad_norm": 
4.1112060546875, "learning_rate": 1.9997323543442185e-05, "loss": 0.0398, "step": 111040 }, { "epoch": 133.95715147857575, "grad_norm": 3.7105088233947754, "learning_rate": 1.999732330218739e-05, "loss": 0.037, "step": 111050 }, { "epoch": 133.96922148461076, "grad_norm": 4.522140026092529, "learning_rate": 1.9997323060932598e-05, "loss": 0.0401, "step": 111060 }, { "epoch": 133.98129149064576, "grad_norm": 3.9517524242401123, "learning_rate": 1.9997322819677804e-05, "loss": 0.0396, "step": 111070 }, { "epoch": 133.99336149668076, "grad_norm": 3.712806224822998, "learning_rate": 1.999732257842301e-05, "loss": 0.0371, "step": 111080 }, { "epoch": 134.004828002414, "grad_norm": 3.4044151306152344, "learning_rate": 1.9997322337168216e-05, "loss": 0.0333, "step": 111090 }, { "epoch": 134.016898008449, "grad_norm": 3.2462565898895264, "learning_rate": 1.9997322095913423e-05, "loss": 0.0254, "step": 111100 }, { "epoch": 134.028968014484, "grad_norm": 3.094968557357788, "learning_rate": 1.9997321854658632e-05, "loss": 0.0278, "step": 111110 }, { "epoch": 134.041038020519, "grad_norm": 3.3328065872192383, "learning_rate": 1.999732161340384e-05, "loss": 0.0278, "step": 111120 }, { "epoch": 134.053108026554, "grad_norm": 3.219123601913452, "learning_rate": 1.9997321372149045e-05, "loss": 0.0285, "step": 111130 }, { "epoch": 134.06517803258902, "grad_norm": 3.3882853984832764, "learning_rate": 1.999732113089425e-05, "loss": 0.0271, "step": 111140 }, { "epoch": 134.07724803862402, "grad_norm": 3.2242114543914795, "learning_rate": 1.9997320889639457e-05, "loss": 0.0288, "step": 111150 }, { "epoch": 134.08931804465902, "grad_norm": 3.835935592651367, "learning_rate": 1.999732064838466e-05, "loss": 0.0294, "step": 111160 }, { "epoch": 134.10138805069403, "grad_norm": 4.011316299438477, "learning_rate": 1.9997320407129866e-05, "loss": 0.031, "step": 111170 }, { "epoch": 134.11345805672903, "grad_norm": 3.5682640075683594, "learning_rate": 1.9997320165875072e-05, "loss": 0.0302, 
"step": 111180 }, { "epoch": 134.12552806276403, "grad_norm": 3.2057464122772217, "learning_rate": 1.999731992462028e-05, "loss": 0.0309, "step": 111190 }, { "epoch": 134.13759806879904, "grad_norm": 3.570439338684082, "learning_rate": 1.9997319683365485e-05, "loss": 0.0293, "step": 111200 }, { "epoch": 134.14966807483404, "grad_norm": 3.363670825958252, "learning_rate": 1.999731944211069e-05, "loss": 0.0301, "step": 111210 }, { "epoch": 134.16173808086904, "grad_norm": 3.3048298358917236, "learning_rate": 1.9997319200855897e-05, "loss": 0.0292, "step": 111220 }, { "epoch": 134.17380808690405, "grad_norm": 3.6965579986572266, "learning_rate": 1.9997318959601104e-05, "loss": 0.0323, "step": 111230 }, { "epoch": 134.18587809293905, "grad_norm": 3.4400155544281006, "learning_rate": 1.999731871834631e-05, "loss": 0.0301, "step": 111240 }, { "epoch": 134.19794809897405, "grad_norm": 3.9727604389190674, "learning_rate": 1.9997318477091516e-05, "loss": 0.0312, "step": 111250 }, { "epoch": 134.21001810500906, "grad_norm": 3.9429218769073486, "learning_rate": 1.9997318235836722e-05, "loss": 0.0305, "step": 111260 }, { "epoch": 134.22208811104406, "grad_norm": 3.131199836730957, "learning_rate": 1.999731799458193e-05, "loss": 0.0299, "step": 111270 }, { "epoch": 134.23415811707906, "grad_norm": 3.471595048904419, "learning_rate": 1.9997317753327135e-05, "loss": 0.0305, "step": 111280 }, { "epoch": 134.24622812311407, "grad_norm": 3.924201488494873, "learning_rate": 1.999731751207234e-05, "loss": 0.0319, "step": 111290 }, { "epoch": 134.25829812914907, "grad_norm": 3.6240813732147217, "learning_rate": 1.9997317270817547e-05, "loss": 0.0314, "step": 111300 }, { "epoch": 134.27036813518407, "grad_norm": 3.51688551902771, "learning_rate": 1.9997317029562753e-05, "loss": 0.0334, "step": 111310 }, { "epoch": 134.28243814121907, "grad_norm": 3.9423396587371826, "learning_rate": 1.999731678830796e-05, "loss": 0.0332, "step": 111320 }, { "epoch": 134.29450814725408, "grad_norm": 
3.8274359703063965, "learning_rate": 1.9997316547053166e-05, "loss": 0.0334, "step": 111330 }, { "epoch": 134.30657815328908, "grad_norm": 3.644534111022949, "learning_rate": 1.9997316305798372e-05, "loss": 0.0347, "step": 111340 }, { "epoch": 134.31864815932408, "grad_norm": 3.4629576206207275, "learning_rate": 1.9997316064543578e-05, "loss": 0.0326, "step": 111350 }, { "epoch": 134.3307181653591, "grad_norm": 3.2014498710632324, "learning_rate": 1.9997315823288784e-05, "loss": 0.0321, "step": 111360 }, { "epoch": 134.3427881713941, "grad_norm": 3.6938395500183105, "learning_rate": 1.999731558203399e-05, "loss": 0.0327, "step": 111370 }, { "epoch": 134.3548581774291, "grad_norm": 3.2707858085632324, "learning_rate": 1.9997315340779197e-05, "loss": 0.0329, "step": 111380 }, { "epoch": 134.3669281834641, "grad_norm": 3.843048572540283, "learning_rate": 1.9997315099524403e-05, "loss": 0.0342, "step": 111390 }, { "epoch": 134.3789981894991, "grad_norm": 3.579281806945801, "learning_rate": 1.999731485826961e-05, "loss": 0.0335, "step": 111400 }, { "epoch": 134.3910681955341, "grad_norm": 3.7381744384765625, "learning_rate": 1.9997314617014812e-05, "loss": 0.0339, "step": 111410 }, { "epoch": 134.4031382015691, "grad_norm": 4.615571022033691, "learning_rate": 1.9997314375760018e-05, "loss": 0.034, "step": 111420 }, { "epoch": 134.4152082076041, "grad_norm": 3.5962533950805664, "learning_rate": 1.9997314134505224e-05, "loss": 0.0356, "step": 111430 }, { "epoch": 134.4272782136391, "grad_norm": 3.906850814819336, "learning_rate": 1.999731389325043e-05, "loss": 0.0343, "step": 111440 }, { "epoch": 134.43934821967412, "grad_norm": 3.3018081188201904, "learning_rate": 1.9997313651995637e-05, "loss": 0.0366, "step": 111450 }, { "epoch": 134.45141822570912, "grad_norm": 3.9995453357696533, "learning_rate": 1.9997313410740843e-05, "loss": 0.0344, "step": 111460 }, { "epoch": 134.46348823174412, "grad_norm": 4.0639424324035645, "learning_rate": 1.999731316948605e-05, "loss": 
0.0331, "step": 111470 }, { "epoch": 134.47555823777913, "grad_norm": 3.5602755546569824, "learning_rate": 1.9997312928231256e-05, "loss": 0.0341, "step": 111480 }, { "epoch": 134.48762824381413, "grad_norm": 3.2411324977874756, "learning_rate": 1.9997312686976462e-05, "loss": 0.034, "step": 111490 }, { "epoch": 134.49969824984913, "grad_norm": 3.775719165802002, "learning_rate": 1.9997312445721668e-05, "loss": 0.0342, "step": 111500 }, { "epoch": 134.49969824984913, "eval_loss": 13.314921379089355, "eval_runtime": 8.1705, "eval_samples_per_second": 85.307, "eval_steps_per_second": 10.77, "step": 111500 }, { "epoch": 134.51176825588414, "grad_norm": 3.9534623622894287, "learning_rate": 1.9997312204466874e-05, "loss": 0.0367, "step": 111510 }, { "epoch": 134.52383826191914, "grad_norm": 3.8041868209838867, "learning_rate": 1.999731196321208e-05, "loss": 0.0352, "step": 111520 }, { "epoch": 134.53590826795414, "grad_norm": 3.8087775707244873, "learning_rate": 1.9997311721957287e-05, "loss": 0.0348, "step": 111530 }, { "epoch": 134.54797827398914, "grad_norm": 3.6860063076019287, "learning_rate": 1.9997311480702493e-05, "loss": 0.0349, "step": 111540 }, { "epoch": 134.56004828002415, "grad_norm": 3.7292640209198, "learning_rate": 1.99973112394477e-05, "loss": 0.0336, "step": 111550 }, { "epoch": 134.57211828605915, "grad_norm": 3.80342960357666, "learning_rate": 1.9997310998192905e-05, "loss": 0.035, "step": 111560 }, { "epoch": 134.58418829209415, "grad_norm": 3.596423625946045, "learning_rate": 1.999731075693811e-05, "loss": 0.035, "step": 111570 }, { "epoch": 134.59625829812916, "grad_norm": 3.4051663875579834, "learning_rate": 1.9997310515683318e-05, "loss": 0.0354, "step": 111580 }, { "epoch": 134.60832830416416, "grad_norm": 3.575540781021118, "learning_rate": 1.9997310274428524e-05, "loss": 0.0363, "step": 111590 }, { "epoch": 134.62039831019916, "grad_norm": 4.0288238525390625, "learning_rate": 1.999731003317373e-05, "loss": 0.036, "step": 111600 }, { "epoch": 
134.63246831623417, "grad_norm": 3.444014072418213, "learning_rate": 1.9997309791918936e-05, "loss": 0.0353, "step": 111610 }, { "epoch": 134.64453832226917, "grad_norm": 4.408532619476318, "learning_rate": 1.9997309550664143e-05, "loss": 0.0364, "step": 111620 }, { "epoch": 134.65660832830417, "grad_norm": 4.066373825073242, "learning_rate": 1.999730930940935e-05, "loss": 0.0376, "step": 111630 }, { "epoch": 134.66867833433918, "grad_norm": 4.062343597412109, "learning_rate": 1.9997309068154555e-05, "loss": 0.0357, "step": 111640 }, { "epoch": 134.68074834037418, "grad_norm": 4.038297176361084, "learning_rate": 1.999730882689976e-05, "loss": 0.0363, "step": 111650 }, { "epoch": 134.69281834640918, "grad_norm": 3.924981117248535, "learning_rate": 1.9997308585644967e-05, "loss": 0.0341, "step": 111660 }, { "epoch": 134.7048883524442, "grad_norm": 3.726881504058838, "learning_rate": 1.9997308344390174e-05, "loss": 0.0359, "step": 111670 }, { "epoch": 134.7169583584792, "grad_norm": 3.559183120727539, "learning_rate": 1.999730810313538e-05, "loss": 0.0348, "step": 111680 }, { "epoch": 134.7290283645142, "grad_norm": 4.031441688537598, "learning_rate": 1.9997307861880586e-05, "loss": 0.0386, "step": 111690 }, { "epoch": 134.7410983705492, "grad_norm": 3.627102851867676, "learning_rate": 1.9997307620625792e-05, "loss": 0.0352, "step": 111700 }, { "epoch": 134.7531683765842, "grad_norm": 4.3110270500183105, "learning_rate": 1.9997307379371e-05, "loss": 0.0366, "step": 111710 }, { "epoch": 134.7652383826192, "grad_norm": 3.963238000869751, "learning_rate": 1.9997307138116205e-05, "loss": 0.0369, "step": 111720 }, { "epoch": 134.7773083886542, "grad_norm": 3.4519219398498535, "learning_rate": 1.999730689686141e-05, "loss": 0.0384, "step": 111730 }, { "epoch": 134.7893783946892, "grad_norm": 3.4165401458740234, "learning_rate": 1.9997306655606617e-05, "loss": 0.0361, "step": 111740 }, { "epoch": 134.8014484007242, "grad_norm": 4.197215557098389, "learning_rate": 
1.9997306414351823e-05, "loss": 0.0364, "step": 111750 }, { "epoch": 134.81351840675921, "grad_norm": 4.011495113372803, "learning_rate": 1.999730617309703e-05, "loss": 0.0375, "step": 111760 }, { "epoch": 134.82558841279422, "grad_norm": 3.698286771774292, "learning_rate": 1.9997305931842236e-05, "loss": 0.0373, "step": 111770 }, { "epoch": 134.83765841882922, "grad_norm": 3.8003952503204346, "learning_rate": 1.9997305690587442e-05, "loss": 0.0372, "step": 111780 }, { "epoch": 134.84972842486422, "grad_norm": 3.9824154376983643, "learning_rate": 1.9997305449332648e-05, "loss": 0.037, "step": 111790 }, { "epoch": 134.86179843089923, "grad_norm": 4.16353702545166, "learning_rate": 1.9997305208077854e-05, "loss": 0.0388, "step": 111800 }, { "epoch": 134.87386843693423, "grad_norm": 3.853778600692749, "learning_rate": 1.999730496682306e-05, "loss": 0.0391, "step": 111810 }, { "epoch": 134.88593844296923, "grad_norm": 3.723334789276123, "learning_rate": 1.9997304725568263e-05, "loss": 0.0375, "step": 111820 }, { "epoch": 134.89800844900424, "grad_norm": 3.8584940433502197, "learning_rate": 1.999730448431347e-05, "loss": 0.0392, "step": 111830 }, { "epoch": 134.91007845503924, "grad_norm": 3.4863686561584473, "learning_rate": 1.9997304243058676e-05, "loss": 0.0374, "step": 111840 }, { "epoch": 134.92214846107424, "grad_norm": 4.09597635269165, "learning_rate": 1.9997304001803882e-05, "loss": 0.0365, "step": 111850 }, { "epoch": 134.93421846710925, "grad_norm": 3.565704345703125, "learning_rate": 1.9997303760549088e-05, "loss": 0.0403, "step": 111860 }, { "epoch": 134.94628847314425, "grad_norm": 4.263326168060303, "learning_rate": 1.9997303519294295e-05, "loss": 0.0384, "step": 111870 }, { "epoch": 134.95835847917925, "grad_norm": 3.8951122760772705, "learning_rate": 1.99973032780395e-05, "loss": 0.0382, "step": 111880 }, { "epoch": 134.97042848521426, "grad_norm": 4.058570384979248, "learning_rate": 1.9997303036784707e-05, "loss": 0.0393, "step": 111890 }, { "epoch": 
134.98249849124926, "grad_norm": 3.9972617626190186, "learning_rate": 1.9997302795529913e-05, "loss": 0.0379, "step": 111900 }, { "epoch": 134.99456849728426, "grad_norm": 3.922955274581909, "learning_rate": 1.999730255427512e-05, "loss": 0.0378, "step": 111910 }, { "epoch": 135.0060350030175, "grad_norm": 2.859344482421875, "learning_rate": 1.9997302313020326e-05, "loss": 0.032, "step": 111920 }, { "epoch": 135.0181050090525, "grad_norm": 3.0729143619537354, "learning_rate": 1.9997302071765532e-05, "loss": 0.0236, "step": 111930 }, { "epoch": 135.0301750150875, "grad_norm": 3.441162109375, "learning_rate": 1.9997301830510738e-05, "loss": 0.0277, "step": 111940 }, { "epoch": 135.0422450211225, "grad_norm": 2.829716920852661, "learning_rate": 1.9997301589255944e-05, "loss": 0.0279, "step": 111950 }, { "epoch": 135.05431502715751, "grad_norm": 3.552825450897217, "learning_rate": 1.999730134800115e-05, "loss": 0.0278, "step": 111960 }, { "epoch": 135.06638503319252, "grad_norm": 3.231715202331543, "learning_rate": 1.9997301106746357e-05, "loss": 0.0282, "step": 111970 }, { "epoch": 135.07845503922752, "grad_norm": 3.4480841159820557, "learning_rate": 1.9997300865491563e-05, "loss": 0.0296, "step": 111980 }, { "epoch": 135.09052504526252, "grad_norm": 3.2923026084899902, "learning_rate": 1.999730062423677e-05, "loss": 0.0268, "step": 111990 }, { "epoch": 135.10259505129753, "grad_norm": 3.1666805744171143, "learning_rate": 1.9997300382981975e-05, "loss": 0.0299, "step": 112000 }, { "epoch": 135.10259505129753, "eval_loss": 13.309896469116211, "eval_runtime": 8.1668, "eval_samples_per_second": 85.346, "eval_steps_per_second": 10.775, "step": 112000 }, { "epoch": 135.11466505733253, "grad_norm": 3.446138858795166, "learning_rate": 1.999730014172718e-05, "loss": 0.0296, "step": 112010 }, { "epoch": 135.12673506336753, "grad_norm": 3.9419827461242676, "learning_rate": 1.9997299900472388e-05, "loss": 0.0297, "step": 112020 }, { "epoch": 135.13880506940254, "grad_norm": 
3.400510311126709, "learning_rate": 1.9997299659217594e-05, "loss": 0.028, "step": 112030 }, { "epoch": 135.15087507543754, "grad_norm": 3.295396089553833, "learning_rate": 1.99972994179628e-05, "loss": 0.0288, "step": 112040 }, { "epoch": 135.16294508147254, "grad_norm": 3.4163379669189453, "learning_rate": 1.9997299176708006e-05, "loss": 0.0298, "step": 112050 }, { "epoch": 135.17501508750755, "grad_norm": 3.3353774547576904, "learning_rate": 1.9997298935453213e-05, "loss": 0.0317, "step": 112060 }, { "epoch": 135.18708509354255, "grad_norm": 3.173999786376953, "learning_rate": 1.9997298694198415e-05, "loss": 0.0307, "step": 112070 }, { "epoch": 135.19915509957755, "grad_norm": 3.11354398727417, "learning_rate": 1.999729845294362e-05, "loss": 0.0311, "step": 112080 }, { "epoch": 135.21122510561256, "grad_norm": 3.522416353225708, "learning_rate": 1.9997298211688828e-05, "loss": 0.0321, "step": 112090 }, { "epoch": 135.22329511164756, "grad_norm": 3.8452227115631104, "learning_rate": 1.9997297970434034e-05, "loss": 0.0316, "step": 112100 }, { "epoch": 135.23536511768256, "grad_norm": 3.614595651626587, "learning_rate": 1.999729772917924e-05, "loss": 0.0313, "step": 112110 }, { "epoch": 135.24743512371757, "grad_norm": 3.783914089202881, "learning_rate": 1.9997297487924447e-05, "loss": 0.033, "step": 112120 }, { "epoch": 135.25950512975257, "grad_norm": 3.3990120887756348, "learning_rate": 1.9997297246669653e-05, "loss": 0.032, "step": 112130 }, { "epoch": 135.27157513578757, "grad_norm": 3.400040864944458, "learning_rate": 1.999729700541486e-05, "loss": 0.0308, "step": 112140 }, { "epoch": 135.28364514182257, "grad_norm": 3.4837758541107178, "learning_rate": 1.9997296764160065e-05, "loss": 0.0318, "step": 112150 }, { "epoch": 135.29571514785758, "grad_norm": 3.402927875518799, "learning_rate": 1.999729652290527e-05, "loss": 0.0348, "step": 112160 }, { "epoch": 135.30778515389258, "grad_norm": 3.750244379043579, "learning_rate": 1.9997296281650478e-05, "loss": 
0.0348, "step": 112170 }, { "epoch": 135.31985515992758, "grad_norm": 3.908245086669922, "learning_rate": 1.9997296040395684e-05, "loss": 0.033, "step": 112180 }, { "epoch": 135.3319251659626, "grad_norm": 3.750645160675049, "learning_rate": 1.9997295799140893e-05, "loss": 0.0323, "step": 112190 }, { "epoch": 135.3439951719976, "grad_norm": 3.7653274536132812, "learning_rate": 1.99972955578861e-05, "loss": 0.034, "step": 112200 }, { "epoch": 135.3560651780326, "grad_norm": 3.885206460952759, "learning_rate": 1.9997295316631306e-05, "loss": 0.0325, "step": 112210 }, { "epoch": 135.3681351840676, "grad_norm": 3.74125599861145, "learning_rate": 1.9997295075376512e-05, "loss": 0.034, "step": 112220 }, { "epoch": 135.3802051901026, "grad_norm": 3.497706413269043, "learning_rate": 1.9997294834121715e-05, "loss": 0.033, "step": 112230 }, { "epoch": 135.3922751961376, "grad_norm": 3.342161178588867, "learning_rate": 1.999729459286692e-05, "loss": 0.0343, "step": 112240 }, { "epoch": 135.4043452021726, "grad_norm": 3.360208511352539, "learning_rate": 1.9997294351612127e-05, "loss": 0.0341, "step": 112250 }, { "epoch": 135.4164152082076, "grad_norm": 3.291797399520874, "learning_rate": 1.9997294110357334e-05, "loss": 0.0348, "step": 112260 }, { "epoch": 135.4284852142426, "grad_norm": 3.837857246398926, "learning_rate": 1.999729386910254e-05, "loss": 0.0346, "step": 112270 }, { "epoch": 135.44055522027762, "grad_norm": 3.5286593437194824, "learning_rate": 1.9997293627847746e-05, "loss": 0.0353, "step": 112280 }, { "epoch": 135.45262522631262, "grad_norm": 3.4647233486175537, "learning_rate": 1.9997293386592952e-05, "loss": 0.0353, "step": 112290 }, { "epoch": 135.46469523234762, "grad_norm": 3.3072562217712402, "learning_rate": 1.999729314533816e-05, "loss": 0.0335, "step": 112300 }, { "epoch": 135.47676523838263, "grad_norm": 3.7289786338806152, "learning_rate": 1.9997292904083365e-05, "loss": 0.0337, "step": 112310 }, { "epoch": 135.48883524441763, "grad_norm": 
3.6889407634735107, "learning_rate": 1.999729266282857e-05, "loss": 0.0332, "step": 112320 }, { "epoch": 135.50090525045263, "grad_norm": 3.256443738937378, "learning_rate": 1.9997292421573777e-05, "loss": 0.0329, "step": 112330 }, { "epoch": 135.51297525648764, "grad_norm": 3.6716713905334473, "learning_rate": 1.9997292180318983e-05, "loss": 0.0349, "step": 112340 }, { "epoch": 135.52504526252264, "grad_norm": 3.959540605545044, "learning_rate": 1.999729193906419e-05, "loss": 0.0346, "step": 112350 }, { "epoch": 135.53711526855764, "grad_norm": 3.941373348236084, "learning_rate": 1.9997291697809396e-05, "loss": 0.0362, "step": 112360 }, { "epoch": 135.54918527459265, "grad_norm": 4.159048557281494, "learning_rate": 1.9997291456554602e-05, "loss": 0.0356, "step": 112370 }, { "epoch": 135.56125528062765, "grad_norm": 3.5170607566833496, "learning_rate": 1.9997291215299808e-05, "loss": 0.0339, "step": 112380 }, { "epoch": 135.57332528666265, "grad_norm": 3.606083393096924, "learning_rate": 1.9997290974045014e-05, "loss": 0.0339, "step": 112390 }, { "epoch": 135.58539529269765, "grad_norm": 4.083943843841553, "learning_rate": 1.999729073279022e-05, "loss": 0.0351, "step": 112400 }, { "epoch": 135.59746529873266, "grad_norm": 3.5795176029205322, "learning_rate": 1.9997290491535427e-05, "loss": 0.0366, "step": 112410 }, { "epoch": 135.60953530476766, "grad_norm": 3.450845718383789, "learning_rate": 1.9997290250280633e-05, "loss": 0.0342, "step": 112420 }, { "epoch": 135.62160531080266, "grad_norm": 3.577342987060547, "learning_rate": 1.999729000902584e-05, "loss": 0.0344, "step": 112430 }, { "epoch": 135.63367531683767, "grad_norm": 3.917128562927246, "learning_rate": 1.9997289767771045e-05, "loss": 0.0362, "step": 112440 }, { "epoch": 135.64574532287267, "grad_norm": 3.152911424636841, "learning_rate": 1.999728952651625e-05, "loss": 0.0351, "step": 112450 }, { "epoch": 135.65781532890767, "grad_norm": 3.6017329692840576, "learning_rate": 1.9997289285261458e-05, "loss": 
0.0363, "step": 112460 }, { "epoch": 135.66988533494268, "grad_norm": 3.9478983879089355, "learning_rate": 1.9997289044006664e-05, "loss": 0.0359, "step": 112470 }, { "epoch": 135.68195534097768, "grad_norm": 4.003688335418701, "learning_rate": 1.9997288802751867e-05, "loss": 0.0366, "step": 112480 }, { "epoch": 135.69402534701268, "grad_norm": 3.6151254177093506, "learning_rate": 1.9997288561497073e-05, "loss": 0.039, "step": 112490 }, { "epoch": 135.7060953530477, "grad_norm": 3.6069858074188232, "learning_rate": 1.999728832024228e-05, "loss": 0.0351, "step": 112500 }, { "epoch": 135.7060953530477, "eval_loss": 13.345036506652832, "eval_runtime": 8.1686, "eval_samples_per_second": 85.327, "eval_steps_per_second": 10.773, "step": 112500 }, { "epoch": 135.7181653590827, "grad_norm": 3.6475048065185547, "learning_rate": 1.9997288078987486e-05, "loss": 0.0353, "step": 112510 }, { "epoch": 135.7302353651177, "grad_norm": 4.029049873352051, "learning_rate": 1.9997287837732692e-05, "loss": 0.0347, "step": 112520 }, { "epoch": 135.7423053711527, "grad_norm": 3.735318183898926, "learning_rate": 1.9997287596477898e-05, "loss": 0.0357, "step": 112530 }, { "epoch": 135.7543753771877, "grad_norm": 3.7286159992218018, "learning_rate": 1.9997287355223104e-05, "loss": 0.0365, "step": 112540 }, { "epoch": 135.7664453832227, "grad_norm": 3.82015323638916, "learning_rate": 1.999728711396831e-05, "loss": 0.0372, "step": 112550 }, { "epoch": 135.7785153892577, "grad_norm": 3.772801399230957, "learning_rate": 1.9997286872713517e-05, "loss": 0.0352, "step": 112560 }, { "epoch": 135.7905853952927, "grad_norm": 4.079813003540039, "learning_rate": 1.9997286631458723e-05, "loss": 0.0361, "step": 112570 }, { "epoch": 135.8026554013277, "grad_norm": 4.119377136230469, "learning_rate": 1.999728639020393e-05, "loss": 0.0372, "step": 112580 }, { "epoch": 135.81472540736272, "grad_norm": 3.8583385944366455, "learning_rate": 1.9997286148949135e-05, "loss": 0.0365, "step": 112590 }, { "epoch": 
135.82679541339772, "grad_norm": 4.041693210601807, "learning_rate": 1.999728590769434e-05, "loss": 0.0357, "step": 112600 }, { "epoch": 135.83886541943272, "grad_norm": 3.6110615730285645, "learning_rate": 1.9997285666439548e-05, "loss": 0.036, "step": 112610 }, { "epoch": 135.85093542546772, "grad_norm": 4.091300010681152, "learning_rate": 1.9997285425184754e-05, "loss": 0.0368, "step": 112620 }, { "epoch": 135.86300543150273, "grad_norm": 4.106878757476807, "learning_rate": 1.999728518392996e-05, "loss": 0.0371, "step": 112630 }, { "epoch": 135.87507543753773, "grad_norm": 4.077164649963379, "learning_rate": 1.9997284942675166e-05, "loss": 0.0388, "step": 112640 }, { "epoch": 135.88714544357273, "grad_norm": 3.7839865684509277, "learning_rate": 1.9997284701420373e-05, "loss": 0.0376, "step": 112650 }, { "epoch": 135.89921544960774, "grad_norm": 3.7983107566833496, "learning_rate": 1.999728446016558e-05, "loss": 0.0375, "step": 112660 }, { "epoch": 135.91128545564274, "grad_norm": 3.915958881378174, "learning_rate": 1.9997284218910785e-05, "loss": 0.0371, "step": 112670 }, { "epoch": 135.92335546167774, "grad_norm": 3.7744083404541016, "learning_rate": 1.999728397765599e-05, "loss": 0.0365, "step": 112680 }, { "epoch": 135.93542546771275, "grad_norm": 3.9090490341186523, "learning_rate": 1.9997283736401197e-05, "loss": 0.0361, "step": 112690 }, { "epoch": 135.94749547374775, "grad_norm": 4.561279773712158, "learning_rate": 1.9997283495146404e-05, "loss": 0.0382, "step": 112700 }, { "epoch": 135.95956547978275, "grad_norm": 3.8069908618927, "learning_rate": 1.999728325389161e-05, "loss": 0.0366, "step": 112710 }, { "epoch": 135.97163548581776, "grad_norm": 4.076487064361572, "learning_rate": 1.9997283012636816e-05, "loss": 0.0386, "step": 112720 }, { "epoch": 135.98370549185276, "grad_norm": 3.519378423690796, "learning_rate": 1.9997282771382022e-05, "loss": 0.039, "step": 112730 }, { "epoch": 135.99577549788776, "grad_norm": 3.6473376750946045, "learning_rate": 
1.999728253012723e-05, "loss": 0.0384, "step": 112740 }, { "epoch": 136.007242003621, "grad_norm": 2.986841917037964, "learning_rate": 1.9997282288872435e-05, "loss": 0.0308, "step": 112750 }, { "epoch": 136.019312009656, "grad_norm": 3.3854312896728516, "learning_rate": 1.999728204761764e-05, "loss": 0.0255, "step": 112760 }, { "epoch": 136.031382015691, "grad_norm": 3.736431360244751, "learning_rate": 1.9997281806362847e-05, "loss": 0.0268, "step": 112770 }, { "epoch": 136.043452021726, "grad_norm": 2.977726459503174, "learning_rate": 1.9997281565108053e-05, "loss": 0.0271, "step": 112780 }, { "epoch": 136.05552202776101, "grad_norm": 3.109063148498535, "learning_rate": 1.999728132385326e-05, "loss": 0.0271, "step": 112790 }, { "epoch": 136.06759203379602, "grad_norm": 3.211918592453003, "learning_rate": 1.9997281082598466e-05, "loss": 0.0284, "step": 112800 }, { "epoch": 136.07966203983102, "grad_norm": 3.5791070461273193, "learning_rate": 1.9997280841343672e-05, "loss": 0.0279, "step": 112810 }, { "epoch": 136.09173204586602, "grad_norm": 3.2370376586914062, "learning_rate": 1.9997280600088878e-05, "loss": 0.0305, "step": 112820 }, { "epoch": 136.10380205190103, "grad_norm": 3.578221321105957, "learning_rate": 1.9997280358834084e-05, "loss": 0.0297, "step": 112830 }, { "epoch": 136.11587205793603, "grad_norm": 3.4730265140533447, "learning_rate": 1.999728011757929e-05, "loss": 0.0292, "step": 112840 }, { "epoch": 136.12794206397103, "grad_norm": 3.363391637802124, "learning_rate": 1.9997279876324497e-05, "loss": 0.0295, "step": 112850 }, { "epoch": 136.14001207000604, "grad_norm": 3.616219997406006, "learning_rate": 1.9997279635069703e-05, "loss": 0.0317, "step": 112860 }, { "epoch": 136.15208207604104, "grad_norm": 3.466155529022217, "learning_rate": 1.999727939381491e-05, "loss": 0.0305, "step": 112870 }, { "epoch": 136.16415208207604, "grad_norm": 3.651829957962036, "learning_rate": 1.9997279152560115e-05, "loss": 0.0307, "step": 112880 }, { "epoch": 
136.17622208811105, "grad_norm": 4.1136794090271, "learning_rate": 1.9997278911305322e-05, "loss": 0.0312, "step": 112890 }, { "epoch": 136.18829209414605, "grad_norm": 3.2465946674346924, "learning_rate": 1.9997278670050525e-05, "loss": 0.0318, "step": 112900 }, { "epoch": 136.20036210018105, "grad_norm": 3.3640923500061035, "learning_rate": 1.999727842879573e-05, "loss": 0.0293, "step": 112910 }, { "epoch": 136.21243210621606, "grad_norm": 3.78999924659729, "learning_rate": 1.9997278187540937e-05, "loss": 0.0322, "step": 112920 }, { "epoch": 136.22450211225106, "grad_norm": 3.4381332397460938, "learning_rate": 1.9997277946286143e-05, "loss": 0.0312, "step": 112930 }, { "epoch": 136.23657211828606, "grad_norm": 3.418445348739624, "learning_rate": 1.999727770503135e-05, "loss": 0.0304, "step": 112940 }, { "epoch": 136.24864212432107, "grad_norm": 3.5691418647766113, "learning_rate": 1.9997277463776556e-05, "loss": 0.03, "step": 112950 }, { "epoch": 136.26071213035607, "grad_norm": 3.97804856300354, "learning_rate": 1.9997277222521762e-05, "loss": 0.0314, "step": 112960 }, { "epoch": 136.27278213639107, "grad_norm": 3.617116928100586, "learning_rate": 1.9997276981266968e-05, "loss": 0.0311, "step": 112970 }, { "epoch": 136.28485214242608, "grad_norm": 3.9393391609191895, "learning_rate": 1.9997276740012174e-05, "loss": 0.0325, "step": 112980 }, { "epoch": 136.29692214846108, "grad_norm": 3.196687698364258, "learning_rate": 1.999727649875738e-05, "loss": 0.032, "step": 112990 }, { "epoch": 136.30899215449608, "grad_norm": 3.9843063354492188, "learning_rate": 1.9997276257502587e-05, "loss": 0.0333, "step": 113000 }, { "epoch": 136.30899215449608, "eval_loss": 13.345564842224121, "eval_runtime": 8.1877, "eval_samples_per_second": 85.128, "eval_steps_per_second": 10.748, "step": 113000 }, { "epoch": 136.32106216053108, "grad_norm": 3.358295202255249, "learning_rate": 1.9997276016247793e-05, "loss": 0.0327, "step": 113010 }, { "epoch": 136.3331321665661, "grad_norm": 
3.1848208904266357, "learning_rate": 1.9997275774993e-05, "loss": 0.0344, "step": 113020 }, { "epoch": 136.3452021726011, "grad_norm": 3.1574506759643555, "learning_rate": 1.9997275533738205e-05, "loss": 0.0323, "step": 113030 }, { "epoch": 136.3572721786361, "grad_norm": 4.022443771362305, "learning_rate": 1.999727529248341e-05, "loss": 0.0332, "step": 113040 }, { "epoch": 136.3693421846711, "grad_norm": 3.874346971511841, "learning_rate": 1.9997275051228618e-05, "loss": 0.0321, "step": 113050 }, { "epoch": 136.3814121907061, "grad_norm": 3.5772294998168945, "learning_rate": 1.9997274809973824e-05, "loss": 0.0322, "step": 113060 }, { "epoch": 136.3934821967411, "grad_norm": 3.6972126960754395, "learning_rate": 1.999727456871903e-05, "loss": 0.0331, "step": 113070 }, { "epoch": 136.4055522027761, "grad_norm": 3.471121072769165, "learning_rate": 1.9997274327464236e-05, "loss": 0.033, "step": 113080 }, { "epoch": 136.4176222088111, "grad_norm": 3.979156732559204, "learning_rate": 1.9997274086209443e-05, "loss": 0.0331, "step": 113090 }, { "epoch": 136.4296922148461, "grad_norm": 3.671051263809204, "learning_rate": 1.999727384495465e-05, "loss": 0.0326, "step": 113100 }, { "epoch": 136.44176222088112, "grad_norm": 3.3135743141174316, "learning_rate": 1.9997273603699855e-05, "loss": 0.0348, "step": 113110 }, { "epoch": 136.45383222691612, "grad_norm": 3.46612286567688, "learning_rate": 1.999727336244506e-05, "loss": 0.0343, "step": 113120 }, { "epoch": 136.46590223295112, "grad_norm": 4.014800071716309, "learning_rate": 1.9997273121190267e-05, "loss": 0.0328, "step": 113130 }, { "epoch": 136.47797223898613, "grad_norm": 3.5406341552734375, "learning_rate": 1.9997272879935474e-05, "loss": 0.0337, "step": 113140 }, { "epoch": 136.49004224502113, "grad_norm": 3.3198235034942627, "learning_rate": 1.9997272638680677e-05, "loss": 0.0339, "step": 113150 }, { "epoch": 136.50211225105613, "grad_norm": 4.012604236602783, "learning_rate": 1.9997272397425883e-05, "loss": 0.0341, 
"step": 113160 }, { "epoch": 136.51418225709114, "grad_norm": 4.089316368103027, "learning_rate": 1.999727215617109e-05, "loss": 0.0365, "step": 113170 }, { "epoch": 136.52625226312614, "grad_norm": 3.634805917739868, "learning_rate": 1.9997271914916295e-05, "loss": 0.035, "step": 113180 }, { "epoch": 136.53832226916114, "grad_norm": 3.525632619857788, "learning_rate": 1.99972716736615e-05, "loss": 0.0341, "step": 113190 }, { "epoch": 136.55039227519615, "grad_norm": 3.572026252746582, "learning_rate": 1.9997271432406708e-05, "loss": 0.0342, "step": 113200 }, { "epoch": 136.56246228123115, "grad_norm": 3.334413766860962, "learning_rate": 1.9997271191151914e-05, "loss": 0.035, "step": 113210 }, { "epoch": 136.57453228726615, "grad_norm": 3.6279704570770264, "learning_rate": 1.999727094989712e-05, "loss": 0.0337, "step": 113220 }, { "epoch": 136.58660229330115, "grad_norm": 3.2914443016052246, "learning_rate": 1.9997270708642326e-05, "loss": 0.0353, "step": 113230 }, { "epoch": 136.59867229933616, "grad_norm": 4.099559307098389, "learning_rate": 1.9997270467387532e-05, "loss": 0.0349, "step": 113240 }, { "epoch": 136.61074230537116, "grad_norm": 3.676015615463257, "learning_rate": 1.999727022613274e-05, "loss": 0.0356, "step": 113250 }, { "epoch": 136.62281231140616, "grad_norm": 3.839491128921509, "learning_rate": 1.9997269984877945e-05, "loss": 0.0345, "step": 113260 }, { "epoch": 136.63488231744117, "grad_norm": 3.690974235534668, "learning_rate": 1.9997269743623155e-05, "loss": 0.0359, "step": 113270 }, { "epoch": 136.64695232347617, "grad_norm": 4.409002304077148, "learning_rate": 1.999726950236836e-05, "loss": 0.0363, "step": 113280 }, { "epoch": 136.65902232951117, "grad_norm": 3.730311632156372, "learning_rate": 1.9997269261113567e-05, "loss": 0.0343, "step": 113290 }, { "epoch": 136.67109233554618, "grad_norm": 3.6937270164489746, "learning_rate": 1.9997269019858773e-05, "loss": 0.0372, "step": 113300 }, { "epoch": 136.68316234158118, "grad_norm": 
4.233080863952637, "learning_rate": 1.9997268778603976e-05, "loss": 0.036, "step": 113310 }, { "epoch": 136.69523234761618, "grad_norm": 3.564448595046997, "learning_rate": 1.9997268537349182e-05, "loss": 0.037, "step": 113320 }, { "epoch": 136.7073023536512, "grad_norm": 3.9407503604888916, "learning_rate": 1.999726829609439e-05, "loss": 0.0366, "step": 113330 }, { "epoch": 136.7193723596862, "grad_norm": 3.7812247276306152, "learning_rate": 1.9997268054839595e-05, "loss": 0.0353, "step": 113340 }, { "epoch": 136.7314423657212, "grad_norm": 3.8895153999328613, "learning_rate": 1.99972678135848e-05, "loss": 0.0365, "step": 113350 }, { "epoch": 136.7435123717562, "grad_norm": 4.00675630569458, "learning_rate": 1.9997267572330007e-05, "loss": 0.0359, "step": 113360 }, { "epoch": 136.7555823777912, "grad_norm": 3.9647603034973145, "learning_rate": 1.9997267331075213e-05, "loss": 0.0377, "step": 113370 }, { "epoch": 136.7676523838262, "grad_norm": 4.106258869171143, "learning_rate": 1.999726708982042e-05, "loss": 0.0376, "step": 113380 }, { "epoch": 136.7797223898612, "grad_norm": 3.5276944637298584, "learning_rate": 1.9997266848565626e-05, "loss": 0.0355, "step": 113390 }, { "epoch": 136.7917923958962, "grad_norm": 4.045913219451904, "learning_rate": 1.9997266607310832e-05, "loss": 0.0378, "step": 113400 }, { "epoch": 136.8038624019312, "grad_norm": 3.955909490585327, "learning_rate": 1.9997266366056038e-05, "loss": 0.0363, "step": 113410 }, { "epoch": 136.81593240796622, "grad_norm": 3.6476316452026367, "learning_rate": 1.9997266124801244e-05, "loss": 0.0366, "step": 113420 }, { "epoch": 136.82800241400122, "grad_norm": 3.6430373191833496, "learning_rate": 1.999726588354645e-05, "loss": 0.0362, "step": 113430 }, { "epoch": 136.84007242003622, "grad_norm": 3.8191802501678467, "learning_rate": 1.9997265642291657e-05, "loss": 0.0359, "step": 113440 }, { "epoch": 136.85214242607123, "grad_norm": 4.024453163146973, "learning_rate": 1.9997265401036863e-05, "loss": 0.0365, 
"step": 113450 }, { "epoch": 136.86421243210623, "grad_norm": 3.7969141006469727, "learning_rate": 1.999726515978207e-05, "loss": 0.0374, "step": 113460 }, { "epoch": 136.87628243814123, "grad_norm": 3.891348361968994, "learning_rate": 1.9997264918527275e-05, "loss": 0.0364, "step": 113470 }, { "epoch": 136.88835244417623, "grad_norm": 4.045078277587891, "learning_rate": 1.999726467727248e-05, "loss": 0.0373, "step": 113480 }, { "epoch": 136.90042245021124, "grad_norm": 3.769214630126953, "learning_rate": 1.9997264436017688e-05, "loss": 0.0377, "step": 113490 }, { "epoch": 136.91249245624624, "grad_norm": 3.6933934688568115, "learning_rate": 1.9997264194762894e-05, "loss": 0.0381, "step": 113500 }, { "epoch": 136.91249245624624, "eval_loss": 13.356138229370117, "eval_runtime": 8.1777, "eval_samples_per_second": 85.232, "eval_steps_per_second": 10.761, "step": 113500 }, { "epoch": 136.92456246228124, "grad_norm": 3.941481828689575, "learning_rate": 1.99972639535081e-05, "loss": 0.0371, "step": 113510 }, { "epoch": 136.93663246831625, "grad_norm": 3.832308530807495, "learning_rate": 1.9997263712253307e-05, "loss": 0.0369, "step": 113520 }, { "epoch": 136.94870247435125, "grad_norm": 3.9751229286193848, "learning_rate": 1.9997263470998513e-05, "loss": 0.0382, "step": 113530 }, { "epoch": 136.96077248038625, "grad_norm": 3.708709716796875, "learning_rate": 1.999726322974372e-05, "loss": 0.0383, "step": 113540 }, { "epoch": 136.97284248642126, "grad_norm": 3.742506265640259, "learning_rate": 1.9997262988488925e-05, "loss": 0.0362, "step": 113550 }, { "epoch": 136.98491249245626, "grad_norm": 3.763831853866577, "learning_rate": 1.9997262747234128e-05, "loss": 0.038, "step": 113560 }, { "epoch": 136.99698249849126, "grad_norm": 4.160096645355225, "learning_rate": 1.9997262505979334e-05, "loss": 0.0379, "step": 113570 }, { "epoch": 137.0084490042245, "grad_norm": 3.4543275833129883, "learning_rate": 1.999726226472454e-05, "loss": 0.0274, "step": 113580 }, { "epoch": 
137.0205190102595, "grad_norm": 2.960864782333374, "learning_rate": 1.9997262023469747e-05, "loss": 0.0263, "step": 113590 }, { "epoch": 137.0325890162945, "grad_norm": 3.014665126800537, "learning_rate": 1.9997261782214953e-05, "loss": 0.025, "step": 113600 }, { "epoch": 137.0446590223295, "grad_norm": 3.4995360374450684, "learning_rate": 1.999726154096016e-05, "loss": 0.0265, "step": 113610 }, { "epoch": 137.05672902836451, "grad_norm": 3.1657874584198, "learning_rate": 1.9997261299705365e-05, "loss": 0.0278, "step": 113620 }, { "epoch": 137.06879903439952, "grad_norm": 2.973095178604126, "learning_rate": 1.999726105845057e-05, "loss": 0.0281, "step": 113630 }, { "epoch": 137.08086904043452, "grad_norm": 3.7748985290527344, "learning_rate": 1.9997260817195778e-05, "loss": 0.0281, "step": 113640 }, { "epoch": 137.09293904646952, "grad_norm": 3.7361176013946533, "learning_rate": 1.9997260575940984e-05, "loss": 0.0297, "step": 113650 }, { "epoch": 137.10500905250453, "grad_norm": 3.4916839599609375, "learning_rate": 1.999726033468619e-05, "loss": 0.0285, "step": 113660 }, { "epoch": 137.11707905853953, "grad_norm": 3.0617001056671143, "learning_rate": 1.9997260093431396e-05, "loss": 0.03, "step": 113670 }, { "epoch": 137.12914906457453, "grad_norm": 3.5456302165985107, "learning_rate": 1.9997259852176603e-05, "loss": 0.031, "step": 113680 }, { "epoch": 137.14121907060954, "grad_norm": 3.2795825004577637, "learning_rate": 1.999725961092181e-05, "loss": 0.0297, "step": 113690 }, { "epoch": 137.15328907664454, "grad_norm": 3.372800350189209, "learning_rate": 1.9997259369667015e-05, "loss": 0.0318, "step": 113700 }, { "epoch": 137.16535908267954, "grad_norm": 3.68198299407959, "learning_rate": 1.999725912841222e-05, "loss": 0.0318, "step": 113710 }, { "epoch": 137.17742908871455, "grad_norm": 3.4222800731658936, "learning_rate": 1.9997258887157427e-05, "loss": 0.0301, "step": 113720 }, { "epoch": 137.18949909474955, "grad_norm": 2.9050230979919434, "learning_rate": 
1.9997258645902634e-05, "loss": 0.0293, "step": 113730 }, { "epoch": 137.20156910078455, "grad_norm": 3.813150644302368, "learning_rate": 1.999725840464784e-05, "loss": 0.0323, "step": 113740 }, { "epoch": 137.21363910681956, "grad_norm": 3.4595530033111572, "learning_rate": 1.9997258163393046e-05, "loss": 0.0307, "step": 113750 }, { "epoch": 137.22570911285456, "grad_norm": 3.351656913757324, "learning_rate": 1.9997257922138252e-05, "loss": 0.0315, "step": 113760 }, { "epoch": 137.23777911888956, "grad_norm": 3.4554696083068848, "learning_rate": 1.999725768088346e-05, "loss": 0.0321, "step": 113770 }, { "epoch": 137.24984912492457, "grad_norm": 3.9025416374206543, "learning_rate": 1.9997257439628665e-05, "loss": 0.0341, "step": 113780 }, { "epoch": 137.26191913095957, "grad_norm": 3.675968885421753, "learning_rate": 1.999725719837387e-05, "loss": 0.0334, "step": 113790 }, { "epoch": 137.27398913699457, "grad_norm": 3.82600736618042, "learning_rate": 1.9997256957119077e-05, "loss": 0.0333, "step": 113800 }, { "epoch": 137.28605914302958, "grad_norm": 3.460942268371582, "learning_rate": 1.9997256715864283e-05, "loss": 0.0331, "step": 113810 }, { "epoch": 137.29812914906458, "grad_norm": 3.7408392429351807, "learning_rate": 1.999725647460949e-05, "loss": 0.0326, "step": 113820 }, { "epoch": 137.31019915509958, "grad_norm": 3.351940631866455, "learning_rate": 1.9997256233354696e-05, "loss": 0.0311, "step": 113830 }, { "epoch": 137.32226916113459, "grad_norm": 3.329519748687744, "learning_rate": 1.9997255992099902e-05, "loss": 0.0332, "step": 113840 }, { "epoch": 137.3343391671696, "grad_norm": 3.2030527591705322, "learning_rate": 1.9997255750845108e-05, "loss": 0.03, "step": 113850 }, { "epoch": 137.3464091732046, "grad_norm": 3.6840527057647705, "learning_rate": 1.9997255509590314e-05, "loss": 0.0339, "step": 113860 }, { "epoch": 137.3584791792396, "grad_norm": 3.2663238048553467, "learning_rate": 1.999725526833552e-05, "loss": 0.0348, "step": 113870 }, { "epoch": 
137.3705491852746, "grad_norm": 3.7018513679504395, "learning_rate": 1.9997255027080727e-05, "loss": 0.0347, "step": 113880 }, { "epoch": 137.3826191913096, "grad_norm": 3.9828953742980957, "learning_rate": 1.9997254785825933e-05, "loss": 0.0324, "step": 113890 }, { "epoch": 137.3946891973446, "grad_norm": 4.057321071624756, "learning_rate": 1.999725454457114e-05, "loss": 0.0335, "step": 113900 }, { "epoch": 137.4067592033796, "grad_norm": 3.8527095317840576, "learning_rate": 1.9997254303316346e-05, "loss": 0.032, "step": 113910 }, { "epoch": 137.4188292094146, "grad_norm": 3.810051202774048, "learning_rate": 1.9997254062061552e-05, "loss": 0.0333, "step": 113920 }, { "epoch": 137.4308992154496, "grad_norm": 3.888200283050537, "learning_rate": 1.9997253820806758e-05, "loss": 0.0353, "step": 113930 }, { "epoch": 137.44296922148462, "grad_norm": 3.6987030506134033, "learning_rate": 1.9997253579551964e-05, "loss": 0.0337, "step": 113940 }, { "epoch": 137.45503922751962, "grad_norm": 3.995087146759033, "learning_rate": 1.999725333829717e-05, "loss": 0.0342, "step": 113950 }, { "epoch": 137.46710923355462, "grad_norm": 3.8554275035858154, "learning_rate": 1.9997253097042377e-05, "loss": 0.032, "step": 113960 }, { "epoch": 137.47917923958963, "grad_norm": 3.7400879859924316, "learning_rate": 1.9997252855787583e-05, "loss": 0.0348, "step": 113970 }, { "epoch": 137.49124924562463, "grad_norm": 3.616100788116455, "learning_rate": 1.9997252614532786e-05, "loss": 0.0329, "step": 113980 }, { "epoch": 137.50331925165963, "grad_norm": 3.673367738723755, "learning_rate": 1.9997252373277992e-05, "loss": 0.0345, "step": 113990 }, { "epoch": 137.51538925769464, "grad_norm": 3.7816739082336426, "learning_rate": 1.9997252132023198e-05, "loss": 0.0383, "step": 114000 }, { "epoch": 137.51538925769464, "eval_loss": 13.351968765258789, "eval_runtime": 8.1781, "eval_samples_per_second": 85.227, "eval_steps_per_second": 10.76, "step": 114000 }, { "epoch": 137.52745926372964, "grad_norm": 
3.998991012573242, "learning_rate": 1.9997251890768404e-05, "loss": 0.035, "step": 114010 }, { "epoch": 137.53952926976464, "grad_norm": 3.7356672286987305, "learning_rate": 1.999725164951361e-05, "loss": 0.0342, "step": 114020 }, { "epoch": 137.55159927579965, "grad_norm": 4.347715377807617, "learning_rate": 1.9997251408258817e-05, "loss": 0.035, "step": 114030 }, { "epoch": 137.56366928183465, "grad_norm": 3.500563144683838, "learning_rate": 1.9997251167004023e-05, "loss": 0.0356, "step": 114040 }, { "epoch": 137.57573928786965, "grad_norm": 3.8855700492858887, "learning_rate": 1.999725092574923e-05, "loss": 0.0357, "step": 114050 }, { "epoch": 137.58780929390466, "grad_norm": 3.569581985473633, "learning_rate": 1.9997250684494435e-05, "loss": 0.0346, "step": 114060 }, { "epoch": 137.59987929993966, "grad_norm": 3.9932303428649902, "learning_rate": 1.999725044323964e-05, "loss": 0.0344, "step": 114070 }, { "epoch": 137.61194930597466, "grad_norm": 3.6715238094329834, "learning_rate": 1.9997250201984848e-05, "loss": 0.0332, "step": 114080 }, { "epoch": 137.62401931200966, "grad_norm": 3.53840970993042, "learning_rate": 1.9997249960730054e-05, "loss": 0.0338, "step": 114090 }, { "epoch": 137.63608931804467, "grad_norm": 3.5763161182403564, "learning_rate": 1.999724971947526e-05, "loss": 0.0341, "step": 114100 }, { "epoch": 137.64815932407967, "grad_norm": 3.694807767868042, "learning_rate": 1.9997249478220466e-05, "loss": 0.0359, "step": 114110 }, { "epoch": 137.66022933011467, "grad_norm": 3.664790153503418, "learning_rate": 1.9997249236965673e-05, "loss": 0.0339, "step": 114120 }, { "epoch": 137.67229933614968, "grad_norm": 3.587886333465576, "learning_rate": 1.999724899571088e-05, "loss": 0.0359, "step": 114130 }, { "epoch": 137.68436934218468, "grad_norm": 3.7357687950134277, "learning_rate": 1.9997248754456085e-05, "loss": 0.0345, "step": 114140 }, { "epoch": 137.69643934821968, "grad_norm": 3.792633056640625, "learning_rate": 1.999724851320129e-05, "loss": 
0.0352, "step": 114150 }, { "epoch": 137.7085093542547, "grad_norm": 4.2725958824157715, "learning_rate": 1.9997248271946498e-05, "loss": 0.0377, "step": 114160 }, { "epoch": 137.7205793602897, "grad_norm": 3.7101235389709473, "learning_rate": 1.9997248030691704e-05, "loss": 0.0362, "step": 114170 }, { "epoch": 137.7326493663247, "grad_norm": 3.9091968536376953, "learning_rate": 1.999724778943691e-05, "loss": 0.0342, "step": 114180 }, { "epoch": 137.7447193723597, "grad_norm": 3.843254566192627, "learning_rate": 1.9997247548182116e-05, "loss": 0.035, "step": 114190 }, { "epoch": 137.7567893783947, "grad_norm": 3.9258310794830322, "learning_rate": 1.9997247306927322e-05, "loss": 0.0358, "step": 114200 }, { "epoch": 137.7688593844297, "grad_norm": 3.971499443054199, "learning_rate": 1.999724706567253e-05, "loss": 0.036, "step": 114210 }, { "epoch": 137.7809293904647, "grad_norm": 3.75506329536438, "learning_rate": 1.9997246824417735e-05, "loss": 0.0377, "step": 114220 }, { "epoch": 137.7929993964997, "grad_norm": 3.5541462898254395, "learning_rate": 1.9997246583162938e-05, "loss": 0.036, "step": 114230 }, { "epoch": 137.8050694025347, "grad_norm": 4.337401866912842, "learning_rate": 1.9997246341908144e-05, "loss": 0.0385, "step": 114240 }, { "epoch": 137.81713940856972, "grad_norm": 3.772348165512085, "learning_rate": 1.999724610065335e-05, "loss": 0.0366, "step": 114250 }, { "epoch": 137.82920941460472, "grad_norm": 4.2627153396606445, "learning_rate": 1.9997245859398556e-05, "loss": 0.0361, "step": 114260 }, { "epoch": 137.84127942063972, "grad_norm": 3.810659408569336, "learning_rate": 1.9997245618143762e-05, "loss": 0.037, "step": 114270 }, { "epoch": 137.85334942667473, "grad_norm": 3.8949077129364014, "learning_rate": 1.999724537688897e-05, "loss": 0.0365, "step": 114280 }, { "epoch": 137.86541943270973, "grad_norm": 3.9109559059143066, "learning_rate": 1.9997245135634175e-05, "loss": 0.0363, "step": 114290 }, { "epoch": 137.87748943874473, "grad_norm": 
4.397857666015625, "learning_rate": 1.999724489437938e-05, "loss": 0.0367, "step": 114300 }, { "epoch": 137.88955944477974, "grad_norm": 3.7424123287200928, "learning_rate": 1.9997244653124587e-05, "loss": 0.0386, "step": 114310 }, { "epoch": 137.90162945081474, "grad_norm": 3.4865009784698486, "learning_rate": 1.9997244411869794e-05, "loss": 0.0354, "step": 114320 }, { "epoch": 137.91369945684974, "grad_norm": 3.7611780166625977, "learning_rate": 1.9997244170615e-05, "loss": 0.0379, "step": 114330 }, { "epoch": 137.92576946288474, "grad_norm": 3.714602470397949, "learning_rate": 1.9997243929360206e-05, "loss": 0.0363, "step": 114340 }, { "epoch": 137.93783946891975, "grad_norm": 3.6372053623199463, "learning_rate": 1.9997243688105412e-05, "loss": 0.036, "step": 114350 }, { "epoch": 137.94990947495475, "grad_norm": 3.687589645385742, "learning_rate": 1.9997243446850622e-05, "loss": 0.0366, "step": 114360 }, { "epoch": 137.96197948098975, "grad_norm": 3.7477402687072754, "learning_rate": 1.9997243205595828e-05, "loss": 0.0371, "step": 114370 }, { "epoch": 137.97404948702476, "grad_norm": 3.8046867847442627, "learning_rate": 1.9997242964341034e-05, "loss": 0.0376, "step": 114380 }, { "epoch": 137.98611949305976, "grad_norm": 4.116754531860352, "learning_rate": 1.9997242723086237e-05, "loss": 0.036, "step": 114390 }, { "epoch": 137.99818949909474, "grad_norm": 4.019743919372559, "learning_rate": 1.9997242481831443e-05, "loss": 0.0371, "step": 114400 }, { "epoch": 138.009656004828, "grad_norm": 3.2004246711730957, "learning_rate": 1.999724224057665e-05, "loss": 0.0267, "step": 114410 }, { "epoch": 138.021726010863, "grad_norm": 3.131138801574707, "learning_rate": 1.9997241999321856e-05, "loss": 0.026, "step": 114420 }, { "epoch": 138.033796016898, "grad_norm": 3.8102972507476807, "learning_rate": 1.9997241758067062e-05, "loss": 0.0304, "step": 114430 }, { "epoch": 138.045866022933, "grad_norm": 3.4004149436950684, "learning_rate": 1.9997241516812268e-05, "loss": 
0.0281, "step": 114440 }, { "epoch": 138.05793602896802, "grad_norm": 3.557236671447754, "learning_rate": 1.9997241275557474e-05, "loss": 0.0275, "step": 114450 }, { "epoch": 138.07000603500302, "grad_norm": 3.1193385124206543, "learning_rate": 1.999724103430268e-05, "loss": 0.0278, "step": 114460 }, { "epoch": 138.08207604103802, "grad_norm": 3.4005117416381836, "learning_rate": 1.9997240793047887e-05, "loss": 0.0272, "step": 114470 }, { "epoch": 138.09414604707302, "grad_norm": 3.8382322788238525, "learning_rate": 1.9997240551793093e-05, "loss": 0.0276, "step": 114480 }, { "epoch": 138.10621605310803, "grad_norm": 3.4782307147979736, "learning_rate": 1.99972403105383e-05, "loss": 0.0294, "step": 114490 }, { "epoch": 138.11828605914303, "grad_norm": 3.3099472522735596, "learning_rate": 1.9997240069283505e-05, "loss": 0.0292, "step": 114500 }, { "epoch": 138.11828605914303, "eval_loss": 13.35096263885498, "eval_runtime": 8.1717, "eval_samples_per_second": 85.294, "eval_steps_per_second": 10.769, "step": 114500 }, { "epoch": 138.13035606517803, "grad_norm": 3.03926420211792, "learning_rate": 1.999723982802871e-05, "loss": 0.0286, "step": 114510 }, { "epoch": 138.14242607121304, "grad_norm": 3.354682445526123, "learning_rate": 1.9997239586773918e-05, "loss": 0.0306, "step": 114520 }, { "epoch": 138.15449607724804, "grad_norm": 3.221061944961548, "learning_rate": 1.9997239345519124e-05, "loss": 0.0299, "step": 114530 }, { "epoch": 138.16656608328304, "grad_norm": 3.526113271713257, "learning_rate": 1.999723910426433e-05, "loss": 0.0308, "step": 114540 }, { "epoch": 138.17863608931805, "grad_norm": 4.002227306365967, "learning_rate": 1.9997238863009537e-05, "loss": 0.03, "step": 114550 }, { "epoch": 138.19070609535305, "grad_norm": 3.781034469604492, "learning_rate": 1.9997238621754743e-05, "loss": 0.0313, "step": 114560 }, { "epoch": 138.20277610138805, "grad_norm": 3.502734422683716, "learning_rate": 1.999723838049995e-05, "loss": 0.0306, "step": 114570 }, { "epoch": 
138.21484610742306, "grad_norm": 3.359524965286255, "learning_rate": 1.9997238139245155e-05, "loss": 0.032, "step": 114580 }, { "epoch": 138.22691611345806, "grad_norm": 3.683258295059204, "learning_rate": 1.999723789799036e-05, "loss": 0.0313, "step": 114590 }, { "epoch": 138.23898611949306, "grad_norm": 3.8741064071655273, "learning_rate": 1.9997237656735568e-05, "loss": 0.0313, "step": 114600 }, { "epoch": 138.25105612552807, "grad_norm": 3.627187728881836, "learning_rate": 1.9997237415480774e-05, "loss": 0.0327, "step": 114610 }, { "epoch": 138.26312613156307, "grad_norm": 3.2115018367767334, "learning_rate": 1.999723717422598e-05, "loss": 0.0316, "step": 114620 }, { "epoch": 138.27519613759807, "grad_norm": 3.408907413482666, "learning_rate": 1.9997236932971186e-05, "loss": 0.032, "step": 114630 }, { "epoch": 138.28726614363308, "grad_norm": 3.6642792224884033, "learning_rate": 1.999723669171639e-05, "loss": 0.032, "step": 114640 }, { "epoch": 138.29933614966808, "grad_norm": 3.523219108581543, "learning_rate": 1.9997236450461595e-05, "loss": 0.0317, "step": 114650 }, { "epoch": 138.31140615570308, "grad_norm": 3.702486515045166, "learning_rate": 1.99972362092068e-05, "loss": 0.0337, "step": 114660 }, { "epoch": 138.32347616173809, "grad_norm": 3.6537930965423584, "learning_rate": 1.9997235967952008e-05, "loss": 0.0308, "step": 114670 }, { "epoch": 138.3355461677731, "grad_norm": 3.339521646499634, "learning_rate": 1.9997235726697214e-05, "loss": 0.0315, "step": 114680 }, { "epoch": 138.3476161738081, "grad_norm": 3.883038282394409, "learning_rate": 1.999723548544242e-05, "loss": 0.0317, "step": 114690 }, { "epoch": 138.3596861798431, "grad_norm": 3.4638497829437256, "learning_rate": 1.9997235244187626e-05, "loss": 0.0322, "step": 114700 }, { "epoch": 138.3717561858781, "grad_norm": 3.952723503112793, "learning_rate": 1.9997235002932833e-05, "loss": 0.0327, "step": 114710 }, { "epoch": 138.3838261919131, "grad_norm": 3.4378576278686523, "learning_rate": 
1.999723476167804e-05, "loss": 0.0316, "step": 114720 }, { "epoch": 138.3958961979481, "grad_norm": 3.492349624633789, "learning_rate": 1.9997234520423245e-05, "loss": 0.0332, "step": 114730 }, { "epoch": 138.4079662039831, "grad_norm": 3.719794273376465, "learning_rate": 1.999723427916845e-05, "loss": 0.0326, "step": 114740 }, { "epoch": 138.4200362100181, "grad_norm": 3.4022655487060547, "learning_rate": 1.9997234037913657e-05, "loss": 0.0334, "step": 114750 }, { "epoch": 138.4321062160531, "grad_norm": 3.382495641708374, "learning_rate": 1.9997233796658864e-05, "loss": 0.0325, "step": 114760 }, { "epoch": 138.44417622208812, "grad_norm": 3.6495361328125, "learning_rate": 1.999723355540407e-05, "loss": 0.0342, "step": 114770 }, { "epoch": 138.45624622812312, "grad_norm": 3.9346468448638916, "learning_rate": 1.9997233314149276e-05, "loss": 0.0334, "step": 114780 }, { "epoch": 138.46831623415812, "grad_norm": 3.9404659271240234, "learning_rate": 1.9997233072894482e-05, "loss": 0.0346, "step": 114790 }, { "epoch": 138.48038624019313, "grad_norm": 3.6769676208496094, "learning_rate": 1.999723283163969e-05, "loss": 0.0334, "step": 114800 }, { "epoch": 138.49245624622813, "grad_norm": 3.5982091426849365, "learning_rate": 1.9997232590384895e-05, "loss": 0.0343, "step": 114810 }, { "epoch": 138.50452625226313, "grad_norm": 3.774543523788452, "learning_rate": 1.99972323491301e-05, "loss": 0.0354, "step": 114820 }, { "epoch": 138.51659625829814, "grad_norm": 3.31181001663208, "learning_rate": 1.9997232107875307e-05, "loss": 0.0317, "step": 114830 }, { "epoch": 138.52866626433314, "grad_norm": 3.7815322875976562, "learning_rate": 1.9997231866620513e-05, "loss": 0.0325, "step": 114840 }, { "epoch": 138.54073627036814, "grad_norm": 3.768540620803833, "learning_rate": 1.999723162536572e-05, "loss": 0.033, "step": 114850 }, { "epoch": 138.55280627640315, "grad_norm": 3.488816976547241, "learning_rate": 1.9997231384110926e-05, "loss": 0.0338, "step": 114860 }, { "epoch": 
138.56487628243815, "grad_norm": 3.5552446842193604, "learning_rate": 1.9997231142856132e-05, "loss": 0.0353, "step": 114870 }, { "epoch": 138.57694628847315, "grad_norm": 3.4001169204711914, "learning_rate": 1.9997230901601338e-05, "loss": 0.035, "step": 114880 }, { "epoch": 138.58901629450816, "grad_norm": 3.7815957069396973, "learning_rate": 1.999723066034654e-05, "loss": 0.0349, "step": 114890 }, { "epoch": 138.60108630054316, "grad_norm": 3.5837533473968506, "learning_rate": 1.999723041909175e-05, "loss": 0.0341, "step": 114900 }, { "epoch": 138.61315630657816, "grad_norm": 3.7332041263580322, "learning_rate": 1.9997230177836957e-05, "loss": 0.0356, "step": 114910 }, { "epoch": 138.62522631261317, "grad_norm": 3.531928300857544, "learning_rate": 1.9997229936582163e-05, "loss": 0.0333, "step": 114920 }, { "epoch": 138.63729631864817, "grad_norm": 3.560577630996704, "learning_rate": 1.999722969532737e-05, "loss": 0.037, "step": 114930 }, { "epoch": 138.64936632468317, "grad_norm": 3.4851934909820557, "learning_rate": 1.9997229454072576e-05, "loss": 0.0338, "step": 114940 }, { "epoch": 138.66143633071817, "grad_norm": 3.8520748615264893, "learning_rate": 1.9997229212817782e-05, "loss": 0.0348, "step": 114950 }, { "epoch": 138.67350633675318, "grad_norm": 3.590022325515747, "learning_rate": 1.9997228971562988e-05, "loss": 0.0343, "step": 114960 }, { "epoch": 138.68557634278818, "grad_norm": 3.791085958480835, "learning_rate": 1.9997228730308194e-05, "loss": 0.0371, "step": 114970 }, { "epoch": 138.69764634882318, "grad_norm": 3.661593437194824, "learning_rate": 1.99972284890534e-05, "loss": 0.036, "step": 114980 }, { "epoch": 138.7097163548582, "grad_norm": 3.9690022468566895, "learning_rate": 1.9997228247798607e-05, "loss": 0.0362, "step": 114990 }, { "epoch": 138.7217863608932, "grad_norm": 3.6119747161865234, "learning_rate": 1.9997228006543813e-05, "loss": 0.0342, "step": 115000 }, { "epoch": 138.7217863608932, "eval_loss": 13.363920211791992, "eval_runtime": 
8.1298, "eval_samples_per_second": 85.734, "eval_steps_per_second": 10.824, "step": 115000 }, { "epoch": 138.7338563669282, "grad_norm": 3.513683795928955, "learning_rate": 1.999722776528902e-05, "loss": 0.034, "step": 115010 }, { "epoch": 138.7459263729632, "grad_norm": 3.5137906074523926, "learning_rate": 1.9997227524034225e-05, "loss": 0.0366, "step": 115020 }, { "epoch": 138.7579963789982, "grad_norm": 4.076277732849121, "learning_rate": 1.999722728277943e-05, "loss": 0.0358, "step": 115030 }, { "epoch": 138.7700663850332, "grad_norm": 3.3934433460235596, "learning_rate": 1.9997227041524638e-05, "loss": 0.0364, "step": 115040 }, { "epoch": 138.7821363910682, "grad_norm": 3.7398531436920166, "learning_rate": 1.999722680026984e-05, "loss": 0.0387, "step": 115050 }, { "epoch": 138.7942063971032, "grad_norm": 3.938678026199341, "learning_rate": 1.9997226559015047e-05, "loss": 0.0377, "step": 115060 }, { "epoch": 138.8062764031382, "grad_norm": 3.5014944076538086, "learning_rate": 1.9997226317760253e-05, "loss": 0.0361, "step": 115070 }, { "epoch": 138.81834640917322, "grad_norm": 3.5871806144714355, "learning_rate": 1.999722607650546e-05, "loss": 0.0358, "step": 115080 }, { "epoch": 138.83041641520822, "grad_norm": 3.8073465824127197, "learning_rate": 1.9997225835250665e-05, "loss": 0.0357, "step": 115090 }, { "epoch": 138.84248642124322, "grad_norm": 3.6745107173919678, "learning_rate": 1.999722559399587e-05, "loss": 0.0346, "step": 115100 }, { "epoch": 138.85455642727823, "grad_norm": 3.9524433612823486, "learning_rate": 1.9997225352741078e-05, "loss": 0.0379, "step": 115110 }, { "epoch": 138.86662643331323, "grad_norm": 3.5644192695617676, "learning_rate": 1.9997225111486284e-05, "loss": 0.0362, "step": 115120 }, { "epoch": 138.87869643934823, "grad_norm": 3.4501476287841797, "learning_rate": 1.999722487023149e-05, "loss": 0.0359, "step": 115130 }, { "epoch": 138.89076644538324, "grad_norm": 4.16409158706665, "learning_rate": 1.9997224628976696e-05, "loss": 
0.0355, "step": 115140 }, { "epoch": 138.90283645141824, "grad_norm": 3.512924909591675, "learning_rate": 1.9997224387721903e-05, "loss": 0.0357, "step": 115150 }, { "epoch": 138.91490645745324, "grad_norm": 3.718045711517334, "learning_rate": 1.999722414646711e-05, "loss": 0.0347, "step": 115160 }, { "epoch": 138.92697646348824, "grad_norm": 4.092338562011719, "learning_rate": 1.9997223905212315e-05, "loss": 0.0368, "step": 115170 }, { "epoch": 138.93904646952325, "grad_norm": 3.587381601333618, "learning_rate": 1.999722366395752e-05, "loss": 0.0362, "step": 115180 }, { "epoch": 138.95111647555825, "grad_norm": 4.185486316680908, "learning_rate": 1.9997223422702728e-05, "loss": 0.0364, "step": 115190 }, { "epoch": 138.96318648159325, "grad_norm": 3.9003779888153076, "learning_rate": 1.9997223181447934e-05, "loss": 0.0394, "step": 115200 }, { "epoch": 138.97525648762826, "grad_norm": 3.726027488708496, "learning_rate": 1.999722294019314e-05, "loss": 0.0366, "step": 115210 }, { "epoch": 138.98732649366326, "grad_norm": 3.925837278366089, "learning_rate": 1.9997222698938346e-05, "loss": 0.0365, "step": 115220 }, { "epoch": 138.99939649969826, "grad_norm": 4.067320823669434, "learning_rate": 1.9997222457683552e-05, "loss": 0.0382, "step": 115230 }, { "epoch": 139.0108630054315, "grad_norm": 3.0121114253997803, "learning_rate": 1.999722221642876e-05, "loss": 0.0277, "step": 115240 }, { "epoch": 139.0229330114665, "grad_norm": 3.245173215866089, "learning_rate": 1.9997221975173965e-05, "loss": 0.0269, "step": 115250 }, { "epoch": 139.0350030175015, "grad_norm": 3.0182604789733887, "learning_rate": 1.999722173391917e-05, "loss": 0.0257, "step": 115260 }, { "epoch": 139.0470730235365, "grad_norm": 3.167612314224243, "learning_rate": 1.9997221492664377e-05, "loss": 0.0272, "step": 115270 }, { "epoch": 139.05914302957152, "grad_norm": 3.35475492477417, "learning_rate": 1.9997221251409583e-05, "loss": 0.027, "step": 115280 }, { "epoch": 139.07121303560652, "grad_norm": 
3.9636311531066895, "learning_rate": 1.999722101015479e-05, "loss": 0.028, "step": 115290 }, { "epoch": 139.08328304164152, "grad_norm": 3.6477136611938477, "learning_rate": 1.9997220768899993e-05, "loss": 0.0279, "step": 115300 }, { "epoch": 139.09535304767653, "grad_norm": 3.1573712825775146, "learning_rate": 1.99972205276452e-05, "loss": 0.027, "step": 115310 }, { "epoch": 139.10742305371153, "grad_norm": 3.029334783554077, "learning_rate": 1.9997220286390405e-05, "loss": 0.029, "step": 115320 }, { "epoch": 139.11949305974653, "grad_norm": 3.5670583248138428, "learning_rate": 1.999722004513561e-05, "loss": 0.0302, "step": 115330 }, { "epoch": 139.13156306578153, "grad_norm": 3.5089924335479736, "learning_rate": 1.9997219803880817e-05, "loss": 0.0279, "step": 115340 }, { "epoch": 139.14363307181654, "grad_norm": 3.4087071418762207, "learning_rate": 1.9997219562626024e-05, "loss": 0.0296, "step": 115350 }, { "epoch": 139.15570307785154, "grad_norm": 3.552494525909424, "learning_rate": 1.999721932137123e-05, "loss": 0.0307, "step": 115360 }, { "epoch": 139.16777308388654, "grad_norm": 3.634587049484253, "learning_rate": 1.9997219080116436e-05, "loss": 0.0314, "step": 115370 }, { "epoch": 139.17984308992155, "grad_norm": 3.055598258972168, "learning_rate": 1.9997218838861642e-05, "loss": 0.0291, "step": 115380 }, { "epoch": 139.19191309595655, "grad_norm": 3.793358325958252, "learning_rate": 1.999721859760685e-05, "loss": 0.0301, "step": 115390 }, { "epoch": 139.20398310199155, "grad_norm": 3.5512821674346924, "learning_rate": 1.9997218356352055e-05, "loss": 0.03, "step": 115400 }, { "epoch": 139.21605310802656, "grad_norm": 3.8262507915496826, "learning_rate": 1.999721811509726e-05, "loss": 0.0302, "step": 115410 }, { "epoch": 139.22812311406156, "grad_norm": 3.3117432594299316, "learning_rate": 1.9997217873842467e-05, "loss": 0.0315, "step": 115420 }, { "epoch": 139.24019312009656, "grad_norm": 3.235367774963379, "learning_rate": 1.9997217632587673e-05, "loss": 
0.0294, "step": 115430 }, { "epoch": 139.25226312613157, "grad_norm": 3.40254282951355, "learning_rate": 1.9997217391332883e-05, "loss": 0.0304, "step": 115440 }, { "epoch": 139.26433313216657, "grad_norm": 3.650904655456543, "learning_rate": 1.999721715007809e-05, "loss": 0.0322, "step": 115450 }, { "epoch": 139.27640313820157, "grad_norm": 3.429194211959839, "learning_rate": 1.9997216908823295e-05, "loss": 0.0301, "step": 115460 }, { "epoch": 139.28847314423658, "grad_norm": 3.6415765285491943, "learning_rate": 1.9997216667568498e-05, "loss": 0.0338, "step": 115470 }, { "epoch": 139.30054315027158, "grad_norm": 3.759260416030884, "learning_rate": 1.9997216426313704e-05, "loss": 0.0315, "step": 115480 }, { "epoch": 139.31261315630658, "grad_norm": 4.152318477630615, "learning_rate": 1.999721618505891e-05, "loss": 0.0335, "step": 115490 }, { "epoch": 139.32468316234159, "grad_norm": 3.134941816329956, "learning_rate": 1.9997215943804117e-05, "loss": 0.0324, "step": 115500 }, { "epoch": 139.32468316234159, "eval_loss": 13.360313415527344, "eval_runtime": 8.1269, "eval_samples_per_second": 85.764, "eval_steps_per_second": 10.828, "step": 115500 }, { "epoch": 139.3367531683766, "grad_norm": 3.2773876190185547, "learning_rate": 1.9997215702549323e-05, "loss": 0.0326, "step": 115510 }, { "epoch": 139.3488231744116, "grad_norm": 3.712172746658325, "learning_rate": 1.999721546129453e-05, "loss": 0.0326, "step": 115520 }, { "epoch": 139.3608931804466, "grad_norm": 3.632763624191284, "learning_rate": 1.9997215220039735e-05, "loss": 0.0328, "step": 115530 }, { "epoch": 139.3729631864816, "grad_norm": 3.378002882003784, "learning_rate": 1.999721497878494e-05, "loss": 0.031, "step": 115540 }, { "epoch": 139.3850331925166, "grad_norm": 3.8279619216918945, "learning_rate": 1.9997214737530148e-05, "loss": 0.0337, "step": 115550 }, { "epoch": 139.3971031985516, "grad_norm": 3.589370012283325, "learning_rate": 1.9997214496275354e-05, "loss": 0.0335, "step": 115560 }, { "epoch": 
139.4091732045866, "grad_norm": 3.448305606842041, "learning_rate": 1.999721425502056e-05, "loss": 0.0349, "step": 115570 }, { "epoch": 139.4212432106216, "grad_norm": 3.724813461303711, "learning_rate": 1.9997214013765767e-05, "loss": 0.034, "step": 115580 }, { "epoch": 139.43331321665661, "grad_norm": 3.8266193866729736, "learning_rate": 1.9997213772510973e-05, "loss": 0.0333, "step": 115590 }, { "epoch": 139.44538322269162, "grad_norm": 3.6961700916290283, "learning_rate": 1.999721353125618e-05, "loss": 0.0338, "step": 115600 }, { "epoch": 139.45745322872662, "grad_norm": 3.634012222290039, "learning_rate": 1.9997213290001385e-05, "loss": 0.0315, "step": 115610 }, { "epoch": 139.46952323476162, "grad_norm": 3.1926612854003906, "learning_rate": 1.999721304874659e-05, "loss": 0.0333, "step": 115620 }, { "epoch": 139.48159324079663, "grad_norm": 3.466188430786133, "learning_rate": 1.9997212807491798e-05, "loss": 0.0333, "step": 115630 }, { "epoch": 139.49366324683163, "grad_norm": 3.6191160678863525, "learning_rate": 1.9997212566237004e-05, "loss": 0.0324, "step": 115640 }, { "epoch": 139.50573325286663, "grad_norm": 3.6164186000823975, "learning_rate": 1.999721232498221e-05, "loss": 0.0335, "step": 115650 }, { "epoch": 139.51780325890164, "grad_norm": 3.698479652404785, "learning_rate": 1.9997212083727416e-05, "loss": 0.0352, "step": 115660 }, { "epoch": 139.52987326493664, "grad_norm": 3.384552478790283, "learning_rate": 1.9997211842472622e-05, "loss": 0.0335, "step": 115670 }, { "epoch": 139.54194327097164, "grad_norm": 3.796450138092041, "learning_rate": 1.999721160121783e-05, "loss": 0.0331, "step": 115680 }, { "epoch": 139.55401327700665, "grad_norm": 3.6064627170562744, "learning_rate": 1.9997211359963035e-05, "loss": 0.0342, "step": 115690 }, { "epoch": 139.56608328304165, "grad_norm": 3.5266013145446777, "learning_rate": 1.999721111870824e-05, "loss": 0.0333, "step": 115700 }, { "epoch": 139.57815328907665, "grad_norm": 3.572754144668579, "learning_rate": 
1.9997210877453447e-05, "loss": 0.0345, "step": 115710 }, { "epoch": 139.59022329511166, "grad_norm": 3.3677258491516113, "learning_rate": 1.999721063619865e-05, "loss": 0.0327, "step": 115720 }, { "epoch": 139.60229330114666, "grad_norm": 3.2324490547180176, "learning_rate": 1.9997210394943856e-05, "loss": 0.0347, "step": 115730 }, { "epoch": 139.61436330718166, "grad_norm": 3.757913827896118, "learning_rate": 1.9997210153689063e-05, "loss": 0.0338, "step": 115740 }, { "epoch": 139.62643331321667, "grad_norm": 3.6238129138946533, "learning_rate": 1.999720991243427e-05, "loss": 0.0334, "step": 115750 }, { "epoch": 139.63850331925167, "grad_norm": 3.733358860015869, "learning_rate": 1.9997209671179475e-05, "loss": 0.0355, "step": 115760 }, { "epoch": 139.65057332528667, "grad_norm": 3.5972704887390137, "learning_rate": 1.999720942992468e-05, "loss": 0.036, "step": 115770 }, { "epoch": 139.66264333132168, "grad_norm": 4.288649082183838, "learning_rate": 1.9997209188669887e-05, "loss": 0.0363, "step": 115780 }, { "epoch": 139.67471333735668, "grad_norm": 3.540416717529297, "learning_rate": 1.9997208947415094e-05, "loss": 0.0348, "step": 115790 }, { "epoch": 139.68678334339168, "grad_norm": 3.6746294498443604, "learning_rate": 1.99972087061603e-05, "loss": 0.0359, "step": 115800 }, { "epoch": 139.69885334942668, "grad_norm": 4.323107719421387, "learning_rate": 1.9997208464905506e-05, "loss": 0.0352, "step": 115810 }, { "epoch": 139.7109233554617, "grad_norm": 3.6919407844543457, "learning_rate": 1.9997208223650712e-05, "loss": 0.0337, "step": 115820 }, { "epoch": 139.7229933614967, "grad_norm": 3.7694783210754395, "learning_rate": 1.999720798239592e-05, "loss": 0.0343, "step": 115830 }, { "epoch": 139.7350633675317, "grad_norm": 3.615276575088501, "learning_rate": 1.9997207741141125e-05, "loss": 0.034, "step": 115840 }, { "epoch": 139.7471333735667, "grad_norm": 3.6766746044158936, "learning_rate": 1.999720749988633e-05, "loss": 0.0335, "step": 115850 }, { "epoch": 
139.7592033796017, "grad_norm": 4.398586750030518, "learning_rate": 1.9997207258631537e-05, "loss": 0.0368, "step": 115860 }, { "epoch": 139.7712733856367, "grad_norm": 3.2623450756073, "learning_rate": 1.9997207017376743e-05, "loss": 0.0359, "step": 115870 }, { "epoch": 139.7833433916717, "grad_norm": 3.810741424560547, "learning_rate": 1.999720677612195e-05, "loss": 0.0359, "step": 115880 }, { "epoch": 139.7954133977067, "grad_norm": 3.6409730911254883, "learning_rate": 1.9997206534867156e-05, "loss": 0.035, "step": 115890 }, { "epoch": 139.8074834037417, "grad_norm": 3.7700083255767822, "learning_rate": 1.9997206293612362e-05, "loss": 0.0351, "step": 115900 }, { "epoch": 139.81955340977672, "grad_norm": 3.957831621170044, "learning_rate": 1.9997206052357568e-05, "loss": 0.0369, "step": 115910 }, { "epoch": 139.83162341581172, "grad_norm": 3.7026724815368652, "learning_rate": 1.9997205811102774e-05, "loss": 0.0355, "step": 115920 }, { "epoch": 139.84369342184672, "grad_norm": 3.8033337593078613, "learning_rate": 1.999720556984798e-05, "loss": 0.0356, "step": 115930 }, { "epoch": 139.85576342788173, "grad_norm": 3.826897144317627, "learning_rate": 1.9997205328593187e-05, "loss": 0.0349, "step": 115940 }, { "epoch": 139.86783343391673, "grad_norm": 3.7991995811462402, "learning_rate": 1.9997205087338393e-05, "loss": 0.0357, "step": 115950 }, { "epoch": 139.87990343995173, "grad_norm": 3.946319103240967, "learning_rate": 1.99972048460836e-05, "loss": 0.0362, "step": 115960 }, { "epoch": 139.89197344598674, "grad_norm": 3.7842905521392822, "learning_rate": 1.9997204604828802e-05, "loss": 0.0365, "step": 115970 }, { "epoch": 139.90404345202174, "grad_norm": 3.6408908367156982, "learning_rate": 1.9997204363574012e-05, "loss": 0.0347, "step": 115980 }, { "epoch": 139.91611345805674, "grad_norm": 3.9429376125335693, "learning_rate": 1.9997204122319218e-05, "loss": 0.0373, "step": 115990 }, { "epoch": 139.92818346409175, "grad_norm": 3.8058090209960938, "learning_rate": 
1.9997203881064424e-05, "loss": 0.0369, "step": 116000 }, { "epoch": 139.92818346409175, "eval_loss": 13.382257461547852, "eval_runtime": 8.1106, "eval_samples_per_second": 85.937, "eval_steps_per_second": 10.85, "step": 116000 }, { "epoch": 139.94025347012675, "grad_norm": 3.989734411239624, "learning_rate": 1.999720363980963e-05, "loss": 0.037, "step": 116010 }, { "epoch": 139.95232347616175, "grad_norm": 3.8192641735076904, "learning_rate": 1.9997203398554837e-05, "loss": 0.0343, "step": 116020 }, { "epoch": 139.96439348219675, "grad_norm": 3.967994213104248, "learning_rate": 1.9997203157300043e-05, "loss": 0.0354, "step": 116030 }, { "epoch": 139.97646348823176, "grad_norm": 3.827237129211426, "learning_rate": 1.999720291604525e-05, "loss": 0.0383, "step": 116040 }, { "epoch": 139.98853349426676, "grad_norm": 3.714514970779419, "learning_rate": 1.9997202674790455e-05, "loss": 0.0366, "step": 116050 }, { "epoch": 140.0, "grad_norm": 5.620866298675537, "learning_rate": 1.999720243353566e-05, "loss": 0.0367, "step": 116060 }, { "epoch": 140.012070006035, "grad_norm": 3.008502960205078, "learning_rate": 1.9997202192280868e-05, "loss": 0.0252, "step": 116070 }, { "epoch": 140.02414001207, "grad_norm": 2.950106382369995, "learning_rate": 1.9997201951026074e-05, "loss": 0.0257, "step": 116080 }, { "epoch": 140.036210018105, "grad_norm": 3.086402654647827, "learning_rate": 1.999720170977128e-05, "loss": 0.025, "step": 116090 }, { "epoch": 140.04828002414, "grad_norm": 2.794818639755249, "learning_rate": 1.9997201468516486e-05, "loss": 0.0257, "step": 116100 }, { "epoch": 140.06035003017502, "grad_norm": 3.3311257362365723, "learning_rate": 1.9997201227261693e-05, "loss": 0.0263, "step": 116110 }, { "epoch": 140.07242003621002, "grad_norm": 3.7210500240325928, "learning_rate": 1.99972009860069e-05, "loss": 0.0285, "step": 116120 }, { "epoch": 140.08449004224502, "grad_norm": 3.0838706493377686, "learning_rate": 1.99972007447521e-05, "loss": 0.0274, "step": 116130 }, { 
"epoch": 140.09656004828003, "grad_norm": 3.038543462753296, "learning_rate": 1.9997200503497308e-05, "loss": 0.0275, "step": 116140 }, { "epoch": 140.10863005431503, "grad_norm": 3.58552622795105, "learning_rate": 1.9997200262242514e-05, "loss": 0.0283, "step": 116150 }, { "epoch": 140.12070006035003, "grad_norm": 3.2570550441741943, "learning_rate": 1.999720002098772e-05, "loss": 0.0288, "step": 116160 }, { "epoch": 140.13277006638504, "grad_norm": 3.4130475521087646, "learning_rate": 1.9997199779732926e-05, "loss": 0.0286, "step": 116170 }, { "epoch": 140.14484007242004, "grad_norm": 3.694221258163452, "learning_rate": 1.9997199538478133e-05, "loss": 0.0286, "step": 116180 }, { "epoch": 140.15691007845504, "grad_norm": 3.43985652923584, "learning_rate": 1.999719929722334e-05, "loss": 0.0285, "step": 116190 }, { "epoch": 140.16898008449004, "grad_norm": 3.174690008163452, "learning_rate": 1.9997199055968545e-05, "loss": 0.0307, "step": 116200 }, { "epoch": 140.18105009052505, "grad_norm": 3.996309757232666, "learning_rate": 1.999719881471375e-05, "loss": 0.03, "step": 116210 }, { "epoch": 140.19312009656005, "grad_norm": 3.3966500759124756, "learning_rate": 1.9997198573458958e-05, "loss": 0.0299, "step": 116220 }, { "epoch": 140.20519010259505, "grad_norm": 3.295992612838745, "learning_rate": 1.9997198332204164e-05, "loss": 0.0306, "step": 116230 }, { "epoch": 140.21726010863006, "grad_norm": 3.454177141189575, "learning_rate": 1.999719809094937e-05, "loss": 0.0293, "step": 116240 }, { "epoch": 140.22933011466506, "grad_norm": 3.786206007003784, "learning_rate": 1.9997197849694576e-05, "loss": 0.0304, "step": 116250 }, { "epoch": 140.24140012070006, "grad_norm": 4.2395524978637695, "learning_rate": 1.9997197608439782e-05, "loss": 0.0321, "step": 116260 }, { "epoch": 140.25347012673507, "grad_norm": 3.6414566040039062, "learning_rate": 1.999719736718499e-05, "loss": 0.0308, "step": 116270 }, { "epoch": 140.26554013277007, "grad_norm": 3.1981215476989746, 
"learning_rate": 1.9997197125930195e-05, "loss": 0.0312, "step": 116280 }, { "epoch": 140.27761013880507, "grad_norm": 3.5777745246887207, "learning_rate": 1.99971968846754e-05, "loss": 0.0315, "step": 116290 }, { "epoch": 140.28968014484008, "grad_norm": 3.304226875305176, "learning_rate": 1.9997196643420607e-05, "loss": 0.0306, "step": 116300 }, { "epoch": 140.30175015087508, "grad_norm": 3.4167280197143555, "learning_rate": 1.9997196402165813e-05, "loss": 0.0307, "step": 116310 }, { "epoch": 140.31382015691008, "grad_norm": 3.3652327060699463, "learning_rate": 1.999719616091102e-05, "loss": 0.0325, "step": 116320 }, { "epoch": 140.3258901629451, "grad_norm": 3.2801108360290527, "learning_rate": 1.9997195919656226e-05, "loss": 0.0302, "step": 116330 }, { "epoch": 140.3379601689801, "grad_norm": 3.6271893978118896, "learning_rate": 1.9997195678401432e-05, "loss": 0.0326, "step": 116340 }, { "epoch": 140.3500301750151, "grad_norm": 3.591740369796753, "learning_rate": 1.999719543714664e-05, "loss": 0.0319, "step": 116350 }, { "epoch": 140.3621001810501, "grad_norm": 3.359099864959717, "learning_rate": 1.9997195195891845e-05, "loss": 0.0307, "step": 116360 }, { "epoch": 140.3741701870851, "grad_norm": 3.9724552631378174, "learning_rate": 1.999719495463705e-05, "loss": 0.032, "step": 116370 }, { "epoch": 140.3862401931201, "grad_norm": 3.614159345626831, "learning_rate": 1.9997194713382254e-05, "loss": 0.0324, "step": 116380 }, { "epoch": 140.3983101991551, "grad_norm": 3.9166171550750732, "learning_rate": 1.999719447212746e-05, "loss": 0.0328, "step": 116390 }, { "epoch": 140.4103802051901, "grad_norm": null, "learning_rate": 1.9997194230872666e-05, "loss": 0.0326, "step": 116400 }, { "epoch": 140.4224502112251, "grad_norm": 3.4457919597625732, "learning_rate": 1.9997193989617872e-05, "loss": 0.0321, "step": 116410 }, { "epoch": 140.43452021726011, "grad_norm": 3.7838151454925537, "learning_rate": 1.999719374836308e-05, "loss": 0.0336, "step": 116420 }, { 
"epoch": 140.44659022329512, "grad_norm": 3.681593656539917, "learning_rate": 1.9997193507108285e-05, "loss": 0.0321, "step": 116430 }, { "epoch": 140.45866022933012, "grad_norm": 3.8514208793640137, "learning_rate": 1.999719326585349e-05, "loss": 0.0341, "step": 116440 }, { "epoch": 140.47073023536512, "grad_norm": 4.258256912231445, "learning_rate": 1.9997193024598697e-05, "loss": 0.0329, "step": 116450 }, { "epoch": 140.48280024140013, "grad_norm": 3.619532823562622, "learning_rate": 1.9997192783343903e-05, "loss": 0.034, "step": 116460 }, { "epoch": 140.49487024743513, "grad_norm": 3.4773452281951904, "learning_rate": 1.999719254208911e-05, "loss": 0.0366, "step": 116470 }, { "epoch": 140.50694025347013, "grad_norm": 3.595200300216675, "learning_rate": 1.9997192300834316e-05, "loss": 0.0324, "step": 116480 }, { "epoch": 140.51901025950514, "grad_norm": 3.745630979537964, "learning_rate": 1.9997192059579522e-05, "loss": 0.0337, "step": 116490 }, { "epoch": 140.53108026554014, "grad_norm": 3.485598564147949, "learning_rate": 1.9997191818324728e-05, "loss": 0.0351, "step": 116500 }, { "epoch": 140.53108026554014, "eval_loss": 13.36911392211914, "eval_runtime": 8.127, "eval_samples_per_second": 85.764, "eval_steps_per_second": 10.828, "step": 116500 } ], "logging_steps": 10, "max_steps": 828999171, "num_input_tokens_seen": 0, "num_train_epochs": 999999, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.47058433668966e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }