{ "best_global_step": 110000, "best_metric": 0.9635852287290507, "best_model_checkpoint": "./ai-small/checkpoint-110000", "epoch": 2.9999825128967386, "eval_steps": 10000, "global_step": 128664, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00023316137681793012, "grad_norm": 5.10535192489624, "learning_rate": 9.999222781819311e-07, "loss": 0.7392, "step": 10 }, { "epoch": 0.00046632275363586023, "grad_norm": 4.874694347381592, "learning_rate": 9.998445563638625e-07, "loss": 0.7346, "step": 20 }, { "epoch": 0.0006994841304537903, "grad_norm": 3.919328212738037, "learning_rate": 9.997668345457936e-07, "loss": 0.7244, "step": 30 }, { "epoch": 0.0009326455072717205, "grad_norm": 4.009303092956543, "learning_rate": 9.99689112727725e-07, "loss": 0.7167, "step": 40 }, { "epoch": 0.0011658068840896506, "grad_norm": 3.913461208343506, "learning_rate": 9.996113909096562e-07, "loss": 0.7087, "step": 50 }, { "epoch": 0.0013989682609075807, "grad_norm": 4.1730475425720215, "learning_rate": 9.995336690915873e-07, "loss": 0.7009, "step": 60 }, { "epoch": 0.0016321296377255108, "grad_norm": 3.664079189300537, "learning_rate": 9.994559472735185e-07, "loss": 0.6953, "step": 70 }, { "epoch": 0.001865291014543441, "grad_norm": 3.099745750427246, "learning_rate": 9.993782254554498e-07, "loss": 0.6854, "step": 80 }, { "epoch": 0.002098452391361371, "grad_norm": 3.394944906234741, "learning_rate": 9.99300503637381e-07, "loss": 0.6787, "step": 90 }, { "epoch": 0.002331613768179301, "grad_norm": 3.2610280513763428, "learning_rate": 9.992227818193124e-07, "loss": 0.6715, "step": 100 }, { "epoch": 0.0025647751449972313, "grad_norm": 2.7711915969848633, "learning_rate": 9.991450600012435e-07, "loss": 0.6629, "step": 110 }, { "epoch": 0.0027979365218151614, "grad_norm": 3.3216934204101562, "learning_rate": 9.990673381831749e-07, "loss": 0.656, "step": 120 }, { "epoch": 0.0030310978986330915, "grad_norm": 
2.9564640522003174, "learning_rate": 9.989896163651058e-07, "loss": 0.6421, "step": 130 }, { "epoch": 0.0032642592754510216, "grad_norm": 2.3967812061309814, "learning_rate": 9.989118945470372e-07, "loss": 0.6317, "step": 140 }, { "epoch": 0.0034974206522689517, "grad_norm": 1.5710994005203247, "learning_rate": 9.988341727289684e-07, "loss": 0.6197, "step": 150 }, { "epoch": 0.003730582029086882, "grad_norm": 0.9222308993339539, "learning_rate": 9.987564509108997e-07, "loss": 0.6026, "step": 160 }, { "epoch": 0.003963743405904812, "grad_norm": 0.6757057309150696, "learning_rate": 9.986787290928309e-07, "loss": 0.6012, "step": 170 }, { "epoch": 0.004196904782722742, "grad_norm": 0.369189977645874, "learning_rate": 9.986010072747622e-07, "loss": 0.6033, "step": 180 }, { "epoch": 0.004430066159540672, "grad_norm": 0.47496727108955383, "learning_rate": 9.985232854566934e-07, "loss": 0.59, "step": 190 }, { "epoch": 0.004663227536358602, "grad_norm": 0.6934787631034851, "learning_rate": 9.984455636386246e-07, "loss": 0.5711, "step": 200 }, { "epoch": 0.0048963889131765324, "grad_norm": 0.8506338596343994, "learning_rate": 9.983678418205557e-07, "loss": 0.5604, "step": 210 }, { "epoch": 0.0051295502899944626, "grad_norm": 1.6041849851608276, "learning_rate": 9.98290120002487e-07, "loss": 0.5352, "step": 220 }, { "epoch": 0.005362711666812393, "grad_norm": 1.4068275690078735, "learning_rate": 9.982123981844182e-07, "loss": 0.5191, "step": 230 }, { "epoch": 0.005595873043630323, "grad_norm": 1.5548619031906128, "learning_rate": 9.981346763663496e-07, "loss": 0.522, "step": 240 }, { "epoch": 0.005829034420448253, "grad_norm": 1.2768529653549194, "learning_rate": 9.980569545482808e-07, "loss": 0.4979, "step": 250 }, { "epoch": 0.006062195797266183, "grad_norm": 1.4961940050125122, "learning_rate": 9.97979232730212e-07, "loss": 0.4798, "step": 260 }, { "epoch": 0.006295357174084113, "grad_norm": 1.240755558013916, "learning_rate": 9.979015109121433e-07, "loss": 0.4763, "step": 
270 }, { "epoch": 0.006528518550902043, "grad_norm": 1.1632510423660278, "learning_rate": 9.978237890940744e-07, "loss": 0.4605, "step": 280 }, { "epoch": 0.006761679927719973, "grad_norm": 1.6182910203933716, "learning_rate": 9.977460672760056e-07, "loss": 0.4647, "step": 290 }, { "epoch": 0.0069948413045379035, "grad_norm": 1.3704261779785156, "learning_rate": 9.97668345457937e-07, "loss": 0.4515, "step": 300 }, { "epoch": 0.007228002681355834, "grad_norm": 1.4486756324768066, "learning_rate": 9.975906236398681e-07, "loss": 0.4351, "step": 310 }, { "epoch": 0.007461164058173764, "grad_norm": 1.5886913537979126, "learning_rate": 9.975129018217993e-07, "loss": 0.4243, "step": 320 }, { "epoch": 0.007694325434991694, "grad_norm": 1.7102346420288086, "learning_rate": 9.974351800037307e-07, "loss": 0.419, "step": 330 }, { "epoch": 0.007927486811809624, "grad_norm": 1.7952964305877686, "learning_rate": 9.973574581856618e-07, "loss": 0.4121, "step": 340 }, { "epoch": 0.008160648188627554, "grad_norm": 1.3879910707473755, "learning_rate": 9.972797363675932e-07, "loss": 0.3876, "step": 350 }, { "epoch": 0.008393809565445484, "grad_norm": 1.7326854467391968, "learning_rate": 9.972020145495243e-07, "loss": 0.3813, "step": 360 }, { "epoch": 0.008626970942263414, "grad_norm": 1.3452551364898682, "learning_rate": 9.971242927314555e-07, "loss": 0.3858, "step": 370 }, { "epoch": 0.008860132319081344, "grad_norm": 2.2456843852996826, "learning_rate": 9.970465709133866e-07, "loss": 0.3783, "step": 380 }, { "epoch": 0.009093293695899275, "grad_norm": 1.3006477355957031, "learning_rate": 9.96968849095318e-07, "loss": 0.378, "step": 390 }, { "epoch": 0.009326455072717205, "grad_norm": 1.4461941719055176, "learning_rate": 9.968911272772492e-07, "loss": 0.3794, "step": 400 }, { "epoch": 0.009559616449535135, "grad_norm": 1.506611704826355, "learning_rate": 9.968134054591805e-07, "loss": 0.365, "step": 410 }, { "epoch": 0.009792777826353065, "grad_norm": 1.4209531545639038, 
"learning_rate": 9.967356836411117e-07, "loss": 0.3498, "step": 420 }, { "epoch": 0.010025939203170995, "grad_norm": 2.0019609928131104, "learning_rate": 9.96657961823043e-07, "loss": 0.3587, "step": 430 }, { "epoch": 0.010259100579988925, "grad_norm": 1.3922737836837769, "learning_rate": 9.965802400049742e-07, "loss": 0.3619, "step": 440 }, { "epoch": 0.010492261956806855, "grad_norm": 1.780015230178833, "learning_rate": 9.965025181869054e-07, "loss": 0.3433, "step": 450 }, { "epoch": 0.010725423333624785, "grad_norm": 2.282528877258301, "learning_rate": 9.964247963688365e-07, "loss": 0.343, "step": 460 }, { "epoch": 0.010958584710442715, "grad_norm": 1.9319243431091309, "learning_rate": 9.96347074550768e-07, "loss": 0.3404, "step": 470 }, { "epoch": 0.011191746087260646, "grad_norm": 2.8231711387634277, "learning_rate": 9.96269352732699e-07, "loss": 0.342, "step": 480 }, { "epoch": 0.011424907464078576, "grad_norm": 2.7439684867858887, "learning_rate": 9.961916309146304e-07, "loss": 0.3228, "step": 490 }, { "epoch": 0.011658068840896506, "grad_norm": 1.216714859008789, "learning_rate": 9.961139090965616e-07, "loss": 0.3238, "step": 500 }, { "epoch": 0.011891230217714436, "grad_norm": 3.173093557357788, "learning_rate": 9.960361872784927e-07, "loss": 0.3119, "step": 510 }, { "epoch": 0.012124391594532366, "grad_norm": 1.7966827154159546, "learning_rate": 9.95958465460424e-07, "loss": 0.307, "step": 520 }, { "epoch": 0.012357552971350296, "grad_norm": 1.6286718845367432, "learning_rate": 9.958807436423553e-07, "loss": 0.3161, "step": 530 }, { "epoch": 0.012590714348168226, "grad_norm": 2.8719379901885986, "learning_rate": 9.958030218242864e-07, "loss": 0.3075, "step": 540 }, { "epoch": 0.012823875724986156, "grad_norm": 1.5743921995162964, "learning_rate": 9.957253000062178e-07, "loss": 0.3224, "step": 550 }, { "epoch": 0.013057037101804086, "grad_norm": 1.4150983095169067, "learning_rate": 9.95647578188149e-07, "loss": 0.2942, "step": 560 }, { "epoch": 
0.013290198478622017, "grad_norm": 1.7903029918670654, "learning_rate": 9.9556985637008e-07, "loss": 0.298, "step": 570 }, { "epoch": 0.013523359855439947, "grad_norm": 1.4878578186035156, "learning_rate": 9.954921345520115e-07, "loss": 0.2905, "step": 580 }, { "epoch": 0.013756521232257877, "grad_norm": 1.8139123916625977, "learning_rate": 9.954144127339426e-07, "loss": 0.2961, "step": 590 }, { "epoch": 0.013989682609075807, "grad_norm": 1.7627248764038086, "learning_rate": 9.95336690915874e-07, "loss": 0.301, "step": 600 }, { "epoch": 0.014222843985893737, "grad_norm": 2.103706121444702, "learning_rate": 9.952589690978051e-07, "loss": 0.2767, "step": 610 }, { "epoch": 0.014456005362711667, "grad_norm": 2.2071237564086914, "learning_rate": 9.951812472797363e-07, "loss": 0.2851, "step": 620 }, { "epoch": 0.014689166739529597, "grad_norm": 2.2857580184936523, "learning_rate": 9.951035254616675e-07, "loss": 0.2847, "step": 630 }, { "epoch": 0.014922328116347527, "grad_norm": 2.3305001258850098, "learning_rate": 9.950258036435988e-07, "loss": 0.2602, "step": 640 }, { "epoch": 0.015155489493165458, "grad_norm": 3.2311830520629883, "learning_rate": 9.9494808182553e-07, "loss": 0.2763, "step": 650 }, { "epoch": 0.015388650869983388, "grad_norm": 1.5490572452545166, "learning_rate": 9.948703600074614e-07, "loss": 0.2666, "step": 660 }, { "epoch": 0.015621812246801318, "grad_norm": 1.9129961729049683, "learning_rate": 9.947926381893925e-07, "loss": 0.264, "step": 670 }, { "epoch": 0.015854973623619248, "grad_norm": 1.3288627862930298, "learning_rate": 9.947149163713237e-07, "loss": 0.276, "step": 680 }, { "epoch": 0.016088135000437178, "grad_norm": 3.0769546031951904, "learning_rate": 9.946371945532548e-07, "loss": 0.2605, "step": 690 }, { "epoch": 0.016321296377255108, "grad_norm": 3.2644011974334717, "learning_rate": 9.945594727351862e-07, "loss": 0.2619, "step": 700 }, { "epoch": 0.016554457754073038, "grad_norm": 5.099483966827393, "learning_rate": 
9.944817509171173e-07, "loss": 0.2477, "step": 710 }, { "epoch": 0.01678761913089097, "grad_norm": 1.5380699634552002, "learning_rate": 9.944040290990487e-07, "loss": 0.2561, "step": 720 }, { "epoch": 0.0170207805077089, "grad_norm": 1.453488826751709, "learning_rate": 9.943263072809799e-07, "loss": 0.2611, "step": 730 }, { "epoch": 0.01725394188452683, "grad_norm": 2.216606855392456, "learning_rate": 9.94248585462911e-07, "loss": 0.2508, "step": 740 }, { "epoch": 0.01748710326134476, "grad_norm": 1.4280039072036743, "learning_rate": 9.941708636448424e-07, "loss": 0.2458, "step": 750 }, { "epoch": 0.01772026463816269, "grad_norm": 1.4460023641586304, "learning_rate": 9.940931418267735e-07, "loss": 0.2567, "step": 760 }, { "epoch": 0.01795342601498062, "grad_norm": 2.1173901557922363, "learning_rate": 9.940154200087047e-07, "loss": 0.2619, "step": 770 }, { "epoch": 0.01818658739179855, "grad_norm": 1.5530089139938354, "learning_rate": 9.93937698190636e-07, "loss": 0.2427, "step": 780 }, { "epoch": 0.01841974876861648, "grad_norm": 3.176727533340454, "learning_rate": 9.938599763725672e-07, "loss": 0.2579, "step": 790 }, { "epoch": 0.01865291014543441, "grad_norm": 1.915099859237671, "learning_rate": 9.937822545544984e-07, "loss": 0.241, "step": 800 }, { "epoch": 0.01888607152225234, "grad_norm": 1.636133074760437, "learning_rate": 9.937045327364298e-07, "loss": 0.2583, "step": 810 }, { "epoch": 0.01911923289907027, "grad_norm": 4.326306343078613, "learning_rate": 9.93626810918361e-07, "loss": 0.2443, "step": 820 }, { "epoch": 0.0193523942758882, "grad_norm": 1.35690438747406, "learning_rate": 9.935490891002923e-07, "loss": 0.2399, "step": 830 }, { "epoch": 0.01958555565270613, "grad_norm": 2.9677181243896484, "learning_rate": 9.934713672822234e-07, "loss": 0.2486, "step": 840 }, { "epoch": 0.01981871702952406, "grad_norm": 1.4149906635284424, "learning_rate": 9.933936454641546e-07, "loss": 0.2366, "step": 850 }, { "epoch": 0.02005187840634199, "grad_norm": 
3.856945753097534, "learning_rate": 9.933159236460857e-07, "loss": 0.2517, "step": 860 }, { "epoch": 0.02028503978315992, "grad_norm": 1.2802690267562866, "learning_rate": 9.932382018280171e-07, "loss": 0.2505, "step": 870 }, { "epoch": 0.02051820115997785, "grad_norm": 1.8115483522415161, "learning_rate": 9.931604800099483e-07, "loss": 0.2452, "step": 880 }, { "epoch": 0.02075136253679578, "grad_norm": 1.6087301969528198, "learning_rate": 9.930827581918796e-07, "loss": 0.2372, "step": 890 }, { "epoch": 0.02098452391361371, "grad_norm": 3.446725845336914, "learning_rate": 9.930050363738108e-07, "loss": 0.2358, "step": 900 }, { "epoch": 0.02121768529043164, "grad_norm": 1.3298193216323853, "learning_rate": 9.929273145557422e-07, "loss": 0.2314, "step": 910 }, { "epoch": 0.02145084666724957, "grad_norm": 3.1009063720703125, "learning_rate": 9.928495927376733e-07, "loss": 0.2339, "step": 920 }, { "epoch": 0.0216840080440675, "grad_norm": 2.4057812690734863, "learning_rate": 9.927718709196045e-07, "loss": 0.255, "step": 930 }, { "epoch": 0.02191716942088543, "grad_norm": 1.9438070058822632, "learning_rate": 9.926941491015356e-07, "loss": 0.2506, "step": 940 }, { "epoch": 0.02215033079770336, "grad_norm": 1.567612648010254, "learning_rate": 9.92616427283467e-07, "loss": 0.2321, "step": 950 }, { "epoch": 0.02238349217452129, "grad_norm": 1.916723370552063, "learning_rate": 9.925387054653982e-07, "loss": 0.24, "step": 960 }, { "epoch": 0.02261665355133922, "grad_norm": 3.7643046379089355, "learning_rate": 9.924609836473295e-07, "loss": 0.233, "step": 970 }, { "epoch": 0.02284981492815715, "grad_norm": 1.5458979606628418, "learning_rate": 9.923832618292607e-07, "loss": 0.2376, "step": 980 }, { "epoch": 0.02308297630497508, "grad_norm": 2.089637041091919, "learning_rate": 9.923055400111918e-07, "loss": 0.2335, "step": 990 }, { "epoch": 0.02331613768179301, "grad_norm": 1.8295021057128906, "learning_rate": 9.922278181931232e-07, "loss": 0.2284, "step": 1000 }, { "epoch": 
0.02354929905861094, "grad_norm": 2.7923829555511475, "learning_rate": 9.921500963750544e-07, "loss": 0.2385, "step": 1010 }, { "epoch": 0.023782460435428872, "grad_norm": 1.621211290359497, "learning_rate": 9.920723745569855e-07, "loss": 0.2259, "step": 1020 }, { "epoch": 0.024015621812246802, "grad_norm": 4.027182102203369, "learning_rate": 9.919946527389169e-07, "loss": 0.2417, "step": 1030 }, { "epoch": 0.024248783189064732, "grad_norm": 2.0310006141662598, "learning_rate": 9.91916930920848e-07, "loss": 0.2321, "step": 1040 }, { "epoch": 0.024481944565882662, "grad_norm": 2.158318281173706, "learning_rate": 9.918392091027792e-07, "loss": 0.2295, "step": 1050 }, { "epoch": 0.024715105942700592, "grad_norm": 5.660566806793213, "learning_rate": 9.917614872847106e-07, "loss": 0.2268, "step": 1060 }, { "epoch": 0.024948267319518522, "grad_norm": 2.3642871379852295, "learning_rate": 9.916837654666417e-07, "loss": 0.2168, "step": 1070 }, { "epoch": 0.025181428696336453, "grad_norm": 1.6536506414413452, "learning_rate": 9.91606043648573e-07, "loss": 0.2339, "step": 1080 }, { "epoch": 0.025414590073154383, "grad_norm": 2.040839433670044, "learning_rate": 9.915283218305042e-07, "loss": 0.2207, "step": 1090 }, { "epoch": 0.025647751449972313, "grad_norm": 4.313286781311035, "learning_rate": 9.914506000124354e-07, "loss": 0.2448, "step": 1100 }, { "epoch": 0.025880912826790243, "grad_norm": 1.4684804677963257, "learning_rate": 9.913728781943666e-07, "loss": 0.2413, "step": 1110 }, { "epoch": 0.026114074203608173, "grad_norm": 2.960918426513672, "learning_rate": 9.91295156376298e-07, "loss": 0.2114, "step": 1120 }, { "epoch": 0.026347235580426103, "grad_norm": 1.7561731338500977, "learning_rate": 9.91217434558229e-07, "loss": 0.2191, "step": 1130 }, { "epoch": 0.026580396957244033, "grad_norm": 4.747954368591309, "learning_rate": 9.911397127401605e-07, "loss": 0.2239, "step": 1140 }, { "epoch": 0.026813558334061963, "grad_norm": 4.183343410491943, "learning_rate": 
9.910619909220916e-07, "loss": 0.2192, "step": 1150 }, { "epoch": 0.027046719710879893, "grad_norm": 1.4372986555099487, "learning_rate": 9.90984269104023e-07, "loss": 0.2398, "step": 1160 }, { "epoch": 0.027279881087697824, "grad_norm": 2.8832857608795166, "learning_rate": 9.90906547285954e-07, "loss": 0.2132, "step": 1170 }, { "epoch": 0.027513042464515754, "grad_norm": 1.1834295988082886, "learning_rate": 9.908288254678853e-07, "loss": 0.2114, "step": 1180 }, { "epoch": 0.027746203841333684, "grad_norm": 3.0164811611175537, "learning_rate": 9.907511036498164e-07, "loss": 0.2294, "step": 1190 }, { "epoch": 0.027979365218151614, "grad_norm": 1.4244053363800049, "learning_rate": 9.906733818317478e-07, "loss": 0.2354, "step": 1200 }, { "epoch": 0.028212526594969544, "grad_norm": 2.8288819789886475, "learning_rate": 9.90595660013679e-07, "loss": 0.2318, "step": 1210 }, { "epoch": 0.028445687971787474, "grad_norm": 1.423656702041626, "learning_rate": 9.905179381956103e-07, "loss": 0.2079, "step": 1220 }, { "epoch": 0.028678849348605404, "grad_norm": 1.4150737524032593, "learning_rate": 9.904402163775415e-07, "loss": 0.2232, "step": 1230 }, { "epoch": 0.028912010725423334, "grad_norm": 3.6050586700439453, "learning_rate": 9.903624945594726e-07, "loss": 0.2223, "step": 1240 }, { "epoch": 0.029145172102241265, "grad_norm": 2.1418724060058594, "learning_rate": 9.902847727414038e-07, "loss": 0.2335, "step": 1250 }, { "epoch": 0.029378333479059195, "grad_norm": 1.8748044967651367, "learning_rate": 9.902070509233352e-07, "loss": 0.2043, "step": 1260 }, { "epoch": 0.029611494855877125, "grad_norm": 1.319466471672058, "learning_rate": 9.901293291052663e-07, "loss": 0.2008, "step": 1270 }, { "epoch": 0.029844656232695055, "grad_norm": 2.1863596439361572, "learning_rate": 9.900516072871977e-07, "loss": 0.227, "step": 1280 }, { "epoch": 0.030077817609512985, "grad_norm": 2.526853561401367, "learning_rate": 9.899738854691289e-07, "loss": 0.2105, "step": 1290 }, { "epoch": 
0.030310978986330915, "grad_norm": 2.190265417098999, "learning_rate": 9.8989616365106e-07, "loss": 0.2047, "step": 1300 }, { "epoch": 0.030544140363148845, "grad_norm": 1.2716585397720337, "learning_rate": 9.898184418329914e-07, "loss": 0.2232, "step": 1310 }, { "epoch": 0.030777301739966775, "grad_norm": 1.4289475679397583, "learning_rate": 9.897407200149225e-07, "loss": 0.2089, "step": 1320 }, { "epoch": 0.031010463116784705, "grad_norm": 2.999389171600342, "learning_rate": 9.896629981968537e-07, "loss": 0.1908, "step": 1330 }, { "epoch": 0.031243624493602636, "grad_norm": 4.698498725891113, "learning_rate": 9.89585276378785e-07, "loss": 0.2169, "step": 1340 }, { "epoch": 0.03147678587042056, "grad_norm": 1.4984439611434937, "learning_rate": 9.895075545607162e-07, "loss": 0.2137, "step": 1350 }, { "epoch": 0.031709947247238496, "grad_norm": 1.3296476602554321, "learning_rate": 9.894298327426474e-07, "loss": 0.2278, "step": 1360 }, { "epoch": 0.03194310862405642, "grad_norm": 1.5871483087539673, "learning_rate": 9.893521109245787e-07, "loss": 0.2351, "step": 1370 }, { "epoch": 0.032176270000874356, "grad_norm": 2.001645803451538, "learning_rate": 9.8927438910651e-07, "loss": 0.2057, "step": 1380 }, { "epoch": 0.03240943137769228, "grad_norm": 4.751797676086426, "learning_rate": 9.891966672884413e-07, "loss": 0.2143, "step": 1390 }, { "epoch": 0.032642592754510216, "grad_norm": 2.5335755348205566, "learning_rate": 9.891189454703724e-07, "loss": 0.2012, "step": 1400 }, { "epoch": 0.03287575413132814, "grad_norm": 4.260024070739746, "learning_rate": 9.890412236523036e-07, "loss": 0.2205, "step": 1410 }, { "epoch": 0.033108915508146076, "grad_norm": 3.1619794368743896, "learning_rate": 9.889635018342347e-07, "loss": 0.2191, "step": 1420 }, { "epoch": 0.033342076884964, "grad_norm": 2.4015815258026123, "learning_rate": 9.88885780016166e-07, "loss": 0.2098, "step": 1430 }, { "epoch": 0.03357523826178194, "grad_norm": 1.2730485200881958, "learning_rate": 
9.888080581980973e-07, "loss": 0.2196, "step": 1440 }, { "epoch": 0.03380839963859986, "grad_norm": 1.3001588582992554, "learning_rate": 9.887303363800286e-07, "loss": 0.2208, "step": 1450 }, { "epoch": 0.0340415610154178, "grad_norm": 1.5651320219039917, "learning_rate": 9.886526145619598e-07, "loss": 0.2166, "step": 1460 }, { "epoch": 0.034274722392235724, "grad_norm": 2.903820753097534, "learning_rate": 9.885748927438911e-07, "loss": 0.2137, "step": 1470 }, { "epoch": 0.03450788376905366, "grad_norm": 3.769031047821045, "learning_rate": 9.884971709258223e-07, "loss": 0.2245, "step": 1480 }, { "epoch": 0.034741045145871584, "grad_norm": 2.634605646133423, "learning_rate": 9.884194491077535e-07, "loss": 0.2073, "step": 1490 }, { "epoch": 0.03497420652268952, "grad_norm": 3.6885557174682617, "learning_rate": 9.883417272896846e-07, "loss": 0.21, "step": 1500 }, { "epoch": 0.035207367899507444, "grad_norm": 1.5395257472991943, "learning_rate": 9.88264005471616e-07, "loss": 0.2116, "step": 1510 }, { "epoch": 0.03544052927632538, "grad_norm": 3.745643138885498, "learning_rate": 9.881862836535471e-07, "loss": 0.2082, "step": 1520 }, { "epoch": 0.035673690653143304, "grad_norm": 1.7398098707199097, "learning_rate": 9.881085618354785e-07, "loss": 0.2222, "step": 1530 }, { "epoch": 0.03590685202996124, "grad_norm": 1.6162129640579224, "learning_rate": 9.880308400174097e-07, "loss": 0.2276, "step": 1540 }, { "epoch": 0.036140013406779165, "grad_norm": 2.05202317237854, "learning_rate": 9.879531181993408e-07, "loss": 0.1988, "step": 1550 }, { "epoch": 0.0363731747835971, "grad_norm": 1.6686854362487793, "learning_rate": 9.878753963812722e-07, "loss": 0.2209, "step": 1560 }, { "epoch": 0.036606336160415025, "grad_norm": 2.7891643047332764, "learning_rate": 9.877976745632033e-07, "loss": 0.1994, "step": 1570 }, { "epoch": 0.03683949753723296, "grad_norm": 1.707245111465454, "learning_rate": 9.877199527451345e-07, "loss": 0.2187, "step": 1580 }, { "epoch": 0.037072658914050885, 
"grad_norm": 1.6504400968551636, "learning_rate": 9.876422309270659e-07, "loss": 0.2047, "step": 1590 }, { "epoch": 0.03730582029086882, "grad_norm": 1.4131348133087158, "learning_rate": 9.87564509108997e-07, "loss": 0.2089, "step": 1600 }, { "epoch": 0.037538981667686745, "grad_norm": 2.006941795349121, "learning_rate": 9.874867872909282e-07, "loss": 0.2086, "step": 1610 }, { "epoch": 0.03777214304450468, "grad_norm": 1.417238473892212, "learning_rate": 9.874090654728596e-07, "loss": 0.2037, "step": 1620 }, { "epoch": 0.038005304421322605, "grad_norm": 2.175795793533325, "learning_rate": 9.873313436547907e-07, "loss": 0.2054, "step": 1630 }, { "epoch": 0.03823846579814054, "grad_norm": 3.312040090560913, "learning_rate": 9.87253621836722e-07, "loss": 0.2013, "step": 1640 }, { "epoch": 0.038471627174958466, "grad_norm": 3.0786514282226562, "learning_rate": 9.871759000186532e-07, "loss": 0.1978, "step": 1650 }, { "epoch": 0.0387047885517764, "grad_norm": 2.767185688018799, "learning_rate": 9.870981782005844e-07, "loss": 0.2159, "step": 1660 }, { "epoch": 0.038937949928594326, "grad_norm": 1.4371248483657837, "learning_rate": 9.870204563825155e-07, "loss": 0.2035, "step": 1670 }, { "epoch": 0.03917111130541226, "grad_norm": 6.7961249351501465, "learning_rate": 9.86942734564447e-07, "loss": 0.2138, "step": 1680 }, { "epoch": 0.039404272682230186, "grad_norm": 1.4656274318695068, "learning_rate": 9.86865012746378e-07, "loss": 0.1988, "step": 1690 }, { "epoch": 0.03963743405904812, "grad_norm": 2.59407114982605, "learning_rate": 9.867872909283094e-07, "loss": 0.206, "step": 1700 }, { "epoch": 0.039870595435866046, "grad_norm": 4.849668979644775, "learning_rate": 9.867095691102406e-07, "loss": 0.1925, "step": 1710 }, { "epoch": 0.04010375681268398, "grad_norm": 1.4182133674621582, "learning_rate": 9.86631847292172e-07, "loss": 0.1866, "step": 1720 }, { "epoch": 0.04033691818950191, "grad_norm": 1.6214039325714111, "learning_rate": 9.86554125474103e-07, "loss": 0.1904, 
"step": 1730 }, { "epoch": 0.04057007956631984, "grad_norm": 1.3843207359313965, "learning_rate": 9.864764036560343e-07, "loss": 0.2322, "step": 1740 }, { "epoch": 0.04080324094313777, "grad_norm": 2.815669536590576, "learning_rate": 9.863986818379654e-07, "loss": 0.2041, "step": 1750 }, { "epoch": 0.0410364023199557, "grad_norm": 4.083254337310791, "learning_rate": 9.863209600198968e-07, "loss": 0.2058, "step": 1760 }, { "epoch": 0.04126956369677363, "grad_norm": 4.165858268737793, "learning_rate": 9.86243238201828e-07, "loss": 0.2117, "step": 1770 }, { "epoch": 0.04150272507359156, "grad_norm": 3.461043357849121, "learning_rate": 9.861655163837593e-07, "loss": 0.2258, "step": 1780 }, { "epoch": 0.04173588645040949, "grad_norm": 1.4096788167953491, "learning_rate": 9.860877945656905e-07, "loss": 0.2124, "step": 1790 }, { "epoch": 0.04196904782722742, "grad_norm": 1.8624194860458374, "learning_rate": 9.860100727476216e-07, "loss": 0.2009, "step": 1800 }, { "epoch": 0.04220220920404535, "grad_norm": 4.535089492797852, "learning_rate": 9.859323509295528e-07, "loss": 0.211, "step": 1810 }, { "epoch": 0.04243537058086328, "grad_norm": 2.6979804039001465, "learning_rate": 9.858546291114842e-07, "loss": 0.2141, "step": 1820 }, { "epoch": 0.04266853195768121, "grad_norm": 1.8659700155258179, "learning_rate": 9.857769072934153e-07, "loss": 0.2043, "step": 1830 }, { "epoch": 0.04290169333449914, "grad_norm": 2.6429076194763184, "learning_rate": 9.856991854753467e-07, "loss": 0.2001, "step": 1840 }, { "epoch": 0.04313485471131707, "grad_norm": 2.518728494644165, "learning_rate": 9.856214636572778e-07, "loss": 0.1975, "step": 1850 }, { "epoch": 0.043368016088135, "grad_norm": 3.5006818771362305, "learning_rate": 9.85543741839209e-07, "loss": 0.196, "step": 1860 }, { "epoch": 0.04360117746495293, "grad_norm": 1.5021830797195435, "learning_rate": 9.854660200211404e-07, "loss": 0.1926, "step": 1870 }, { "epoch": 0.04383433884177086, "grad_norm": 2.034532070159912, 
"learning_rate": 9.853882982030715e-07, "loss": 0.1764, "step": 1880 }, { "epoch": 0.04406750021858879, "grad_norm": 1.7590800523757935, "learning_rate": 9.853105763850027e-07, "loss": 0.2055, "step": 1890 }, { "epoch": 0.04430066159540672, "grad_norm": 2.4988481998443604, "learning_rate": 9.85232854566934e-07, "loss": 0.2035, "step": 1900 }, { "epoch": 0.04453382297222465, "grad_norm": 2.570770740509033, "learning_rate": 9.851551327488652e-07, "loss": 0.2055, "step": 1910 }, { "epoch": 0.04476698434904258, "grad_norm": 1.9855132102966309, "learning_rate": 9.850774109307964e-07, "loss": 0.2049, "step": 1920 }, { "epoch": 0.04500014572586051, "grad_norm": 1.2769640684127808, "learning_rate": 9.849996891127277e-07, "loss": 0.2028, "step": 1930 }, { "epoch": 0.04523330710267844, "grad_norm": 3.8392293453216553, "learning_rate": 9.849219672946589e-07, "loss": 0.1836, "step": 1940 }, { "epoch": 0.04546646847949637, "grad_norm": 2.986537218093872, "learning_rate": 9.848442454765903e-07, "loss": 0.2088, "step": 1950 }, { "epoch": 0.0456996298563143, "grad_norm": 1.5160176753997803, "learning_rate": 9.847665236585214e-07, "loss": 0.195, "step": 1960 }, { "epoch": 0.04593279123313223, "grad_norm": 1.447279453277588, "learning_rate": 9.846888018404526e-07, "loss": 0.2094, "step": 1970 }, { "epoch": 0.04616595260995016, "grad_norm": 1.3170820474624634, "learning_rate": 9.846110800223837e-07, "loss": 0.1959, "step": 1980 }, { "epoch": 0.04639911398676809, "grad_norm": 4.8373589515686035, "learning_rate": 9.84533358204315e-07, "loss": 0.1881, "step": 1990 }, { "epoch": 0.04663227536358602, "grad_norm": 7.029094219207764, "learning_rate": 9.844556363862462e-07, "loss": 0.1949, "step": 2000 }, { "epoch": 0.04686543674040395, "grad_norm": 2.6608362197875977, "learning_rate": 9.843779145681776e-07, "loss": 0.1954, "step": 2010 }, { "epoch": 0.04709859811722188, "grad_norm": 1.3125410079956055, "learning_rate": 9.843001927501088e-07, "loss": 0.2064, "step": 2020 }, { "epoch": 
0.04733175949403981, "grad_norm": 1.6051887273788452, "learning_rate": 9.8422247093204e-07, "loss": 0.2029, "step": 2030 }, { "epoch": 0.047564920870857744, "grad_norm": 1.3579037189483643, "learning_rate": 9.841447491139713e-07, "loss": 0.1737, "step": 2040 }, { "epoch": 0.04779808224767567, "grad_norm": 2.2061381340026855, "learning_rate": 9.840670272959024e-07, "loss": 0.1886, "step": 2050 }, { "epoch": 0.048031243624493604, "grad_norm": 2.3070857524871826, "learning_rate": 9.839893054778336e-07, "loss": 0.2086, "step": 2060 }, { "epoch": 0.04826440500131153, "grad_norm": 1.5634127855300903, "learning_rate": 9.83911583659765e-07, "loss": 0.1919, "step": 2070 }, { "epoch": 0.048497566378129464, "grad_norm": 1.4531811475753784, "learning_rate": 9.838338618416961e-07, "loss": 0.2091, "step": 2080 }, { "epoch": 0.04873072775494739, "grad_norm": 1.4067283868789673, "learning_rate": 9.837561400236273e-07, "loss": 0.1967, "step": 2090 }, { "epoch": 0.048963889131765324, "grad_norm": 1.6479768753051758, "learning_rate": 9.836784182055587e-07, "loss": 0.2033, "step": 2100 }, { "epoch": 0.04919705050858325, "grad_norm": 1.3103773593902588, "learning_rate": 9.836006963874898e-07, "loss": 0.1998, "step": 2110 }, { "epoch": 0.049430211885401185, "grad_norm": 1.473359227180481, "learning_rate": 9.835229745694212e-07, "loss": 0.182, "step": 2120 }, { "epoch": 0.04966337326221911, "grad_norm": 3.096757411956787, "learning_rate": 9.834452527513523e-07, "loss": 0.2202, "step": 2130 }, { "epoch": 0.049896534639037045, "grad_norm": 1.5135953426361084, "learning_rate": 9.833675309332835e-07, "loss": 0.1889, "step": 2140 }, { "epoch": 0.05012969601585497, "grad_norm": 1.435315489768982, "learning_rate": 9.832898091152146e-07, "loss": 0.1943, "step": 2150 }, { "epoch": 0.050362857392672905, "grad_norm": 1.3945114612579346, "learning_rate": 9.83212087297146e-07, "loss": 0.2057, "step": 2160 }, { "epoch": 0.05059601876949083, "grad_norm": 1.738709568977356, "learning_rate": 
9.831343654790772e-07, "loss": 0.1845, "step": 2170 }, { "epoch": 0.050829180146308765, "grad_norm": 5.544673919677734, "learning_rate": 9.830566436610085e-07, "loss": 0.1919, "step": 2180 }, { "epoch": 0.05106234152312669, "grad_norm": 1.5932238101959229, "learning_rate": 9.829789218429397e-07, "loss": 0.1886, "step": 2190 }, { "epoch": 0.051295502899944626, "grad_norm": 1.561086893081665, "learning_rate": 9.82901200024871e-07, "loss": 0.1943, "step": 2200 }, { "epoch": 0.05152866427676255, "grad_norm": 3.115044116973877, "learning_rate": 9.828234782068022e-07, "loss": 0.1934, "step": 2210 }, { "epoch": 0.051761825653580486, "grad_norm": 2.3300933837890625, "learning_rate": 9.827457563887334e-07, "loss": 0.1887, "step": 2220 }, { "epoch": 0.05199498703039841, "grad_norm": 1.304931402206421, "learning_rate": 9.826680345706645e-07, "loss": 0.18, "step": 2230 }, { "epoch": 0.052228148407216346, "grad_norm": 1.4292092323303223, "learning_rate": 9.82590312752596e-07, "loss": 0.2037, "step": 2240 }, { "epoch": 0.05246130978403427, "grad_norm": 1.6319721937179565, "learning_rate": 9.82512590934527e-07, "loss": 0.1815, "step": 2250 }, { "epoch": 0.052694471160852206, "grad_norm": 4.025607585906982, "learning_rate": 9.824348691164584e-07, "loss": 0.2098, "step": 2260 }, { "epoch": 0.05292763253767013, "grad_norm": 1.7344192266464233, "learning_rate": 9.823571472983896e-07, "loss": 0.195, "step": 2270 }, { "epoch": 0.053160793914488066, "grad_norm": 2.4242475032806396, "learning_rate": 9.822794254803207e-07, "loss": 0.186, "step": 2280 }, { "epoch": 0.05339395529130599, "grad_norm": 1.8629602193832397, "learning_rate": 9.822017036622519e-07, "loss": 0.2046, "step": 2290 }, { "epoch": 0.05362711666812393, "grad_norm": 3.553941249847412, "learning_rate": 9.821239818441833e-07, "loss": 0.1936, "step": 2300 }, { "epoch": 0.05386027804494185, "grad_norm": 2.838981866836548, "learning_rate": 9.820462600261144e-07, "loss": 0.178, "step": 2310 }, { "epoch": 0.05409343942175979, 
"grad_norm": 1.3706461191177368, "learning_rate": 9.819685382080458e-07, "loss": 0.1873, "step": 2320 }, { "epoch": 0.054326600798577714, "grad_norm": 2.752614736557007, "learning_rate": 9.81890816389977e-07, "loss": 0.1981, "step": 2330 }, { "epoch": 0.05455976217539565, "grad_norm": 5.957196235656738, "learning_rate": 9.81813094571908e-07, "loss": 0.1884, "step": 2340 }, { "epoch": 0.054792923552213574, "grad_norm": 1.6775951385498047, "learning_rate": 9.817353727538395e-07, "loss": 0.2098, "step": 2350 }, { "epoch": 0.05502608492903151, "grad_norm": 4.556507587432861, "learning_rate": 9.816576509357706e-07, "loss": 0.1942, "step": 2360 }, { "epoch": 0.055259246305849434, "grad_norm": 1.3593047857284546, "learning_rate": 9.815799291177018e-07, "loss": 0.1913, "step": 2370 }, { "epoch": 0.05549240768266737, "grad_norm": 2.7512853145599365, "learning_rate": 9.815022072996331e-07, "loss": 0.1886, "step": 2380 }, { "epoch": 0.055725569059485294, "grad_norm": 1.3762726783752441, "learning_rate": 9.814244854815643e-07, "loss": 0.1958, "step": 2390 }, { "epoch": 0.05595873043630323, "grad_norm": 1.432845115661621, "learning_rate": 9.813467636634955e-07, "loss": 0.1818, "step": 2400 }, { "epoch": 0.056191891813121154, "grad_norm": 1.8211219310760498, "learning_rate": 9.812690418454268e-07, "loss": 0.1845, "step": 2410 }, { "epoch": 0.05642505318993909, "grad_norm": 1.4484772682189941, "learning_rate": 9.81191320027358e-07, "loss": 0.208, "step": 2420 }, { "epoch": 0.056658214566757015, "grad_norm": 1.7233058214187622, "learning_rate": 9.811135982092894e-07, "loss": 0.1954, "step": 2430 }, { "epoch": 0.05689137594357495, "grad_norm": 1.7146556377410889, "learning_rate": 9.810358763912205e-07, "loss": 0.182, "step": 2440 }, { "epoch": 0.057124537320392875, "grad_norm": 1.2363665103912354, "learning_rate": 9.809581545731519e-07, "loss": 0.1958, "step": 2450 }, { "epoch": 0.05735769869721081, "grad_norm": 2.410062789916992, "learning_rate": 9.808804327550828e-07, "loss": 
0.1872, "step": 2460 }, { "epoch": 0.057590860074028735, "grad_norm": 1.404356837272644, "learning_rate": 9.808027109370142e-07, "loss": 0.1906, "step": 2470 }, { "epoch": 0.05782402145084667, "grad_norm": 3.0583159923553467, "learning_rate": 9.807249891189453e-07, "loss": 0.1797, "step": 2480 }, { "epoch": 0.058057182827664595, "grad_norm": 2.713759422302246, "learning_rate": 9.806472673008767e-07, "loss": 0.1806, "step": 2490 }, { "epoch": 0.05829034420448253, "grad_norm": 3.398791790008545, "learning_rate": 9.805695454828079e-07, "loss": 0.1947, "step": 2500 }, { "epoch": 0.058523505581300456, "grad_norm": 1.3404293060302734, "learning_rate": 9.804918236647392e-07, "loss": 0.2153, "step": 2510 }, { "epoch": 0.05875666695811839, "grad_norm": 2.403160572052002, "learning_rate": 9.804141018466704e-07, "loss": 0.1985, "step": 2520 }, { "epoch": 0.058989828334936316, "grad_norm": 1.7761693000793457, "learning_rate": 9.803363800286015e-07, "loss": 0.1925, "step": 2530 }, { "epoch": 0.05922298971175425, "grad_norm": 1.5328673124313354, "learning_rate": 9.802586582105327e-07, "loss": 0.1943, "step": 2540 }, { "epoch": 0.059456151088572176, "grad_norm": 2.633974313735962, "learning_rate": 9.80180936392464e-07, "loss": 0.1785, "step": 2550 }, { "epoch": 0.05968931246539011, "grad_norm": 1.9748424291610718, "learning_rate": 9.801032145743952e-07, "loss": 0.1832, "step": 2560 }, { "epoch": 0.059922473842208036, "grad_norm": 3.5642213821411133, "learning_rate": 9.800254927563266e-07, "loss": 0.191, "step": 2570 }, { "epoch": 0.06015563521902597, "grad_norm": 2.678764581680298, "learning_rate": 9.799477709382578e-07, "loss": 0.1807, "step": 2580 }, { "epoch": 0.0603887965958439, "grad_norm": 1.5210490226745605, "learning_rate": 9.79870049120189e-07, "loss": 0.2059, "step": 2590 }, { "epoch": 0.06062195797266183, "grad_norm": 1.2852510213851929, "learning_rate": 9.797923273021203e-07, "loss": 0.1732, "step": 2600 }, { "epoch": 0.06085511934947976, "grad_norm": 
3.0054008960723877, "learning_rate": 9.797146054840514e-07, "loss": 0.2044, "step": 2610 }, { "epoch": 0.06108828072629769, "grad_norm": 2.013793706893921, "learning_rate": 9.796368836659826e-07, "loss": 0.1877, "step": 2620 }, { "epoch": 0.06132144210311562, "grad_norm": 4.546442031860352, "learning_rate": 9.79559161847914e-07, "loss": 0.1882, "step": 2630 }, { "epoch": 0.06155460347993355, "grad_norm": 3.546489953994751, "learning_rate": 9.794814400298451e-07, "loss": 0.189, "step": 2640 }, { "epoch": 0.06178776485675148, "grad_norm": 2.1100404262542725, "learning_rate": 9.794037182117763e-07, "loss": 0.1982, "step": 2650 }, { "epoch": 0.06202092623356941, "grad_norm": 1.5123347043991089, "learning_rate": 9.793259963937076e-07, "loss": 0.1746, "step": 2660 }, { "epoch": 0.06225408761038734, "grad_norm": 3.6503889560699463, "learning_rate": 9.792482745756388e-07, "loss": 0.1838, "step": 2670 }, { "epoch": 0.06248724898720527, "grad_norm": 1.3324217796325684, "learning_rate": 9.791705527575702e-07, "loss": 0.182, "step": 2680 }, { "epoch": 0.0627204103640232, "grad_norm": 2.3224844932556152, "learning_rate": 9.790928309395013e-07, "loss": 0.202, "step": 2690 }, { "epoch": 0.06295357174084112, "grad_norm": 1.532859444618225, "learning_rate": 9.790151091214325e-07, "loss": 0.1856, "step": 2700 }, { "epoch": 0.06318673311765906, "grad_norm": 1.8759773969650269, "learning_rate": 9.789373873033636e-07, "loss": 0.1876, "step": 2710 }, { "epoch": 0.06341989449447699, "grad_norm": 1.4145463705062866, "learning_rate": 9.78859665485295e-07, "loss": 0.1847, "step": 2720 }, { "epoch": 0.06365305587129493, "grad_norm": 2.3919193744659424, "learning_rate": 9.787819436672262e-07, "loss": 0.1727, "step": 2730 }, { "epoch": 0.06388621724811284, "grad_norm": 3.513216257095337, "learning_rate": 9.787042218491575e-07, "loss": 0.17, "step": 2740 }, { "epoch": 0.06411937862493078, "grad_norm": 1.4704927206039429, "learning_rate": 9.786265000310887e-07, "loss": 0.1779, "step": 2750 }, { 
"epoch": 0.06435254000174871, "grad_norm": 1.752246379852295, "learning_rate": 9.7854877821302e-07, "loss": 0.1815, "step": 2760 }, { "epoch": 0.06458570137856665, "grad_norm": 1.7959797382354736, "learning_rate": 9.784710563949512e-07, "loss": 0.1986, "step": 2770 }, { "epoch": 0.06481886275538457, "grad_norm": 1.3923646211624146, "learning_rate": 9.783933345768824e-07, "loss": 0.171, "step": 2780 }, { "epoch": 0.0650520241322025, "grad_norm": 1.4566631317138672, "learning_rate": 9.783156127588135e-07, "loss": 0.1747, "step": 2790 }, { "epoch": 0.06528518550902043, "grad_norm": 3.4202725887298584, "learning_rate": 9.782378909407449e-07, "loss": 0.1884, "step": 2800 }, { "epoch": 0.06551834688583837, "grad_norm": 1.4625532627105713, "learning_rate": 9.78160169122676e-07, "loss": 0.1914, "step": 2810 }, { "epoch": 0.06575150826265629, "grad_norm": 1.8602243661880493, "learning_rate": 9.780824473046074e-07, "loss": 0.1856, "step": 2820 }, { "epoch": 0.06598466963947422, "grad_norm": 2.5006308555603027, "learning_rate": 9.780047254865386e-07, "loss": 0.1971, "step": 2830 }, { "epoch": 0.06621783101629215, "grad_norm": 1.5653735399246216, "learning_rate": 9.779270036684697e-07, "loss": 0.1943, "step": 2840 }, { "epoch": 0.06645099239311009, "grad_norm": 2.475044012069702, "learning_rate": 9.77849281850401e-07, "loss": 0.1755, "step": 2850 }, { "epoch": 0.066684153769928, "grad_norm": 1.4798297882080078, "learning_rate": 9.777715600323322e-07, "loss": 0.1817, "step": 2860 }, { "epoch": 0.06691731514674594, "grad_norm": 1.681902289390564, "learning_rate": 9.776938382142634e-07, "loss": 0.1804, "step": 2870 }, { "epoch": 0.06715047652356387, "grad_norm": 3.8524980545043945, "learning_rate": 9.776161163961948e-07, "loss": 0.1932, "step": 2880 }, { "epoch": 0.06738363790038181, "grad_norm": 1.6351650953292847, "learning_rate": 9.77538394578126e-07, "loss": 0.1711, "step": 2890 }, { "epoch": 0.06761679927719973, "grad_norm": 1.7624801397323608, "learning_rate": 
9.77460672760057e-07, "loss": 0.1726, "step": 2900 }, { "epoch": 0.06784996065401766, "grad_norm": 1.6049234867095947, "learning_rate": 9.773829509419885e-07, "loss": 0.1935, "step": 2910 }, { "epoch": 0.0680831220308356, "grad_norm": 2.113098382949829, "learning_rate": 9.773052291239196e-07, "loss": 0.1862, "step": 2920 }, { "epoch": 0.06831628340765353, "grad_norm": 2.154775381088257, "learning_rate": 9.77227507305851e-07, "loss": 0.1897, "step": 2930 }, { "epoch": 0.06854944478447145, "grad_norm": 1.6939514875411987, "learning_rate": 9.771497854877821e-07, "loss": 0.1773, "step": 2940 }, { "epoch": 0.06878260616128938, "grad_norm": 3.2161378860473633, "learning_rate": 9.770720636697133e-07, "loss": 0.1892, "step": 2950 }, { "epoch": 0.06901576753810731, "grad_norm": 2.068840980529785, "learning_rate": 9.769943418516444e-07, "loss": 0.1876, "step": 2960 }, { "epoch": 0.06924892891492525, "grad_norm": 1.9604358673095703, "learning_rate": 9.769166200335758e-07, "loss": 0.1802, "step": 2970 }, { "epoch": 0.06948209029174317, "grad_norm": 1.4935749769210815, "learning_rate": 9.76838898215507e-07, "loss": 0.193, "step": 2980 }, { "epoch": 0.0697152516685611, "grad_norm": 2.2869293689727783, "learning_rate": 9.767611763974383e-07, "loss": 0.1815, "step": 2990 }, { "epoch": 0.06994841304537903, "grad_norm": 3.3016769886016846, "learning_rate": 9.766834545793695e-07, "loss": 0.1817, "step": 3000 }, { "epoch": 0.07018157442219697, "grad_norm": 1.3105617761611938, "learning_rate": 9.766057327613009e-07, "loss": 0.1785, "step": 3010 }, { "epoch": 0.07041473579901489, "grad_norm": 1.4877169132232666, "learning_rate": 9.765280109432318e-07, "loss": 0.1807, "step": 3020 }, { "epoch": 0.07064789717583282, "grad_norm": 1.5739063024520874, "learning_rate": 9.764502891251632e-07, "loss": 0.174, "step": 3030 }, { "epoch": 0.07088105855265076, "grad_norm": 3.618558406829834, "learning_rate": 9.763725673070943e-07, "loss": 0.1767, "step": 3040 }, { "epoch": 0.07111421992946869, 
"grad_norm": 1.7505218982696533, "learning_rate": 9.762948454890257e-07, "loss": 0.1887, "step": 3050 }, { "epoch": 0.07134738130628661, "grad_norm": 1.2975027561187744, "learning_rate": 9.762171236709569e-07, "loss": 0.1888, "step": 3060 }, { "epoch": 0.07158054268310454, "grad_norm": 3.4462947845458984, "learning_rate": 9.761394018528882e-07, "loss": 0.1664, "step": 3070 }, { "epoch": 0.07181370405992248, "grad_norm": 1.8078051805496216, "learning_rate": 9.760616800348194e-07, "loss": 0.1947, "step": 3080 }, { "epoch": 0.07204686543674041, "grad_norm": 1.6381957530975342, "learning_rate": 9.759839582167505e-07, "loss": 0.1861, "step": 3090 }, { "epoch": 0.07228002681355833, "grad_norm": 2.621211051940918, "learning_rate": 9.759062363986817e-07, "loss": 0.1837, "step": 3100 }, { "epoch": 0.07251318819037626, "grad_norm": 3.3523781299591064, "learning_rate": 9.75828514580613e-07, "loss": 0.1821, "step": 3110 }, { "epoch": 0.0727463495671942, "grad_norm": 2.2825770378112793, "learning_rate": 9.757507927625442e-07, "loss": 0.1696, "step": 3120 }, { "epoch": 0.07297951094401213, "grad_norm": 2.546217441558838, "learning_rate": 9.756730709444756e-07, "loss": 0.1711, "step": 3130 }, { "epoch": 0.07321267232083005, "grad_norm": 1.6150851249694824, "learning_rate": 9.755953491264067e-07, "loss": 0.1888, "step": 3140 }, { "epoch": 0.07344583369764798, "grad_norm": 2.1438193321228027, "learning_rate": 9.75517627308338e-07, "loss": 0.1661, "step": 3150 }, { "epoch": 0.07367899507446592, "grad_norm": 3.679672956466675, "learning_rate": 9.754399054902693e-07, "loss": 0.1754, "step": 3160 }, { "epoch": 0.07391215645128385, "grad_norm": 2.890486001968384, "learning_rate": 9.753621836722004e-07, "loss": 0.1812, "step": 3170 }, { "epoch": 0.07414531782810177, "grad_norm": 2.6440210342407227, "learning_rate": 9.752844618541316e-07, "loss": 0.1819, "step": 3180 }, { "epoch": 0.0743784792049197, "grad_norm": 2.104022741317749, "learning_rate": 9.75206740036063e-07, "loss": 0.1793, 
"step": 3190 }, { "epoch": 0.07461164058173764, "grad_norm": 1.346099615097046, "learning_rate": 9.75129018217994e-07, "loss": 0.1875, "step": 3200 }, { "epoch": 0.07484480195855557, "grad_norm": 3.8174452781677246, "learning_rate": 9.750512963999253e-07, "loss": 0.1794, "step": 3210 }, { "epoch": 0.07507796333537349, "grad_norm": 1.3394012451171875, "learning_rate": 9.749735745818566e-07, "loss": 0.1773, "step": 3220 }, { "epoch": 0.07531112471219142, "grad_norm": 2.6171391010284424, "learning_rate": 9.748958527637878e-07, "loss": 0.1877, "step": 3230 }, { "epoch": 0.07554428608900936, "grad_norm": 2.7846267223358154, "learning_rate": 9.748181309457192e-07, "loss": 0.1808, "step": 3240 }, { "epoch": 0.07577744746582729, "grad_norm": 1.6610242128372192, "learning_rate": 9.747404091276503e-07, "loss": 0.1955, "step": 3250 }, { "epoch": 0.07601060884264521, "grad_norm": 3.040719509124756, "learning_rate": 9.746626873095815e-07, "loss": 0.1736, "step": 3260 }, { "epoch": 0.07624377021946314, "grad_norm": 1.586087703704834, "learning_rate": 9.745849654915126e-07, "loss": 0.1743, "step": 3270 }, { "epoch": 0.07647693159628108, "grad_norm": 3.2856478691101074, "learning_rate": 9.74507243673444e-07, "loss": 0.1848, "step": 3280 }, { "epoch": 0.07671009297309901, "grad_norm": 3.760734796524048, "learning_rate": 9.744295218553751e-07, "loss": 0.1879, "step": 3290 }, { "epoch": 0.07694325434991693, "grad_norm": 3.2895562648773193, "learning_rate": 9.743518000373065e-07, "loss": 0.1762, "step": 3300 }, { "epoch": 0.07717641572673486, "grad_norm": 1.5448569059371948, "learning_rate": 9.742740782192377e-07, "loss": 0.1777, "step": 3310 }, { "epoch": 0.0774095771035528, "grad_norm": 2.0607237815856934, "learning_rate": 9.741963564011688e-07, "loss": 0.1773, "step": 3320 }, { "epoch": 0.07764273848037073, "grad_norm": 4.688923358917236, "learning_rate": 9.741186345831002e-07, "loss": 0.186, "step": 3330 }, { "epoch": 0.07787589985718865, "grad_norm": 1.7973723411560059, 
"learning_rate": 9.740409127650313e-07, "loss": 0.1707, "step": 3340 }, { "epoch": 0.07810906123400659, "grad_norm": 1.3803038597106934, "learning_rate": 9.739631909469625e-07, "loss": 0.1712, "step": 3350 }, { "epoch": 0.07834222261082452, "grad_norm": 1.9395781755447388, "learning_rate": 9.738854691288939e-07, "loss": 0.1839, "step": 3360 }, { "epoch": 0.07857538398764245, "grad_norm": 2.709672689437866, "learning_rate": 9.73807747310825e-07, "loss": 0.189, "step": 3370 }, { "epoch": 0.07880854536446037, "grad_norm": 1.9932961463928223, "learning_rate": 9.737300254927562e-07, "loss": 0.1693, "step": 3380 }, { "epoch": 0.0790417067412783, "grad_norm": 4.418886184692383, "learning_rate": 9.736523036746876e-07, "loss": 0.2001, "step": 3390 }, { "epoch": 0.07927486811809624, "grad_norm": 2.680229663848877, "learning_rate": 9.735745818566187e-07, "loss": 0.1812, "step": 3400 }, { "epoch": 0.07950802949491417, "grad_norm": 3.055176019668579, "learning_rate": 9.7349686003855e-07, "loss": 0.1787, "step": 3410 }, { "epoch": 0.07974119087173209, "grad_norm": 1.2843824625015259, "learning_rate": 9.734191382204812e-07, "loss": 0.1806, "step": 3420 }, { "epoch": 0.07997435224855003, "grad_norm": 1.4039149284362793, "learning_rate": 9.733414164024124e-07, "loss": 0.1827, "step": 3430 }, { "epoch": 0.08020751362536796, "grad_norm": 1.583985447883606, "learning_rate": 9.732636945843438e-07, "loss": 0.1713, "step": 3440 }, { "epoch": 0.0804406750021859, "grad_norm": 1.066357970237732, "learning_rate": 9.73185972766275e-07, "loss": 0.178, "step": 3450 }, { "epoch": 0.08067383637900381, "grad_norm": 2.7722671031951904, "learning_rate": 9.73108250948206e-07, "loss": 0.2013, "step": 3460 }, { "epoch": 0.08090699775582175, "grad_norm": 2.7668097019195557, "learning_rate": 9.730305291301374e-07, "loss": 0.1783, "step": 3470 }, { "epoch": 0.08114015913263968, "grad_norm": 3.196218490600586, "learning_rate": 9.729528073120686e-07, "loss": 0.1836, "step": 3480 }, { "epoch": 
0.08137332050945761, "grad_norm": 1.733256459236145, "learning_rate": 9.72875085494e-07, "loss": 0.1896, "step": 3490 }, { "epoch": 0.08160648188627553, "grad_norm": 1.4032577276229858, "learning_rate": 9.727973636759311e-07, "loss": 0.1797, "step": 3500 }, { "epoch": 0.08183964326309347, "grad_norm": 1.3623361587524414, "learning_rate": 9.727196418578623e-07, "loss": 0.1581, "step": 3510 }, { "epoch": 0.0820728046399114, "grad_norm": 1.3118637800216675, "learning_rate": 9.726419200397934e-07, "loss": 0.1756, "step": 3520 }, { "epoch": 0.08230596601672933, "grad_norm": 1.860214352607727, "learning_rate": 9.725641982217248e-07, "loss": 0.1847, "step": 3530 }, { "epoch": 0.08253912739354725, "grad_norm": 1.2706900835037231, "learning_rate": 9.72486476403656e-07, "loss": 0.1659, "step": 3540 }, { "epoch": 0.08277228877036519, "grad_norm": 1.6824325323104858, "learning_rate": 9.724087545855873e-07, "loss": 0.1802, "step": 3550 }, { "epoch": 0.08300545014718312, "grad_norm": 1.9020298719406128, "learning_rate": 9.723310327675185e-07, "loss": 0.1669, "step": 3560 }, { "epoch": 0.08323861152400105, "grad_norm": 1.60233473777771, "learning_rate": 9.722533109494496e-07, "loss": 0.1787, "step": 3570 }, { "epoch": 0.08347177290081897, "grad_norm": 3.5403831005096436, "learning_rate": 9.721755891313808e-07, "loss": 0.1761, "step": 3580 }, { "epoch": 0.08370493427763691, "grad_norm": 1.527026891708374, "learning_rate": 9.720978673133122e-07, "loss": 0.1744, "step": 3590 }, { "epoch": 0.08393809565445484, "grad_norm": 1.4164358377456665, "learning_rate": 9.720201454952433e-07, "loss": 0.177, "step": 3600 }, { "epoch": 0.08417125703127278, "grad_norm": 1.7316707372665405, "learning_rate": 9.719424236771747e-07, "loss": 0.1662, "step": 3610 }, { "epoch": 0.0844044184080907, "grad_norm": 2.791077136993408, "learning_rate": 9.718647018591058e-07, "loss": 0.1732, "step": 3620 }, { "epoch": 0.08463757978490863, "grad_norm": 1.9515408277511597, "learning_rate": 9.71786980041037e-07, 
"loss": 0.1704, "step": 3630 }, { "epoch": 0.08487074116172656, "grad_norm": 1.9307165145874023, "learning_rate": 9.717092582229684e-07, "loss": 0.1611, "step": 3640 }, { "epoch": 0.0851039025385445, "grad_norm": 2.893683671951294, "learning_rate": 9.716315364048995e-07, "loss": 0.1666, "step": 3650 }, { "epoch": 0.08533706391536242, "grad_norm": 1.6813435554504395, "learning_rate": 9.715538145868307e-07, "loss": 0.1855, "step": 3660 }, { "epoch": 0.08557022529218035, "grad_norm": 1.5669840574264526, "learning_rate": 9.71476092768762e-07, "loss": 0.1805, "step": 3670 }, { "epoch": 0.08580338666899828, "grad_norm": 1.6619884967803955, "learning_rate": 9.713983709506932e-07, "loss": 0.1769, "step": 3680 }, { "epoch": 0.08603654804581622, "grad_norm": 1.3844255208969116, "learning_rate": 9.713206491326244e-07, "loss": 0.1709, "step": 3690 }, { "epoch": 0.08626970942263414, "grad_norm": 3.8314666748046875, "learning_rate": 9.712429273145557e-07, "loss": 0.1876, "step": 3700 }, { "epoch": 0.08650287079945207, "grad_norm": 1.779335856437683, "learning_rate": 9.711652054964869e-07, "loss": 0.1868, "step": 3710 }, { "epoch": 0.08673603217627, "grad_norm": 1.3215327262878418, "learning_rate": 9.710874836784183e-07, "loss": 0.1615, "step": 3720 }, { "epoch": 0.08696919355308794, "grad_norm": 1.4319617748260498, "learning_rate": 9.710097618603494e-07, "loss": 0.1712, "step": 3730 }, { "epoch": 0.08720235492990586, "grad_norm": 2.2544641494750977, "learning_rate": 9.709320400422806e-07, "loss": 0.1739, "step": 3740 }, { "epoch": 0.08743551630672379, "grad_norm": 1.7529710531234741, "learning_rate": 9.708543182242117e-07, "loss": 0.181, "step": 3750 }, { "epoch": 0.08766867768354172, "grad_norm": 2.3528292179107666, "learning_rate": 9.70776596406143e-07, "loss": 0.1774, "step": 3760 }, { "epoch": 0.08790183906035966, "grad_norm": 1.4840776920318604, "learning_rate": 9.706988745880742e-07, "loss": 0.1853, "step": 3770 }, { "epoch": 0.08813500043717758, "grad_norm": 
3.456528663635254, "learning_rate": 9.706211527700056e-07, "loss": 0.1673, "step": 3780 }, { "epoch": 0.08836816181399551, "grad_norm": 3.3500890731811523, "learning_rate": 9.705434309519368e-07, "loss": 0.1714, "step": 3790 }, { "epoch": 0.08860132319081344, "grad_norm": 1.599959373474121, "learning_rate": 9.704657091338681e-07, "loss": 0.1629, "step": 3800 }, { "epoch": 0.08883448456763138, "grad_norm": 1.1413326263427734, "learning_rate": 9.703879873157993e-07, "loss": 0.174, "step": 3810 }, { "epoch": 0.0890676459444493, "grad_norm": 1.1849210262298584, "learning_rate": 9.703102654977304e-07, "loss": 0.1627, "step": 3820 }, { "epoch": 0.08930080732126723, "grad_norm": 1.7251511812210083, "learning_rate": 9.702325436796616e-07, "loss": 0.1763, "step": 3830 }, { "epoch": 0.08953396869808516, "grad_norm": 1.9848928451538086, "learning_rate": 9.70154821861593e-07, "loss": 0.1764, "step": 3840 }, { "epoch": 0.0897671300749031, "grad_norm": 1.2202283143997192, "learning_rate": 9.700771000435241e-07, "loss": 0.1743, "step": 3850 }, { "epoch": 0.09000029145172102, "grad_norm": 1.3143441677093506, "learning_rate": 9.699993782254555e-07, "loss": 0.1679, "step": 3860 }, { "epoch": 0.09023345282853895, "grad_norm": 2.3615972995758057, "learning_rate": 9.699216564073867e-07, "loss": 0.1834, "step": 3870 }, { "epoch": 0.09046661420535689, "grad_norm": 4.094841480255127, "learning_rate": 9.698439345893178e-07, "loss": 0.1798, "step": 3880 }, { "epoch": 0.09069977558217482, "grad_norm": 1.223569631576538, "learning_rate": 9.697662127712492e-07, "loss": 0.1771, "step": 3890 }, { "epoch": 0.09093293695899274, "grad_norm": 2.1717171669006348, "learning_rate": 9.696884909531803e-07, "loss": 0.1762, "step": 3900 }, { "epoch": 0.09116609833581067, "grad_norm": 4.590503692626953, "learning_rate": 9.696107691351115e-07, "loss": 0.1603, "step": 3910 }, { "epoch": 0.0913992597126286, "grad_norm": 4.1490864753723145, "learning_rate": 9.695330473170429e-07, "loss": 0.1765, "step": 3920 }, 
{ "epoch": 0.09163242108944654, "grad_norm": 2.4293599128723145, "learning_rate": 9.69455325498974e-07, "loss": 0.1604, "step": 3930 }, { "epoch": 0.09186558246626446, "grad_norm": 1.885303258895874, "learning_rate": 9.693776036809052e-07, "loss": 0.1702, "step": 3940 }, { "epoch": 0.09209874384308239, "grad_norm": 2.6326653957366943, "learning_rate": 9.692998818628365e-07, "loss": 0.1861, "step": 3950 }, { "epoch": 0.09233190521990033, "grad_norm": 2.4422876834869385, "learning_rate": 9.692221600447677e-07, "loss": 0.1712, "step": 3960 }, { "epoch": 0.09256506659671826, "grad_norm": 1.2549638748168945, "learning_rate": 9.69144438226699e-07, "loss": 0.169, "step": 3970 }, { "epoch": 0.09279822797353618, "grad_norm": 1.5154821872711182, "learning_rate": 9.690667164086302e-07, "loss": 0.1668, "step": 3980 }, { "epoch": 0.09303138935035411, "grad_norm": 1.1912412643432617, "learning_rate": 9.689889945905614e-07, "loss": 0.1646, "step": 3990 }, { "epoch": 0.09326455072717205, "grad_norm": 1.8285268545150757, "learning_rate": 9.689112727724925e-07, "loss": 0.1705, "step": 4000 }, { "epoch": 0.09349771210398998, "grad_norm": 3.5178158283233643, "learning_rate": 9.68833550954424e-07, "loss": 0.1685, "step": 4010 }, { "epoch": 0.0937308734808079, "grad_norm": 1.3545713424682617, "learning_rate": 9.68755829136355e-07, "loss": 0.1754, "step": 4020 }, { "epoch": 0.09396403485762583, "grad_norm": 2.4144887924194336, "learning_rate": 9.686781073182864e-07, "loss": 0.1691, "step": 4030 }, { "epoch": 0.09419719623444377, "grad_norm": 1.8905471563339233, "learning_rate": 9.686081576820243e-07, "loss": 0.1657, "step": 4040 }, { "epoch": 0.0944303576112617, "grad_norm": 1.5670430660247803, "learning_rate": 9.685304358639557e-07, "loss": 0.1758, "step": 4050 }, { "epoch": 0.09466351898807962, "grad_norm": 1.5430500507354736, "learning_rate": 9.684527140458869e-07, "loss": 0.1678, "step": 4060 }, { "epoch": 0.09489668036489755, "grad_norm": 5.117939472198486, "learning_rate": 
9.683749922278182e-07, "loss": 0.1704, "step": 4070 }, { "epoch": 0.09512984174171549, "grad_norm": 2.775599479675293, "learning_rate": 9.682972704097494e-07, "loss": 0.1794, "step": 4080 }, { "epoch": 0.09536300311853342, "grad_norm": 3.6164157390594482, "learning_rate": 9.682195485916805e-07, "loss": 0.1859, "step": 4090 }, { "epoch": 0.09559616449535134, "grad_norm": 1.4218089580535889, "learning_rate": 9.68141826773612e-07, "loss": 0.1846, "step": 4100 }, { "epoch": 0.09582932587216927, "grad_norm": 1.44814932346344, "learning_rate": 9.68064104955543e-07, "loss": 0.1776, "step": 4110 }, { "epoch": 0.09606248724898721, "grad_norm": 2.3546786308288574, "learning_rate": 9.679863831374742e-07, "loss": 0.1839, "step": 4120 }, { "epoch": 0.09629564862580514, "grad_norm": 2.788923501968384, "learning_rate": 9.679086613194056e-07, "loss": 0.1615, "step": 4130 }, { "epoch": 0.09652881000262306, "grad_norm": 3.191131830215454, "learning_rate": 9.678309395013367e-07, "loss": 0.1734, "step": 4140 }, { "epoch": 0.096761971379441, "grad_norm": 1.7101701498031616, "learning_rate": 9.67753217683268e-07, "loss": 0.1747, "step": 4150 }, { "epoch": 0.09699513275625893, "grad_norm": 1.5476278066635132, "learning_rate": 9.676754958651993e-07, "loss": 0.1728, "step": 4160 }, { "epoch": 0.09722829413307686, "grad_norm": 5.702023029327393, "learning_rate": 9.675977740471304e-07, "loss": 0.1766, "step": 4170 }, { "epoch": 0.09746145550989478, "grad_norm": 1.3397932052612305, "learning_rate": 9.675200522290618e-07, "loss": 0.1651, "step": 4180 }, { "epoch": 0.09769461688671272, "grad_norm": 1.77191960811615, "learning_rate": 9.67442330410993e-07, "loss": 0.163, "step": 4190 }, { "epoch": 0.09792777826353065, "grad_norm": 1.3603955507278442, "learning_rate": 9.673646085929241e-07, "loss": 0.1843, "step": 4200 }, { "epoch": 0.09816093964034858, "grad_norm": 1.9393361806869507, "learning_rate": 9.672868867748553e-07, "loss": 0.1674, "step": 4210 }, { "epoch": 0.0983941010171665, 
"grad_norm": 2.7166199684143066, "learning_rate": 9.672091649567866e-07, "loss": 0.1783, "step": 4220 }, { "epoch": 0.09862726239398444, "grad_norm": 1.9352298974990845, "learning_rate": 9.671314431387178e-07, "loss": 0.1632, "step": 4230 }, { "epoch": 0.09886042377080237, "grad_norm": 2.7174055576324463, "learning_rate": 9.670537213206492e-07, "loss": 0.1726, "step": 4240 }, { "epoch": 0.0990935851476203, "grad_norm": 1.8788416385650635, "learning_rate": 9.669759995025803e-07, "loss": 0.1887, "step": 4250 }, { "epoch": 0.09932674652443822, "grad_norm": 2.7310657501220703, "learning_rate": 9.668982776845117e-07, "loss": 0.1781, "step": 4260 }, { "epoch": 0.09955990790125616, "grad_norm": 1.3871183395385742, "learning_rate": 9.668205558664426e-07, "loss": 0.1613, "step": 4270 }, { "epoch": 0.09979306927807409, "grad_norm": 1.2881211042404175, "learning_rate": 9.66742834048374e-07, "loss": 0.1579, "step": 4280 }, { "epoch": 0.10002623065489202, "grad_norm": 1.7121739387512207, "learning_rate": 9.666651122303052e-07, "loss": 0.1758, "step": 4290 }, { "epoch": 0.10025939203170994, "grad_norm": 3.671835422515869, "learning_rate": 9.665873904122365e-07, "loss": 0.1778, "step": 4300 }, { "epoch": 0.10049255340852788, "grad_norm": 3.9405248165130615, "learning_rate": 9.665096685941677e-07, "loss": 0.1692, "step": 4310 }, { "epoch": 0.10072571478534581, "grad_norm": 2.2108895778656006, "learning_rate": 9.66431946776099e-07, "loss": 0.1518, "step": 4320 }, { "epoch": 0.10095887616216374, "grad_norm": 1.4910554885864258, "learning_rate": 9.663542249580302e-07, "loss": 0.1701, "step": 4330 }, { "epoch": 0.10119203753898166, "grad_norm": 1.7529791593551636, "learning_rate": 9.662765031399614e-07, "loss": 0.1636, "step": 4340 }, { "epoch": 0.1014251989157996, "grad_norm": 2.6613411903381348, "learning_rate": 9.661987813218925e-07, "loss": 0.1465, "step": 4350 }, { "epoch": 0.10165836029261753, "grad_norm": 1.2254959344863892, "learning_rate": 9.661210595038239e-07, "loss": 
0.1651, "step": 4360 }, { "epoch": 0.10189152166943546, "grad_norm": 1.770861268043518, "learning_rate": 9.66043337685755e-07, "loss": 0.1738, "step": 4370 }, { "epoch": 0.10212468304625338, "grad_norm": 7.5406494140625, "learning_rate": 9.659656158676864e-07, "loss": 0.1638, "step": 4380 }, { "epoch": 0.10235784442307132, "grad_norm": 2.4083847999572754, "learning_rate": 9.658878940496176e-07, "loss": 0.1669, "step": 4390 }, { "epoch": 0.10259100579988925, "grad_norm": 4.018624782562256, "learning_rate": 9.658101722315487e-07, "loss": 0.1755, "step": 4400 }, { "epoch": 0.10282416717670718, "grad_norm": 2.6552672386169434, "learning_rate": 9.6573245041348e-07, "loss": 0.1665, "step": 4410 }, { "epoch": 0.1030573285535251, "grad_norm": 2.265709638595581, "learning_rate": 9.656547285954112e-07, "loss": 0.1542, "step": 4420 }, { "epoch": 0.10329048993034304, "grad_norm": 3.2372043132781982, "learning_rate": 9.655770067773424e-07, "loss": 0.194, "step": 4430 }, { "epoch": 0.10352365130716097, "grad_norm": 1.279836893081665, "learning_rate": 9.654992849592738e-07, "loss": 0.1705, "step": 4440 }, { "epoch": 0.1037568126839789, "grad_norm": 1.6250395774841309, "learning_rate": 9.65421563141205e-07, "loss": 0.1775, "step": 4450 }, { "epoch": 0.10398997406079682, "grad_norm": 1.2409791946411133, "learning_rate": 9.65343841323136e-07, "loss": 0.171, "step": 4460 }, { "epoch": 0.10422313543761476, "grad_norm": 2.112778902053833, "learning_rate": 9.652661195050674e-07, "loss": 0.161, "step": 4470 }, { "epoch": 0.10445629681443269, "grad_norm": 2.7808306217193604, "learning_rate": 9.651883976869986e-07, "loss": 0.1667, "step": 4480 }, { "epoch": 0.10468945819125063, "grad_norm": 2.351360559463501, "learning_rate": 9.6511067586893e-07, "loss": 0.1678, "step": 4490 }, { "epoch": 0.10492261956806855, "grad_norm": 3.005507230758667, "learning_rate": 9.650329540508611e-07, "loss": 0.1687, "step": 4500 }, { "epoch": 0.10515578094488648, "grad_norm": 1.7053532600402832, 
"learning_rate": 9.649552322327923e-07, "loss": 0.1869, "step": 4510 }, { "epoch": 0.10538894232170441, "grad_norm": 1.323677897453308, "learning_rate": 9.648775104147234e-07, "loss": 0.1583, "step": 4520 }, { "epoch": 0.10562210369852235, "grad_norm": 2.2460973262786865, "learning_rate": 9.647997885966548e-07, "loss": 0.1743, "step": 4530 }, { "epoch": 0.10585526507534027, "grad_norm": 1.222557544708252, "learning_rate": 9.64722066778586e-07, "loss": 0.1813, "step": 4540 }, { "epoch": 0.1060884264521582, "grad_norm": 2.468097448348999, "learning_rate": 9.646443449605173e-07, "loss": 0.1744, "step": 4550 }, { "epoch": 0.10632158782897613, "grad_norm": 1.388538122177124, "learning_rate": 9.645666231424485e-07, "loss": 0.164, "step": 4560 }, { "epoch": 0.10655474920579407, "grad_norm": 1.8138551712036133, "learning_rate": 9.644889013243799e-07, "loss": 0.1541, "step": 4570 }, { "epoch": 0.10678791058261199, "grad_norm": 2.0380921363830566, "learning_rate": 9.64411179506311e-07, "loss": 0.1723, "step": 4580 }, { "epoch": 0.10702107195942992, "grad_norm": 1.3098061084747314, "learning_rate": 9.643334576882422e-07, "loss": 0.1817, "step": 4590 }, { "epoch": 0.10725423333624785, "grad_norm": 2.1363892555236816, "learning_rate": 9.642557358701733e-07, "loss": 0.1772, "step": 4600 }, { "epoch": 0.10748739471306579, "grad_norm": 1.244398832321167, "learning_rate": 9.641780140521047e-07, "loss": 0.1728, "step": 4610 }, { "epoch": 0.1077205560898837, "grad_norm": 2.127281427383423, "learning_rate": 9.641002922340358e-07, "loss": 0.1718, "step": 4620 }, { "epoch": 0.10795371746670164, "grad_norm": 1.4553457498550415, "learning_rate": 9.640225704159672e-07, "loss": 0.1743, "step": 4630 }, { "epoch": 0.10818687884351957, "grad_norm": 2.59184193611145, "learning_rate": 9.639448485978984e-07, "loss": 0.1694, "step": 4640 }, { "epoch": 0.10842004022033751, "grad_norm": 1.2453906536102295, "learning_rate": 9.638671267798295e-07, "loss": 0.1792, "step": 4650 }, { "epoch": 
0.10865320159715543, "grad_norm": 2.797988176345825, "learning_rate": 9.63789404961761e-07, "loss": 0.1796, "step": 4660 }, { "epoch": 0.10888636297397336, "grad_norm": 3.4174530506134033, "learning_rate": 9.63711683143692e-07, "loss": 0.1887, "step": 4670 }, { "epoch": 0.1091195243507913, "grad_norm": 2.279827356338501, "learning_rate": 9.636339613256232e-07, "loss": 0.1728, "step": 4680 }, { "epoch": 0.10935268572760923, "grad_norm": 1.862806797027588, "learning_rate": 9.635562395075546e-07, "loss": 0.1722, "step": 4690 }, { "epoch": 0.10958584710442715, "grad_norm": 3.24060320854187, "learning_rate": 9.634785176894857e-07, "loss": 0.1782, "step": 4700 }, { "epoch": 0.10981900848124508, "grad_norm": 2.3316357135772705, "learning_rate": 9.634007958714169e-07, "loss": 0.1631, "step": 4710 }, { "epoch": 0.11005216985806301, "grad_norm": 3.498556137084961, "learning_rate": 9.633230740533483e-07, "loss": 0.1661, "step": 4720 }, { "epoch": 0.11028533123488095, "grad_norm": 3.582646131515503, "learning_rate": 9.632453522352794e-07, "loss": 0.1683, "step": 4730 }, { "epoch": 0.11051849261169887, "grad_norm": 1.428401231765747, "learning_rate": 9.631676304172108e-07, "loss": 0.1677, "step": 4740 }, { "epoch": 0.1107516539885168, "grad_norm": 4.981324672698975, "learning_rate": 9.63089908599142e-07, "loss": 0.1775, "step": 4750 }, { "epoch": 0.11098481536533474, "grad_norm": 4.399308204650879, "learning_rate": 9.63012186781073e-07, "loss": 0.1722, "step": 4760 }, { "epoch": 0.11121797674215267, "grad_norm": 4.160839080810547, "learning_rate": 9.629344649630043e-07, "loss": 0.168, "step": 4770 }, { "epoch": 0.11145113811897059, "grad_norm": 1.831563949584961, "learning_rate": 9.628567431449356e-07, "loss": 0.17, "step": 4780 }, { "epoch": 0.11168429949578852, "grad_norm": 3.053858995437622, "learning_rate": 9.627790213268668e-07, "loss": 0.1854, "step": 4790 }, { "epoch": 0.11191746087260646, "grad_norm": 4.980790615081787, "learning_rate": 9.627012995087981e-07, "loss": 
0.1675, "step": 4800 }, { "epoch": 0.11215062224942439, "grad_norm": 2.3949854373931885, "learning_rate": 9.626235776907293e-07, "loss": 0.1642, "step": 4810 }, { "epoch": 0.11238378362624231, "grad_norm": 1.4463613033294678, "learning_rate": 9.625458558726607e-07, "loss": 0.1704, "step": 4820 }, { "epoch": 0.11261694500306024, "grad_norm": 2.133066415786743, "learning_rate": 9.624681340545916e-07, "loss": 0.158, "step": 4830 }, { "epoch": 0.11285010637987818, "grad_norm": 3.0606913566589355, "learning_rate": 9.62390412236523e-07, "loss": 0.1766, "step": 4840 }, { "epoch": 0.11308326775669611, "grad_norm": 1.423598051071167, "learning_rate": 9.623126904184541e-07, "loss": 0.1573, "step": 4850 }, { "epoch": 0.11331642913351403, "grad_norm": 2.958470582962036, "learning_rate": 9.622349686003855e-07, "loss": 0.1719, "step": 4860 }, { "epoch": 0.11354959051033196, "grad_norm": 1.852198600769043, "learning_rate": 9.621572467823167e-07, "loss": 0.1888, "step": 4870 }, { "epoch": 0.1137827518871499, "grad_norm": 6.293911457061768, "learning_rate": 9.62079524964248e-07, "loss": 0.1824, "step": 4880 }, { "epoch": 0.11401591326396783, "grad_norm": 1.3878470659255981, "learning_rate": 9.620018031461792e-07, "loss": 0.1646, "step": 4890 }, { "epoch": 0.11424907464078575, "grad_norm": 1.5323675870895386, "learning_rate": 9.619240813281103e-07, "loss": 0.1706, "step": 4900 }, { "epoch": 0.11448223601760368, "grad_norm": 1.794468641281128, "learning_rate": 9.618463595100415e-07, "loss": 0.1726, "step": 4910 }, { "epoch": 0.11471539739442162, "grad_norm": 2.549290657043457, "learning_rate": 9.617686376919729e-07, "loss": 0.1568, "step": 4920 }, { "epoch": 0.11494855877123955, "grad_norm": 2.5417652130126953, "learning_rate": 9.61690915873904e-07, "loss": 0.1685, "step": 4930 }, { "epoch": 0.11518172014805747, "grad_norm": 1.287216305732727, "learning_rate": 9.616131940558354e-07, "loss": 0.1575, "step": 4940 }, { "epoch": 0.1154148815248754, "grad_norm": 7.309139728546143, 
"learning_rate": 9.615354722377665e-07, "loss": 0.1659, "step": 4950 }, { "epoch": 0.11564804290169334, "grad_norm": 3.6352343559265137, "learning_rate": 9.614577504196977e-07, "loss": 0.1629, "step": 4960 }, { "epoch": 0.11588120427851127, "grad_norm": 1.268729329109192, "learning_rate": 9.61380028601629e-07, "loss": 0.1717, "step": 4970 }, { "epoch": 0.11611436565532919, "grad_norm": 3.939270496368408, "learning_rate": 9.613023067835602e-07, "loss": 0.1707, "step": 4980 }, { "epoch": 0.11634752703214712, "grad_norm": 2.0558009147644043, "learning_rate": 9.612245849654914e-07, "loss": 0.1556, "step": 4990 }, { "epoch": 0.11658068840896506, "grad_norm": 1.3351329565048218, "learning_rate": 9.611468631474228e-07, "loss": 0.1734, "step": 5000 }, { "epoch": 0.11681384978578299, "grad_norm": 1.28730046749115, "learning_rate": 9.61069141329354e-07, "loss": 0.1774, "step": 5010 }, { "epoch": 0.11704701116260091, "grad_norm": 3.9666237831115723, "learning_rate": 9.60991419511285e-07, "loss": 0.1784, "step": 5020 }, { "epoch": 0.11728017253941884, "grad_norm": 1.203981637954712, "learning_rate": 9.609136976932164e-07, "loss": 0.1656, "step": 5030 }, { "epoch": 0.11751333391623678, "grad_norm": 1.5213061571121216, "learning_rate": 9.608359758751476e-07, "loss": 0.1733, "step": 5040 }, { "epoch": 0.11774649529305471, "grad_norm": 2.3665525913238525, "learning_rate": 9.60758254057079e-07, "loss": 0.1654, "step": 5050 }, { "epoch": 0.11797965666987263, "grad_norm": 2.3537373542785645, "learning_rate": 9.606805322390101e-07, "loss": 0.1557, "step": 5060 }, { "epoch": 0.11821281804669057, "grad_norm": 1.8853248357772827, "learning_rate": 9.606028104209415e-07, "loss": 0.1854, "step": 5070 }, { "epoch": 0.1184459794235085, "grad_norm": 1.260785460472107, "learning_rate": 9.605250886028724e-07, "loss": 0.1475, "step": 5080 }, { "epoch": 0.11867914080032643, "grad_norm": 1.9755197763442993, "learning_rate": 9.604473667848038e-07, "loss": 0.1765, "step": 5090 }, { "epoch": 
0.11891230217714435, "grad_norm": 1.353851079940796, "learning_rate": 9.60369644966735e-07, "loss": 0.1663, "step": 5100 }, { "epoch": 0.11914546355396229, "grad_norm": 2.351999521255493, "learning_rate": 9.602919231486663e-07, "loss": 0.1608, "step": 5110 }, { "epoch": 0.11937862493078022, "grad_norm": 1.5289959907531738, "learning_rate": 9.602142013305975e-07, "loss": 0.1795, "step": 5120 }, { "epoch": 0.11961178630759815, "grad_norm": 2.4303910732269287, "learning_rate": 9.601364795125288e-07, "loss": 0.1652, "step": 5130 }, { "epoch": 0.11984494768441607, "grad_norm": 1.8655107021331787, "learning_rate": 9.6005875769446e-07, "loss": 0.167, "step": 5140 }, { "epoch": 0.120078109061234, "grad_norm": 1.3763298988342285, "learning_rate": 9.599810358763912e-07, "loss": 0.1608, "step": 5150 }, { "epoch": 0.12031127043805194, "grad_norm": 1.583499550819397, "learning_rate": 9.599033140583223e-07, "loss": 0.165, "step": 5160 }, { "epoch": 0.12054443181486987, "grad_norm": 2.2417995929718018, "learning_rate": 9.598255922402537e-07, "loss": 0.1631, "step": 5170 }, { "epoch": 0.1207775931916878, "grad_norm": 1.408921241760254, "learning_rate": 9.597478704221848e-07, "loss": 0.1637, "step": 5180 }, { "epoch": 0.12101075456850573, "grad_norm": 3.6971445083618164, "learning_rate": 9.596701486041162e-07, "loss": 0.1633, "step": 5190 }, { "epoch": 0.12124391594532366, "grad_norm": 1.5963451862335205, "learning_rate": 9.595924267860474e-07, "loss": 0.1733, "step": 5200 }, { "epoch": 0.1214770773221416, "grad_norm": 3.496896505355835, "learning_rate": 9.595147049679785e-07, "loss": 0.1662, "step": 5210 }, { "epoch": 0.12171023869895951, "grad_norm": 1.197727084159851, "learning_rate": 9.594369831499099e-07, "loss": 0.1718, "step": 5220 }, { "epoch": 0.12194340007577745, "grad_norm": 1.458288550376892, "learning_rate": 9.59359261331841e-07, "loss": 0.1583, "step": 5230 }, { "epoch": 0.12217656145259538, "grad_norm": 2.0773606300354004, "learning_rate": 9.592815395137722e-07, 
"loss": 0.162, "step": 5240 }, { "epoch": 0.12240972282941331, "grad_norm": 2.1920599937438965, "learning_rate": 9.592038176957036e-07, "loss": 0.1596, "step": 5250 }, { "epoch": 0.12264288420623123, "grad_norm": 3.78487491607666, "learning_rate": 9.591260958776347e-07, "loss": 0.1552, "step": 5260 }, { "epoch": 0.12287604558304917, "grad_norm": 2.0284407138824463, "learning_rate": 9.590483740595659e-07, "loss": 0.1693, "step": 5270 }, { "epoch": 0.1231092069598671, "grad_norm": 3.3035125732421875, "learning_rate": 9.589706522414972e-07, "loss": 0.1622, "step": 5280 }, { "epoch": 0.12334236833668503, "grad_norm": 2.0623300075531006, "learning_rate": 9.588929304234284e-07, "loss": 0.1665, "step": 5290 }, { "epoch": 0.12357552971350295, "grad_norm": 3.202251434326172, "learning_rate": 9.588152086053598e-07, "loss": 0.1779, "step": 5300 }, { "epoch": 0.12380869109032089, "grad_norm": 1.6747450828552246, "learning_rate": 9.58737486787291e-07, "loss": 0.1637, "step": 5310 }, { "epoch": 0.12404185246713882, "grad_norm": 2.2369680404663086, "learning_rate": 9.58659764969222e-07, "loss": 0.1726, "step": 5320 }, { "epoch": 0.12427501384395676, "grad_norm": 2.10920786857605, "learning_rate": 9.585820431511532e-07, "loss": 0.1582, "step": 5330 }, { "epoch": 0.12450817522077468, "grad_norm": 1.3288041353225708, "learning_rate": 9.585043213330846e-07, "loss": 0.1535, "step": 5340 }, { "epoch": 0.12474133659759261, "grad_norm": 3.0029919147491455, "learning_rate": 9.584265995150158e-07, "loss": 0.1701, "step": 5350 }, { "epoch": 0.12497449797441054, "grad_norm": 1.8169630765914917, "learning_rate": 9.583488776969471e-07, "loss": 0.1538, "step": 5360 }, { "epoch": 0.12520765935122846, "grad_norm": 3.3375067710876465, "learning_rate": 9.582711558788783e-07, "loss": 0.1768, "step": 5370 }, { "epoch": 0.1254408207280464, "grad_norm": 2.9756226539611816, "learning_rate": 9.581934340608094e-07, "loss": 0.1624, "step": 5380 }, { "epoch": 0.12567398210486433, "grad_norm": 
2.858269453048706, "learning_rate": 9.581157122427408e-07, "loss": 0.1474, "step": 5390 }, { "epoch": 0.12590714348168225, "grad_norm": 2.49346661567688, "learning_rate": 9.58037990424672e-07, "loss": 0.1718, "step": 5400 }, { "epoch": 0.1261403048585002, "grad_norm": 2.436150312423706, "learning_rate": 9.579602686066031e-07, "loss": 0.1661, "step": 5410 }, { "epoch": 0.12637346623531812, "grad_norm": 1.9165921211242676, "learning_rate": 9.578825467885345e-07, "loss": 0.1682, "step": 5420 }, { "epoch": 0.12660662761213606, "grad_norm": 1.6544647216796875, "learning_rate": 9.578048249704656e-07, "loss": 0.1507, "step": 5430 }, { "epoch": 0.12683978898895398, "grad_norm": 2.311941146850586, "learning_rate": 9.577271031523968e-07, "loss": 0.1576, "step": 5440 }, { "epoch": 0.1270729503657719, "grad_norm": 1.380018949508667, "learning_rate": 9.576493813343282e-07, "loss": 0.1669, "step": 5450 }, { "epoch": 0.12730611174258985, "grad_norm": 1.1527833938598633, "learning_rate": 9.575716595162593e-07, "loss": 0.1552, "step": 5460 }, { "epoch": 0.12753927311940777, "grad_norm": 2.286519765853882, "learning_rate": 9.574939376981907e-07, "loss": 0.1665, "step": 5470 }, { "epoch": 0.1277724344962257, "grad_norm": 2.224874258041382, "learning_rate": 9.574162158801219e-07, "loss": 0.1708, "step": 5480 }, { "epoch": 0.12800559587304364, "grad_norm": 1.418873906135559, "learning_rate": 9.57338494062053e-07, "loss": 0.1657, "step": 5490 }, { "epoch": 0.12823875724986156, "grad_norm": 2.3668863773345947, "learning_rate": 9.572607722439842e-07, "loss": 0.1573, "step": 5500 }, { "epoch": 0.1284719186266795, "grad_norm": 2.0860776901245117, "learning_rate": 9.571830504259155e-07, "loss": 0.1558, "step": 5510 }, { "epoch": 0.12870508000349742, "grad_norm": 3.2291665077209473, "learning_rate": 9.571053286078467e-07, "loss": 0.163, "step": 5520 }, { "epoch": 0.12893824138031534, "grad_norm": 3.9288742542266846, "learning_rate": 9.57027606789778e-07, "loss": 0.1747, "step": 5530 }, { 
"epoch": 0.1291714027571333, "grad_norm": 2.055510997772217, "learning_rate": 9.569498849717092e-07, "loss": 0.1687, "step": 5540 }, { "epoch": 0.1294045641339512, "grad_norm": 2.3318026065826416, "learning_rate": 9.568721631536406e-07, "loss": 0.1602, "step": 5550 }, { "epoch": 0.12963772551076913, "grad_norm": 1.9076887369155884, "learning_rate": 9.567944413355717e-07, "loss": 0.1641, "step": 5560 }, { "epoch": 0.12987088688758708, "grad_norm": 3.552339792251587, "learning_rate": 9.56716719517503e-07, "loss": 0.1707, "step": 5570 }, { "epoch": 0.130104048264405, "grad_norm": 1.6694401502609253, "learning_rate": 9.56638997699434e-07, "loss": 0.1618, "step": 5580 }, { "epoch": 0.13033720964122295, "grad_norm": 1.4433293342590332, "learning_rate": 9.565612758813654e-07, "loss": 0.163, "step": 5590 }, { "epoch": 0.13057037101804086, "grad_norm": 1.2105273008346558, "learning_rate": 9.564835540632966e-07, "loss": 0.1571, "step": 5600 }, { "epoch": 0.13080353239485878, "grad_norm": 1.6604574918746948, "learning_rate": 9.56405832245228e-07, "loss": 0.1604, "step": 5610 }, { "epoch": 0.13103669377167673, "grad_norm": 1.5498321056365967, "learning_rate": 9.56328110427159e-07, "loss": 0.1613, "step": 5620 }, { "epoch": 0.13126985514849465, "grad_norm": 1.412672996520996, "learning_rate": 9.562503886090903e-07, "loss": 0.1684, "step": 5630 }, { "epoch": 0.13150301652531257, "grad_norm": 2.4672365188598633, "learning_rate": 9.561726667910214e-07, "loss": 0.1655, "step": 5640 }, { "epoch": 0.13173617790213052, "grad_norm": 1.6452497243881226, "learning_rate": 9.560949449729528e-07, "loss": 0.153, "step": 5650 }, { "epoch": 0.13196933927894844, "grad_norm": 2.9570555686950684, "learning_rate": 9.56017223154884e-07, "loss": 0.1577, "step": 5660 }, { "epoch": 0.13220250065576636, "grad_norm": 2.635821580886841, "learning_rate": 9.559395013368153e-07, "loss": 0.1556, "step": 5670 }, { "epoch": 0.1324356620325843, "grad_norm": 4.126746654510498, "learning_rate": 
9.558617795187465e-07, "loss": 0.1633, "step": 5680 }, { "epoch": 0.13266882340940223, "grad_norm": 2.8333680629730225, "learning_rate": 9.557840577006776e-07, "loss": 0.1534, "step": 5690 }, { "epoch": 0.13290198478622017, "grad_norm": 2.2575266361236572, "learning_rate": 9.55706335882609e-07, "loss": 0.1576, "step": 5700 }, { "epoch": 0.1331351461630381, "grad_norm": 1.314151406288147, "learning_rate": 9.556286140645401e-07, "loss": 0.1685, "step": 5710 }, { "epoch": 0.133368307539856, "grad_norm": 2.446298122406006, "learning_rate": 9.555508922464713e-07, "loss": 0.1601, "step": 5720 }, { "epoch": 0.13360146891667396, "grad_norm": 2.437126874923706, "learning_rate": 9.554731704284027e-07, "loss": 0.1665, "step": 5730 }, { "epoch": 0.13383463029349188, "grad_norm": 2.0295491218566895, "learning_rate": 9.553954486103338e-07, "loss": 0.1791, "step": 5740 }, { "epoch": 0.1340677916703098, "grad_norm": 3.2143161296844482, "learning_rate": 9.55317726792265e-07, "loss": 0.1546, "step": 5750 }, { "epoch": 0.13430095304712775, "grad_norm": 1.6909109354019165, "learning_rate": 9.552400049741963e-07, "loss": 0.1463, "step": 5760 }, { "epoch": 0.13453411442394567, "grad_norm": 1.8603520393371582, "learning_rate": 9.551622831561275e-07, "loss": 0.1532, "step": 5770 }, { "epoch": 0.13476727580076361, "grad_norm": 1.4745934009552002, "learning_rate": 9.550845613380589e-07, "loss": 0.1844, "step": 5780 }, { "epoch": 0.13500043717758153, "grad_norm": 1.2969707250595093, "learning_rate": 9.5500683951999e-07, "loss": 0.1564, "step": 5790 }, { "epoch": 0.13523359855439945, "grad_norm": 2.980139970779419, "learning_rate": 9.549291177019212e-07, "loss": 0.1496, "step": 5800 }, { "epoch": 0.1354667599312174, "grad_norm": 1.8110922574996948, "learning_rate": 9.548513958838523e-07, "loss": 0.1599, "step": 5810 }, { "epoch": 0.13569992130803532, "grad_norm": 3.625563144683838, "learning_rate": 9.547736740657837e-07, "loss": 0.1701, "step": 5820 }, { "epoch": 0.13593308268485324, 
"grad_norm": 2.021158456802368, "learning_rate": 9.546959522477149e-07, "loss": 0.1623, "step": 5830 }, { "epoch": 0.1361662440616712, "grad_norm": 1.2860651016235352, "learning_rate": 9.546182304296462e-07, "loss": 0.1575, "step": 5840 }, { "epoch": 0.1363994054384891, "grad_norm": 1.5693072080612183, "learning_rate": 9.545405086115774e-07, "loss": 0.1623, "step": 5850 }, { "epoch": 0.13663256681530705, "grad_norm": 1.4244751930236816, "learning_rate": 9.544627867935088e-07, "loss": 0.1494, "step": 5860 }, { "epoch": 0.13686572819212497, "grad_norm": 1.9373494386672974, "learning_rate": 9.5438506497544e-07, "loss": 0.167, "step": 5870 }, { "epoch": 0.1370988895689429, "grad_norm": 4.519964218139648, "learning_rate": 9.54307343157371e-07, "loss": 0.1685, "step": 5880 }, { "epoch": 0.13733205094576084, "grad_norm": 3.266547203063965, "learning_rate": 9.542296213393022e-07, "loss": 0.1518, "step": 5890 }, { "epoch": 0.13756521232257876, "grad_norm": 3.349050283432007, "learning_rate": 9.541518995212336e-07, "loss": 0.1574, "step": 5900 }, { "epoch": 0.13779837369939668, "grad_norm": 1.4099982976913452, "learning_rate": 9.540741777031647e-07, "loss": 0.1606, "step": 5910 }, { "epoch": 0.13803153507621463, "grad_norm": 1.3211420774459839, "learning_rate": 9.539964558850961e-07, "loss": 0.1509, "step": 5920 }, { "epoch": 0.13826469645303255, "grad_norm": 1.729364275932312, "learning_rate": 9.539187340670273e-07, "loss": 0.1681, "step": 5930 }, { "epoch": 0.1384978578298505, "grad_norm": 2.1114988327026367, "learning_rate": 9.538410122489584e-07, "loss": 0.168, "step": 5940 }, { "epoch": 0.13873101920666842, "grad_norm": 2.0051207542419434, "learning_rate": 9.537632904308898e-07, "loss": 0.1535, "step": 5950 }, { "epoch": 0.13896418058348634, "grad_norm": 1.521937608718872, "learning_rate": 9.53685568612821e-07, "loss": 0.159, "step": 5960 }, { "epoch": 0.13919734196030428, "grad_norm": 1.3823131322860718, "learning_rate": 9.536078467947521e-07, "loss": 0.1606, "step": 
5970 }, { "epoch": 0.1394305033371222, "grad_norm": 4.324008941650391, "learning_rate": 9.535301249766834e-07, "loss": 0.1566, "step": 5980 }, { "epoch": 0.13966366471394012, "grad_norm": 2.6060853004455566, "learning_rate": 9.534524031586146e-07, "loss": 0.1497, "step": 5990 }, { "epoch": 0.13989682609075807, "grad_norm": 2.196986198425293, "learning_rate": 9.533746813405459e-07, "loss": 0.1498, "step": 6000 }, { "epoch": 0.140129987467576, "grad_norm": 2.9057695865631104, "learning_rate": 9.532969595224772e-07, "loss": 0.161, "step": 6010 }, { "epoch": 0.14036314884439394, "grad_norm": 1.3079633712768555, "learning_rate": 9.532192377044084e-07, "loss": 0.1526, "step": 6020 }, { "epoch": 0.14059631022121186, "grad_norm": 1.7087525129318237, "learning_rate": 9.531415158863396e-07, "loss": 0.1485, "step": 6030 }, { "epoch": 0.14082947159802978, "grad_norm": 2.6136317253112793, "learning_rate": 9.530637940682707e-07, "loss": 0.152, "step": 6040 }, { "epoch": 0.14106263297484772, "grad_norm": 4.366525173187256, "learning_rate": 9.52993844432009e-07, "loss": 0.1719, "step": 6050 }, { "epoch": 0.14129579435166564, "grad_norm": 4.525439262390137, "learning_rate": 9.529161226139401e-07, "loss": 0.1606, "step": 6060 }, { "epoch": 0.14152895572848356, "grad_norm": 1.578507423400879, "learning_rate": 9.528384007958714e-07, "loss": 0.1623, "step": 6070 }, { "epoch": 0.1417621171053015, "grad_norm": 1.865159273147583, "learning_rate": 9.527606789778026e-07, "loss": 0.1552, "step": 6080 }, { "epoch": 0.14199527848211943, "grad_norm": 1.2686339616775513, "learning_rate": 9.526829571597338e-07, "loss": 0.1587, "step": 6090 }, { "epoch": 0.14222843985893738, "grad_norm": 2.437269926071167, "learning_rate": 9.526052353416651e-07, "loss": 0.177, "step": 6100 }, { "epoch": 0.1424616012357553, "grad_norm": 1.5003291368484497, "learning_rate": 9.525275135235963e-07, "loss": 0.1565, "step": 6110 }, { "epoch": 0.14269476261257322, "grad_norm": 4.4913010597229, "learning_rate": 
9.524497917055275e-07, "loss": 0.1546, "step": 6120 }, { "epoch": 0.14292792398939116, "grad_norm": 1.4617289304733276, "learning_rate": 9.523720698874587e-07, "loss": 0.1537, "step": 6130 }, { "epoch": 0.14316108536620908, "grad_norm": 1.8418291807174683, "learning_rate": 9.5229434806939e-07, "loss": 0.1603, "step": 6140 }, { "epoch": 0.143394246743027, "grad_norm": 1.2504688501358032, "learning_rate": 9.522166262513213e-07, "loss": 0.1689, "step": 6150 }, { "epoch": 0.14362740811984495, "grad_norm": 2.991912364959717, "learning_rate": 9.521389044332525e-07, "loss": 0.1669, "step": 6160 }, { "epoch": 0.14386056949666287, "grad_norm": 1.5081084966659546, "learning_rate": 9.520611826151837e-07, "loss": 0.1616, "step": 6170 }, { "epoch": 0.14409373087348082, "grad_norm": 1.6158411502838135, "learning_rate": 9.519834607971148e-07, "loss": 0.1611, "step": 6180 }, { "epoch": 0.14432689225029874, "grad_norm": 2.3487257957458496, "learning_rate": 9.519057389790461e-07, "loss": 0.1517, "step": 6190 }, { "epoch": 0.14456005362711666, "grad_norm": 3.3568828105926514, "learning_rate": 9.518280171609774e-07, "loss": 0.1553, "step": 6200 }, { "epoch": 0.1447932150039346, "grad_norm": 2.9424643516540527, "learning_rate": 9.517502953429086e-07, "loss": 0.1583, "step": 6210 }, { "epoch": 0.14502637638075253, "grad_norm": 1.6532485485076904, "learning_rate": 9.516725735248399e-07, "loss": 0.1473, "step": 6220 }, { "epoch": 0.14525953775757044, "grad_norm": 1.335486888885498, "learning_rate": 9.515948517067712e-07, "loss": 0.1597, "step": 6230 }, { "epoch": 0.1454926991343884, "grad_norm": 2.802081823348999, "learning_rate": 9.515171298887023e-07, "loss": 0.1612, "step": 6240 }, { "epoch": 0.1457258605112063, "grad_norm": 2.420905828475952, "learning_rate": 9.514394080706335e-07, "loss": 0.1608, "step": 6250 }, { "epoch": 0.14595902188802426, "grad_norm": 2.0560903549194336, "learning_rate": 9.513616862525647e-07, "loss": 0.1493, "step": 6260 }, { "epoch": 0.14619218326484218, 
"grad_norm": 1.4209729433059692, "learning_rate": 9.51283964434496e-07, "loss": 0.1512, "step": 6270 }, { "epoch": 0.1464253446416601, "grad_norm": 1.7947211265563965, "learning_rate": 9.512062426164273e-07, "loss": 0.1565, "step": 6280 }, { "epoch": 0.14665850601847805, "grad_norm": 4.269296646118164, "learning_rate": 9.511285207983585e-07, "loss": 0.1588, "step": 6290 }, { "epoch": 0.14689166739529597, "grad_norm": 3.9955544471740723, "learning_rate": 9.510507989802898e-07, "loss": 0.1741, "step": 6300 }, { "epoch": 0.14712482877211389, "grad_norm": 1.2706855535507202, "learning_rate": 9.509730771622209e-07, "loss": 0.1694, "step": 6310 }, { "epoch": 0.14735799014893183, "grad_norm": 2.4024574756622314, "learning_rate": 9.508953553441522e-07, "loss": 0.1603, "step": 6320 }, { "epoch": 0.14759115152574975, "grad_norm": 1.5600382089614868, "learning_rate": 9.508176335260834e-07, "loss": 0.1622, "step": 6330 }, { "epoch": 0.1478243129025677, "grad_norm": 3.176621437072754, "learning_rate": 9.507399117080146e-07, "loss": 0.15, "step": 6340 }, { "epoch": 0.14805747427938562, "grad_norm": 2.296375036239624, "learning_rate": 9.506621898899459e-07, "loss": 0.1576, "step": 6350 }, { "epoch": 0.14829063565620354, "grad_norm": 1.6492382287979126, "learning_rate": 9.505844680718771e-07, "loss": 0.164, "step": 6360 }, { "epoch": 0.1485237970330215, "grad_norm": 2.5475525856018066, "learning_rate": 9.505067462538083e-07, "loss": 0.1645, "step": 6370 }, { "epoch": 0.1487569584098394, "grad_norm": 1.8081090450286865, "learning_rate": 9.504290244357396e-07, "loss": 0.1482, "step": 6380 }, { "epoch": 0.14899011978665733, "grad_norm": 1.4788637161254883, "learning_rate": 9.503513026176708e-07, "loss": 0.1633, "step": 6390 }, { "epoch": 0.14922328116347527, "grad_norm": 1.7914302349090576, "learning_rate": 9.502735807996021e-07, "loss": 0.1591, "step": 6400 }, { "epoch": 0.1494564425402932, "grad_norm": 1.5933923721313477, "learning_rate": 9.501958589815332e-07, "loss": 0.1642, 
"step": 6410 }, { "epoch": 0.14968960391711114, "grad_norm": 4.077606201171875, "learning_rate": 9.501181371634645e-07, "loss": 0.1542, "step": 6420 }, { "epoch": 0.14992276529392906, "grad_norm": 1.5627962350845337, "learning_rate": 9.500404153453957e-07, "loss": 0.1621, "step": 6430 }, { "epoch": 0.15015592667074698, "grad_norm": 3.439992904663086, "learning_rate": 9.499626935273269e-07, "loss": 0.1486, "step": 6440 }, { "epoch": 0.15038908804756493, "grad_norm": 1.6782933473587036, "learning_rate": 9.498849717092582e-07, "loss": 0.1469, "step": 6450 }, { "epoch": 0.15062224942438285, "grad_norm": 2.273865222930908, "learning_rate": 9.498072498911894e-07, "loss": 0.1575, "step": 6460 }, { "epoch": 0.15085541080120077, "grad_norm": 3.260895013809204, "learning_rate": 9.497295280731207e-07, "loss": 0.1481, "step": 6470 }, { "epoch": 0.15108857217801872, "grad_norm": 1.8021937608718872, "learning_rate": 9.49651806255052e-07, "loss": 0.1541, "step": 6480 }, { "epoch": 0.15132173355483663, "grad_norm": 1.3117870092391968, "learning_rate": 9.49574084436983e-07, "loss": 0.1527, "step": 6490 }, { "epoch": 0.15155489493165458, "grad_norm": 1.99868643283844, "learning_rate": 9.494963626189143e-07, "loss": 0.1784, "step": 6500 }, { "epoch": 0.1517880563084725, "grad_norm": 2.3579330444335938, "learning_rate": 9.494186408008455e-07, "loss": 0.1641, "step": 6510 }, { "epoch": 0.15202121768529042, "grad_norm": 2.766167163848877, "learning_rate": 9.493409189827768e-07, "loss": 0.1568, "step": 6520 }, { "epoch": 0.15225437906210837, "grad_norm": 2.198154926300049, "learning_rate": 9.492631971647081e-07, "loss": 0.1688, "step": 6530 }, { "epoch": 0.1524875404389263, "grad_norm": 1.3953890800476074, "learning_rate": 9.491854753466393e-07, "loss": 0.1745, "step": 6540 }, { "epoch": 0.1527207018157442, "grad_norm": 2.3046939373016357, "learning_rate": 9.491077535285705e-07, "loss": 0.1624, "step": 6550 }, { "epoch": 0.15295386319256216, "grad_norm": 1.3180969953536987, 
"learning_rate": 9.490300317105017e-07, "loss": 0.1597, "step": 6560 }, { "epoch": 0.15318702456938008, "grad_norm": 2.0685930252075195, "learning_rate": 9.489523098924329e-07, "loss": 0.1434, "step": 6570 }, { "epoch": 0.15342018594619802, "grad_norm": 5.382109642028809, "learning_rate": 9.488745880743642e-07, "loss": 0.1964, "step": 6580 }, { "epoch": 0.15365334732301594, "grad_norm": 1.6811139583587646, "learning_rate": 9.487968662562954e-07, "loss": 0.1593, "step": 6590 }, { "epoch": 0.15388650869983386, "grad_norm": 2.000264883041382, "learning_rate": 9.487191444382267e-07, "loss": 0.1523, "step": 6600 }, { "epoch": 0.1541196700766518, "grad_norm": 1.2622771263122559, "learning_rate": 9.486414226201578e-07, "loss": 0.1552, "step": 6610 }, { "epoch": 0.15435283145346973, "grad_norm": 1.940831184387207, "learning_rate": 9.485637008020891e-07, "loss": 0.1489, "step": 6620 }, { "epoch": 0.15458599283028765, "grad_norm": 1.4337176084518433, "learning_rate": 9.484859789840204e-07, "loss": 0.1786, "step": 6630 }, { "epoch": 0.1548191542071056, "grad_norm": 2.058612108230591, "learning_rate": 9.484082571659516e-07, "loss": 0.1543, "step": 6640 }, { "epoch": 0.15505231558392352, "grad_norm": 2.118762254714966, "learning_rate": 9.483305353478828e-07, "loss": 0.1399, "step": 6650 }, { "epoch": 0.15528547696074146, "grad_norm": 1.6830025911331177, "learning_rate": 9.48252813529814e-07, "loss": 0.1652, "step": 6660 }, { "epoch": 0.15551863833755938, "grad_norm": 1.4320217370986938, "learning_rate": 9.481750917117452e-07, "loss": 0.1562, "step": 6670 }, { "epoch": 0.1557517997143773, "grad_norm": 1.753218412399292, "learning_rate": 9.480973698936765e-07, "loss": 0.1655, "step": 6680 }, { "epoch": 0.15598496109119525, "grad_norm": 2.123769521713257, "learning_rate": 9.480196480756077e-07, "loss": 0.1448, "step": 6690 }, { "epoch": 0.15621812246801317, "grad_norm": 1.326846718788147, "learning_rate": 9.47941926257539e-07, "loss": 0.1554, "step": 6700 }, { "epoch": 
0.1564512838448311, "grad_norm": 2.098761796951294, "learning_rate": 9.478642044394703e-07, "loss": 0.1417, "step": 6710 }, { "epoch": 0.15668444522164904, "grad_norm": 2.3294074535369873, "learning_rate": 9.477864826214015e-07, "loss": 0.1621, "step": 6720 }, { "epoch": 0.15691760659846696, "grad_norm": 3.6029257774353027, "learning_rate": 9.477087608033326e-07, "loss": 0.1547, "step": 6730 }, { "epoch": 0.1571507679752849, "grad_norm": 2.4315130710601807, "learning_rate": 9.476310389852638e-07, "loss": 0.1554, "step": 6740 }, { "epoch": 0.15738392935210282, "grad_norm": 1.6683169603347778, "learning_rate": 9.475533171671951e-07, "loss": 0.1528, "step": 6750 }, { "epoch": 0.15761709072892074, "grad_norm": 1.5812606811523438, "learning_rate": 9.474755953491264e-07, "loss": 0.1678, "step": 6760 }, { "epoch": 0.1578502521057387, "grad_norm": 5.180941104888916, "learning_rate": 9.473978735310576e-07, "loss": 0.1582, "step": 6770 }, { "epoch": 0.1580834134825566, "grad_norm": 2.0892794132232666, "learning_rate": 9.473201517129889e-07, "loss": 0.1718, "step": 6780 }, { "epoch": 0.15831657485937453, "grad_norm": 1.16647207736969, "learning_rate": 9.472424298949201e-07, "loss": 0.1593, "step": 6790 }, { "epoch": 0.15854973623619248, "grad_norm": 1.387217402458191, "learning_rate": 9.471647080768513e-07, "loss": 0.156, "step": 6800 }, { "epoch": 0.1587828976130104, "grad_norm": 1.8626468181610107, "learning_rate": 9.470869862587825e-07, "loss": 0.1399, "step": 6810 }, { "epoch": 0.15901605898982835, "grad_norm": 1.190064549446106, "learning_rate": 9.470092644407137e-07, "loss": 0.1472, "step": 6820 }, { "epoch": 0.15924922036664627, "grad_norm": 1.7027347087860107, "learning_rate": 9.46931542622645e-07, "loss": 0.1556, "step": 6830 }, { "epoch": 0.15948238174346419, "grad_norm": 2.5080525875091553, "learning_rate": 9.468538208045762e-07, "loss": 0.1546, "step": 6840 }, { "epoch": 0.15971554312028213, "grad_norm": 1.9693690538406372, "learning_rate": 9.467760989865075e-07, 
"loss": 0.1548, "step": 6850 }, { "epoch": 0.15994870449710005, "grad_norm": 2.7862889766693115, "learning_rate": 9.466983771684387e-07, "loss": 0.156, "step": 6860 }, { "epoch": 0.16018186587391797, "grad_norm": 1.8219844102859497, "learning_rate": 9.466206553503699e-07, "loss": 0.1505, "step": 6870 }, { "epoch": 0.16041502725073592, "grad_norm": 1.5604219436645508, "learning_rate": 9.465429335323012e-07, "loss": 0.17, "step": 6880 }, { "epoch": 0.16064818862755384, "grad_norm": 1.9138329029083252, "learning_rate": 9.464652117142323e-07, "loss": 0.1437, "step": 6890 }, { "epoch": 0.1608813500043718, "grad_norm": 2.4931576251983643, "learning_rate": 9.463874898961636e-07, "loss": 0.1507, "step": 6900 }, { "epoch": 0.1611145113811897, "grad_norm": 2.183199882507324, "learning_rate": 9.463097680780949e-07, "loss": 0.1581, "step": 6910 }, { "epoch": 0.16134767275800763, "grad_norm": 1.8057596683502197, "learning_rate": 9.46232046260026e-07, "loss": 0.1637, "step": 6920 }, { "epoch": 0.16158083413482557, "grad_norm": 1.7248204946517944, "learning_rate": 9.461543244419573e-07, "loss": 0.1709, "step": 6930 }, { "epoch": 0.1618139955116435, "grad_norm": 1.399310827255249, "learning_rate": 9.460766026238885e-07, "loss": 0.1574, "step": 6940 }, { "epoch": 0.1620471568884614, "grad_norm": 1.2829049825668335, "learning_rate": 9.459988808058198e-07, "loss": 0.1596, "step": 6950 }, { "epoch": 0.16228031826527936, "grad_norm": 2.185945749282837, "learning_rate": 9.459211589877511e-07, "loss": 0.1503, "step": 6960 }, { "epoch": 0.16251347964209728, "grad_norm": 2.0523524284362793, "learning_rate": 9.458434371696822e-07, "loss": 0.1624, "step": 6970 }, { "epoch": 0.16274664101891523, "grad_norm": 2.5071051120758057, "learning_rate": 9.457657153516134e-07, "loss": 0.1646, "step": 6980 }, { "epoch": 0.16297980239573315, "grad_norm": 2.0107274055480957, "learning_rate": 9.456879935335446e-07, "loss": 0.1474, "step": 6990 }, { "epoch": 0.16321296377255107, "grad_norm": 
1.2973233461380005, "learning_rate": 9.456102717154759e-07, "loss": 0.1611, "step": 7000 }, { "epoch": 0.16344612514936901, "grad_norm": 2.5696160793304443, "learning_rate": 9.455325498974072e-07, "loss": 0.1679, "step": 7010 }, { "epoch": 0.16367928652618693, "grad_norm": 1.3326283693313599, "learning_rate": 9.454548280793384e-07, "loss": 0.1521, "step": 7020 }, { "epoch": 0.16391244790300485, "grad_norm": 2.0902209281921387, "learning_rate": 9.453771062612697e-07, "loss": 0.1571, "step": 7030 }, { "epoch": 0.1641456092798228, "grad_norm": 1.42107355594635, "learning_rate": 9.452993844432008e-07, "loss": 0.1416, "step": 7040 }, { "epoch": 0.16437877065664072, "grad_norm": 4.336833953857422, "learning_rate": 9.45221662625132e-07, "loss": 0.1653, "step": 7050 }, { "epoch": 0.16461193203345867, "grad_norm": 1.1665539741516113, "learning_rate": 9.451439408070633e-07, "loss": 0.1593, "step": 7060 }, { "epoch": 0.1648450934102766, "grad_norm": 1.170278549194336, "learning_rate": 9.450662189889945e-07, "loss": 0.172, "step": 7070 }, { "epoch": 0.1650782547870945, "grad_norm": 1.4916417598724365, "learning_rate": 9.449884971709258e-07, "loss": 0.1512, "step": 7080 }, { "epoch": 0.16531141616391246, "grad_norm": 1.4977827072143555, "learning_rate": 9.44910775352857e-07, "loss": 0.1474, "step": 7090 }, { "epoch": 0.16554457754073038, "grad_norm": 2.0605146884918213, "learning_rate": 9.448330535347882e-07, "loss": 0.1616, "step": 7100 }, { "epoch": 0.1657777389175483, "grad_norm": 3.081062078475952, "learning_rate": 9.447553317167195e-07, "loss": 0.1601, "step": 7110 }, { "epoch": 0.16601090029436624, "grad_norm": 1.1439927816390991, "learning_rate": 9.446776098986507e-07, "loss": 0.169, "step": 7120 }, { "epoch": 0.16624406167118416, "grad_norm": 1.4317312240600586, "learning_rate": 9.445998880805819e-07, "loss": 0.1424, "step": 7130 }, { "epoch": 0.1664772230480021, "grad_norm": 1.9179667234420776, "learning_rate": 9.445221662625132e-07, "loss": 0.1545, "step": 7140 }, { 
"epoch": 0.16671038442482003, "grad_norm": 2.0563294887542725, "learning_rate": 9.444444444444444e-07, "loss": 0.1672, "step": 7150 }, { "epoch": 0.16694354580163795, "grad_norm": 1.6269872188568115, "learning_rate": 9.443667226263756e-07, "loss": 0.1658, "step": 7160 }, { "epoch": 0.1671767071784559, "grad_norm": 1.4441537857055664, "learning_rate": 9.442890008083068e-07, "loss": 0.1601, "step": 7170 }, { "epoch": 0.16740986855527382, "grad_norm": 1.5017067193984985, "learning_rate": 9.442112789902381e-07, "loss": 0.1506, "step": 7180 }, { "epoch": 0.16764302993209174, "grad_norm": 1.6619751453399658, "learning_rate": 9.441335571721694e-07, "loss": 0.1462, "step": 7190 }, { "epoch": 0.16787619130890968, "grad_norm": 1.2759132385253906, "learning_rate": 9.440558353541006e-07, "loss": 0.154, "step": 7200 }, { "epoch": 0.1681093526857276, "grad_norm": 2.8101918697357178, "learning_rate": 9.439781135360318e-07, "loss": 0.1424, "step": 7210 }, { "epoch": 0.16834251406254555, "grad_norm": 2.368046760559082, "learning_rate": 9.43900391717963e-07, "loss": 0.1593, "step": 7220 }, { "epoch": 0.16857567543936347, "grad_norm": 2.3706257343292236, "learning_rate": 9.438226698998942e-07, "loss": 0.1494, "step": 7230 }, { "epoch": 0.1688088368161814, "grad_norm": 5.882688999176025, "learning_rate": 9.437449480818255e-07, "loss": 0.168, "step": 7240 }, { "epoch": 0.16904199819299934, "grad_norm": 1.3065600395202637, "learning_rate": 9.436672262637567e-07, "loss": 0.1483, "step": 7250 }, { "epoch": 0.16927515956981726, "grad_norm": 1.4482512474060059, "learning_rate": 9.43589504445688e-07, "loss": 0.1641, "step": 7260 }, { "epoch": 0.16950832094663518, "grad_norm": 1.2670949697494507, "learning_rate": 9.435117826276192e-07, "loss": 0.1533, "step": 7270 }, { "epoch": 0.16974148232345312, "grad_norm": 4.050296306610107, "learning_rate": 9.434340608095505e-07, "loss": 0.151, "step": 7280 }, { "epoch": 0.16997464370027104, "grad_norm": 1.4965975284576416, "learning_rate": 
9.433563389914816e-07, "loss": 0.1401, "step": 7290 }, { "epoch": 0.170207805077089, "grad_norm": 1.2995047569274902, "learning_rate": 9.432786171734128e-07, "loss": 0.1603, "step": 7300 }, { "epoch": 0.1704409664539069, "grad_norm": 3.621626853942871, "learning_rate": 9.432008953553441e-07, "loss": 0.147, "step": 7310 }, { "epoch": 0.17067412783072483, "grad_norm": 1.5239312648773193, "learning_rate": 9.431231735372753e-07, "loss": 0.1483, "step": 7320 }, { "epoch": 0.17090728920754278, "grad_norm": 2.384829044342041, "learning_rate": 9.430454517192066e-07, "loss": 0.1493, "step": 7330 }, { "epoch": 0.1711404505843607, "grad_norm": 2.181385040283203, "learning_rate": 9.429677299011379e-07, "loss": 0.1505, "step": 7340 }, { "epoch": 0.17137361196117862, "grad_norm": 3.464315414428711, "learning_rate": 9.42890008083069e-07, "loss": 0.1502, "step": 7350 }, { "epoch": 0.17160677333799657, "grad_norm": 1.5752873420715332, "learning_rate": 9.428122862650003e-07, "loss": 0.1459, "step": 7360 }, { "epoch": 0.17183993471481449, "grad_norm": 2.8194162845611572, "learning_rate": 9.427345644469314e-07, "loss": 0.1616, "step": 7370 }, { "epoch": 0.17207309609163243, "grad_norm": 3.099531888961792, "learning_rate": 9.426568426288627e-07, "loss": 0.1541, "step": 7380 }, { "epoch": 0.17230625746845035, "grad_norm": 1.6780742406845093, "learning_rate": 9.42579120810794e-07, "loss": 0.1471, "step": 7390 }, { "epoch": 0.17253941884526827, "grad_norm": 1.7016980648040771, "learning_rate": 9.425013989927252e-07, "loss": 0.1642, "step": 7400 }, { "epoch": 0.17277258022208622, "grad_norm": 1.6954997777938843, "learning_rate": 9.424236771746564e-07, "loss": 0.146, "step": 7410 }, { "epoch": 0.17300574159890414, "grad_norm": 3.015798568725586, "learning_rate": 9.423459553565876e-07, "loss": 0.1626, "step": 7420 }, { "epoch": 0.17323890297572206, "grad_norm": 2.0872061252593994, "learning_rate": 9.422682335385189e-07, "loss": 0.1382, "step": 7430 }, { "epoch": 0.17347206435254, 
"grad_norm": 2.4494729042053223, "learning_rate": 9.421905117204502e-07, "loss": 0.161, "step": 7440 }, { "epoch": 0.17370522572935793, "grad_norm": 1.7108943462371826, "learning_rate": 9.421127899023813e-07, "loss": 0.1508, "step": 7450 }, { "epoch": 0.17393838710617587, "grad_norm": 2.768242120742798, "learning_rate": 9.420350680843126e-07, "loss": 0.1491, "step": 7460 }, { "epoch": 0.1741715484829938, "grad_norm": 1.6378484964370728, "learning_rate": 9.419573462662437e-07, "loss": 0.1604, "step": 7470 }, { "epoch": 0.1744047098598117, "grad_norm": 1.8406717777252197, "learning_rate": 9.41879624448175e-07, "loss": 0.163, "step": 7480 }, { "epoch": 0.17463787123662966, "grad_norm": 1.3872590065002441, "learning_rate": 9.418019026301063e-07, "loss": 0.1528, "step": 7490 }, { "epoch": 0.17487103261344758, "grad_norm": 5.152188301086426, "learning_rate": 9.417241808120375e-07, "loss": 0.1482, "step": 7500 }, { "epoch": 0.1751041939902655, "grad_norm": 1.5077937841415405, "learning_rate": 9.416464589939688e-07, "loss": 0.1539, "step": 7510 }, { "epoch": 0.17533735536708345, "grad_norm": 2.7160680294036865, "learning_rate": 9.415687371759001e-07, "loss": 0.1546, "step": 7520 }, { "epoch": 0.17557051674390137, "grad_norm": 1.3036359548568726, "learning_rate": 9.414910153578311e-07, "loss": 0.1547, "step": 7530 }, { "epoch": 0.17580367812071931, "grad_norm": 1.5408011674880981, "learning_rate": 9.414132935397624e-07, "loss": 0.154, "step": 7540 }, { "epoch": 0.17603683949753723, "grad_norm": 1.91024649143219, "learning_rate": 9.413355717216936e-07, "loss": 0.1547, "step": 7550 }, { "epoch": 0.17627000087435515, "grad_norm": 1.8871649503707886, "learning_rate": 9.412578499036249e-07, "loss": 0.1598, "step": 7560 }, { "epoch": 0.1765031622511731, "grad_norm": 1.8532320261001587, "learning_rate": 9.411801280855562e-07, "loss": 0.1389, "step": 7570 }, { "epoch": 0.17673632362799102, "grad_norm": 2.0961625576019287, "learning_rate": 9.411024062674874e-07, "loss": 0.1526, 
"step": 7580 }, { "epoch": 0.17696948500480894, "grad_norm": 1.6443394422531128, "learning_rate": 9.410246844494187e-07, "loss": 0.1526, "step": 7590 }, { "epoch": 0.1772026463816269, "grad_norm": 1.874780297279358, "learning_rate": 9.409469626313498e-07, "loss": 0.1428, "step": 7600 }, { "epoch": 0.1774358077584448, "grad_norm": 2.092271327972412, "learning_rate": 9.40869240813281e-07, "loss": 0.1606, "step": 7610 }, { "epoch": 0.17766896913526276, "grad_norm": 2.489023208618164, "learning_rate": 9.407915189952123e-07, "loss": 0.1429, "step": 7620 }, { "epoch": 0.17790213051208068, "grad_norm": 1.5462641716003418, "learning_rate": 9.407137971771435e-07, "loss": 0.1603, "step": 7630 }, { "epoch": 0.1781352918888986, "grad_norm": 2.3991451263427734, "learning_rate": 9.406360753590748e-07, "loss": 0.1476, "step": 7640 }, { "epoch": 0.17836845326571654, "grad_norm": 1.327267050743103, "learning_rate": 9.40558353541006e-07, "loss": 0.1677, "step": 7650 }, { "epoch": 0.17860161464253446, "grad_norm": 1.8814960718154907, "learning_rate": 9.404806317229372e-07, "loss": 0.1413, "step": 7660 }, { "epoch": 0.17883477601935238, "grad_norm": 2.0961084365844727, "learning_rate": 9.404029099048685e-07, "loss": 0.1568, "step": 7670 }, { "epoch": 0.17906793739617033, "grad_norm": 2.1892812252044678, "learning_rate": 9.403251880867997e-07, "loss": 0.1513, "step": 7680 }, { "epoch": 0.17930109877298825, "grad_norm": 1.962180495262146, "learning_rate": 9.40247466268731e-07, "loss": 0.1578, "step": 7690 }, { "epoch": 0.1795342601498062, "grad_norm": 1.4959896802902222, "learning_rate": 9.401697444506621e-07, "loss": 0.1534, "step": 7700 }, { "epoch": 0.17976742152662412, "grad_norm": 1.7928518056869507, "learning_rate": 9.400920226325934e-07, "loss": 0.1454, "step": 7710 }, { "epoch": 0.18000058290344204, "grad_norm": 2.734424114227295, "learning_rate": 9.400143008145246e-07, "loss": 0.1558, "step": 7720 }, { "epoch": 0.18023374428025998, "grad_norm": 1.303520917892456, 
"learning_rate": 9.399365789964558e-07, "loss": 0.1622, "step": 7730 }, { "epoch": 0.1804669056570779, "grad_norm": 1.384732723236084, "learning_rate": 9.398588571783871e-07, "loss": 0.1512, "step": 7740 }, { "epoch": 0.18070006703389582, "grad_norm": 2.1931910514831543, "learning_rate": 9.397811353603183e-07, "loss": 0.1474, "step": 7750 }, { "epoch": 0.18093322841071377, "grad_norm": 2.2104551792144775, "learning_rate": 9.397034135422496e-07, "loss": 0.1541, "step": 7760 }, { "epoch": 0.1811663897875317, "grad_norm": 1.7296971082687378, "learning_rate": 9.396256917241809e-07, "loss": 0.1625, "step": 7770 }, { "epoch": 0.18139955116434964, "grad_norm": 1.3442267179489136, "learning_rate": 9.395479699061119e-07, "loss": 0.1546, "step": 7780 }, { "epoch": 0.18163271254116756, "grad_norm": 1.5306183099746704, "learning_rate": 9.394702480880432e-07, "loss": 0.1447, "step": 7790 }, { "epoch": 0.18186587391798548, "grad_norm": 2.7658700942993164, "learning_rate": 9.393925262699744e-07, "loss": 0.1687, "step": 7800 }, { "epoch": 0.18209903529480342, "grad_norm": 1.5549894571304321, "learning_rate": 9.393148044519057e-07, "loss": 0.1607, "step": 7810 }, { "epoch": 0.18233219667162134, "grad_norm": 1.3501743078231812, "learning_rate": 9.39237082633837e-07, "loss": 0.1423, "step": 7820 }, { "epoch": 0.18256535804843926, "grad_norm": 1.5631622076034546, "learning_rate": 9.391593608157682e-07, "loss": 0.1431, "step": 7830 }, { "epoch": 0.1827985194252572, "grad_norm": 1.3663079738616943, "learning_rate": 9.390816389976994e-07, "loss": 0.1395, "step": 7840 }, { "epoch": 0.18303168080207513, "grad_norm": 1.3509570360183716, "learning_rate": 9.390039171796306e-07, "loss": 0.1493, "step": 7850 }, { "epoch": 0.18326484217889308, "grad_norm": 1.9293618202209473, "learning_rate": 9.389261953615618e-07, "loss": 0.1507, "step": 7860 }, { "epoch": 0.183498003555711, "grad_norm": 1.9715359210968018, "learning_rate": 9.388484735434931e-07, "loss": 0.1449, "step": 7870 }, { "epoch": 
0.18373116493252892, "grad_norm": 1.9132298231124878, "learning_rate": 9.387707517254243e-07, "loss": 0.1523, "step": 7880 }, { "epoch": 0.18396432630934687, "grad_norm": 2.2099478244781494, "learning_rate": 9.386930299073556e-07, "loss": 0.1538, "step": 7890 }, { "epoch": 0.18419748768616478, "grad_norm": 3.5421242713928223, "learning_rate": 9.386153080892867e-07, "loss": 0.149, "step": 7900 }, { "epoch": 0.1844306490629827, "grad_norm": 4.678369522094727, "learning_rate": 9.38537586271218e-07, "loss": 0.1563, "step": 7910 }, { "epoch": 0.18466381043980065, "grad_norm": 3.2481913566589355, "learning_rate": 9.384598644531493e-07, "loss": 0.1638, "step": 7920 }, { "epoch": 0.18489697181661857, "grad_norm": 1.6805490255355835, "learning_rate": 9.383821426350805e-07, "loss": 0.1555, "step": 7930 }, { "epoch": 0.18513013319343652, "grad_norm": 3.014176368713379, "learning_rate": 9.383044208170117e-07, "loss": 0.1531, "step": 7940 }, { "epoch": 0.18536329457025444, "grad_norm": 1.9931023120880127, "learning_rate": 9.38226698998943e-07, "loss": 0.1624, "step": 7950 }, { "epoch": 0.18559645594707236, "grad_norm": 1.5759021043777466, "learning_rate": 9.381489771808741e-07, "loss": 0.1457, "step": 7960 }, { "epoch": 0.1858296173238903, "grad_norm": 1.2108467817306519, "learning_rate": 9.380712553628054e-07, "loss": 0.1592, "step": 7970 }, { "epoch": 0.18606277870070823, "grad_norm": 2.2080326080322266, "learning_rate": 9.379935335447366e-07, "loss": 0.153, "step": 7980 }, { "epoch": 0.18629594007752615, "grad_norm": 1.967830777168274, "learning_rate": 9.379158117266679e-07, "loss": 0.1472, "step": 7990 }, { "epoch": 0.1865291014543441, "grad_norm": 1.6648043394088745, "learning_rate": 9.378380899085992e-07, "loss": 0.1469, "step": 8000 }, { "epoch": 0.186762262831162, "grad_norm": 1.8145850896835327, "learning_rate": 9.377603680905304e-07, "loss": 0.1729, "step": 8010 }, { "epoch": 0.18699542420797996, "grad_norm": 1.3066741228103638, "learning_rate": 9.376826462724615e-07, 
"loss": 0.1455, "step": 8020 }, { "epoch": 0.18722858558479788, "grad_norm": 1.2605128288269043, "learning_rate": 9.376049244543927e-07, "loss": 0.1462, "step": 8030 }, { "epoch": 0.1874617469616158, "grad_norm": 1.292197346687317, "learning_rate": 9.37527202636324e-07, "loss": 0.1504, "step": 8040 }, { "epoch": 0.18769490833843375, "grad_norm": 1.9413094520568848, "learning_rate": 9.374494808182553e-07, "loss": 0.1528, "step": 8050 }, { "epoch": 0.18792806971525167, "grad_norm": 3.126004219055176, "learning_rate": 9.373717590001865e-07, "loss": 0.1565, "step": 8060 }, { "epoch": 0.1881612310920696, "grad_norm": 1.3407833576202393, "learning_rate": 9.372940371821178e-07, "loss": 0.1408, "step": 8070 }, { "epoch": 0.18839439246888753, "grad_norm": 1.4672553539276123, "learning_rate": 9.37216315364049e-07, "loss": 0.1493, "step": 8080 }, { "epoch": 0.18862755384570545, "grad_norm": 1.2751408815383911, "learning_rate": 9.371385935459802e-07, "loss": 0.1426, "step": 8090 }, { "epoch": 0.1888607152225234, "grad_norm": 1.7610366344451904, "learning_rate": 9.370686439097183e-07, "loss": 0.1487, "step": 8100 }, { "epoch": 0.18909387659934132, "grad_norm": 2.191973924636841, "learning_rate": 9.369909220916496e-07, "loss": 0.1432, "step": 8110 }, { "epoch": 0.18932703797615924, "grad_norm": 2.600963592529297, "learning_rate": 9.369132002735807e-07, "loss": 0.1501, "step": 8120 }, { "epoch": 0.1895601993529772, "grad_norm": 1.3398696184158325, "learning_rate": 9.36835478455512e-07, "loss": 0.1347, "step": 8130 }, { "epoch": 0.1897933607297951, "grad_norm": 2.6403112411499023, "learning_rate": 9.367577566374433e-07, "loss": 0.163, "step": 8140 }, { "epoch": 0.19002652210661303, "grad_norm": 3.0261974334716797, "learning_rate": 9.366800348193744e-07, "loss": 0.1469, "step": 8150 }, { "epoch": 0.19025968348343097, "grad_norm": 1.4897034168243408, "learning_rate": 9.366023130013057e-07, "loss": 0.1658, "step": 8160 }, { "epoch": 0.1904928448602489, "grad_norm": 2.115251064300537, 
"learning_rate": 9.365245911832369e-07, "loss": 0.1596, "step": 8170 }, { "epoch": 0.19072600623706684, "grad_norm": 3.3765451908111572, "learning_rate": 9.364468693651681e-07, "loss": 0.1447, "step": 8180 }, { "epoch": 0.19095916761388476, "grad_norm": 3.542462110519409, "learning_rate": 9.363691475470994e-07, "loss": 0.1519, "step": 8190 }, { "epoch": 0.19119232899070268, "grad_norm": 1.777998924255371, "learning_rate": 9.362914257290306e-07, "loss": 0.1489, "step": 8200 }, { "epoch": 0.19142549036752063, "grad_norm": 3.2621724605560303, "learning_rate": 9.362137039109619e-07, "loss": 0.1474, "step": 8210 }, { "epoch": 0.19165865174433855, "grad_norm": 1.277667760848999, "learning_rate": 9.361359820928932e-07, "loss": 0.1575, "step": 8220 }, { "epoch": 0.19189181312115647, "grad_norm": 1.4768494367599487, "learning_rate": 9.360582602748243e-07, "loss": 0.1461, "step": 8230 }, { "epoch": 0.19212497449797442, "grad_norm": 2.1935226917266846, "learning_rate": 9.359805384567555e-07, "loss": 0.1477, "step": 8240 }, { "epoch": 0.19235813587479234, "grad_norm": 1.9323339462280273, "learning_rate": 9.359028166386867e-07, "loss": 0.1483, "step": 8250 }, { "epoch": 0.19259129725161028, "grad_norm": 2.247521162033081, "learning_rate": 9.35825094820618e-07, "loss": 0.1544, "step": 8260 }, { "epoch": 0.1928244586284282, "grad_norm": 1.5650278329849243, "learning_rate": 9.357473730025492e-07, "loss": 0.1457, "step": 8270 }, { "epoch": 0.19305762000524612, "grad_norm": 1.9073244333267212, "learning_rate": 9.356696511844805e-07, "loss": 0.1396, "step": 8280 }, { "epoch": 0.19329078138206407, "grad_norm": 1.4939764738082886, "learning_rate": 9.355919293664118e-07, "loss": 0.1447, "step": 8290 }, { "epoch": 0.193523942758882, "grad_norm": 1.868532419204712, "learning_rate": 9.355142075483429e-07, "loss": 0.1504, "step": 8300 }, { "epoch": 0.1937571041356999, "grad_norm": 1.2652534246444702, "learning_rate": 9.354364857302741e-07, "loss": 0.155, "step": 8310 }, { "epoch": 
0.19399026551251786, "grad_norm": 1.265137791633606, "learning_rate": 9.353587639122053e-07, "loss": 0.1428, "step": 8320 }, { "epoch": 0.19422342688933578, "grad_norm": 1.535997748374939, "learning_rate": 9.352810420941366e-07, "loss": 0.1611, "step": 8330 }, { "epoch": 0.19445658826615372, "grad_norm": 1.5537620782852173, "learning_rate": 9.352033202760679e-07, "loss": 0.1538, "step": 8340 }, { "epoch": 0.19468974964297164, "grad_norm": 1.1947669982910156, "learning_rate": 9.351255984579991e-07, "loss": 0.146, "step": 8350 }, { "epoch": 0.19492291101978956, "grad_norm": 1.8313277959823608, "learning_rate": 9.350478766399304e-07, "loss": 0.1424, "step": 8360 }, { "epoch": 0.1951560723966075, "grad_norm": 2.1346278190612793, "learning_rate": 9.349701548218616e-07, "loss": 0.1591, "step": 8370 }, { "epoch": 0.19538923377342543, "grad_norm": 1.492759108543396, "learning_rate": 9.348924330037928e-07, "loss": 0.1479, "step": 8380 }, { "epoch": 0.19562239515024335, "grad_norm": 1.981916904449463, "learning_rate": 9.34814711185724e-07, "loss": 0.1514, "step": 8390 }, { "epoch": 0.1958555565270613, "grad_norm": 2.7780840396881104, "learning_rate": 9.347369893676552e-07, "loss": 0.1539, "step": 8400 }, { "epoch": 0.19608871790387922, "grad_norm": 4.1791791915893555, "learning_rate": 9.346592675495865e-07, "loss": 0.1527, "step": 8410 }, { "epoch": 0.19632187928069716, "grad_norm": 2.0088164806365967, "learning_rate": 9.345815457315178e-07, "loss": 0.1615, "step": 8420 }, { "epoch": 0.19655504065751508, "grad_norm": 1.9935766458511353, "learning_rate": 9.345038239134489e-07, "loss": 0.1369, "step": 8430 }, { "epoch": 0.196788202034333, "grad_norm": 1.5870132446289062, "learning_rate": 9.344261020953802e-07, "loss": 0.1547, "step": 8440 }, { "epoch": 0.19702136341115095, "grad_norm": 1.9097555875778198, "learning_rate": 9.343483802773114e-07, "loss": 0.1476, "step": 8450 }, { "epoch": 0.19725452478796887, "grad_norm": 2.3809866905212402, "learning_rate": 
9.342706584592427e-07, "loss": 0.1694, "step": 8460 }, { "epoch": 0.1974876861647868, "grad_norm": 1.8943169116973877, "learning_rate": 9.341929366411739e-07, "loss": 0.1565, "step": 8470 }, { "epoch": 0.19772084754160474, "grad_norm": 1.2661058902740479, "learning_rate": 9.341152148231051e-07, "loss": 0.1363, "step": 8480 }, { "epoch": 0.19795400891842266, "grad_norm": 1.3663382530212402, "learning_rate": 9.340374930050363e-07, "loss": 0.1571, "step": 8490 }, { "epoch": 0.1981871702952406, "grad_norm": 3.1252355575561523, "learning_rate": 9.339597711869675e-07, "loss": 0.1365, "step": 8500 }, { "epoch": 0.19842033167205853, "grad_norm": 2.4458625316619873, "learning_rate": 9.338820493688988e-07, "loss": 0.1472, "step": 8510 }, { "epoch": 0.19865349304887644, "grad_norm": 2.0193874835968018, "learning_rate": 9.338043275508301e-07, "loss": 0.1568, "step": 8520 }, { "epoch": 0.1988866544256944, "grad_norm": 1.2263933420181274, "learning_rate": 9.337266057327613e-07, "loss": 0.1575, "step": 8530 }, { "epoch": 0.1991198158025123, "grad_norm": 1.7741447687149048, "learning_rate": 9.336488839146926e-07, "loss": 0.1519, "step": 8540 }, { "epoch": 0.19935297717933023, "grad_norm": 2.196488380432129, "learning_rate": 9.335711620966236e-07, "loss": 0.1292, "step": 8550 }, { "epoch": 0.19958613855614818, "grad_norm": 1.4107648134231567, "learning_rate": 9.334934402785549e-07, "loss": 0.1413, "step": 8560 }, { "epoch": 0.1998192999329661, "grad_norm": 1.305922269821167, "learning_rate": 9.334157184604862e-07, "loss": 0.1391, "step": 8570 }, { "epoch": 0.20005246130978405, "grad_norm": 1.4912019968032837, "learning_rate": 9.333379966424174e-07, "loss": 0.1597, "step": 8580 }, { "epoch": 0.20028562268660197, "grad_norm": 1.8472713232040405, "learning_rate": 9.332602748243487e-07, "loss": 0.1593, "step": 8590 }, { "epoch": 0.20051878406341989, "grad_norm": 1.220266580581665, "learning_rate": 9.3318255300628e-07, "loss": 0.151, "step": 8600 }, { "epoch": 0.20075194544023783, 
"grad_norm": 2.3179144859313965, "learning_rate": 9.331048311882111e-07, "loss": 0.1583, "step": 8610 }, { "epoch": 0.20098510681705575, "grad_norm": 2.9554812908172607, "learning_rate": 9.330271093701424e-07, "loss": 0.161, "step": 8620 }, { "epoch": 0.20121826819387367, "grad_norm": 2.7648608684539795, "learning_rate": 9.329493875520735e-07, "loss": 0.1505, "step": 8630 }, { "epoch": 0.20145142957069162, "grad_norm": 1.9427447319030762, "learning_rate": 9.328716657340048e-07, "loss": 0.1549, "step": 8640 }, { "epoch": 0.20168459094750954, "grad_norm": 1.8562884330749512, "learning_rate": 9.32793943915936e-07, "loss": 0.1483, "step": 8650 }, { "epoch": 0.2019177523243275, "grad_norm": 2.9700381755828857, "learning_rate": 9.327162220978673e-07, "loss": 0.1504, "step": 8660 }, { "epoch": 0.2021509137011454, "grad_norm": 3.4886834621429443, "learning_rate": 9.326385002797985e-07, "loss": 0.153, "step": 8670 }, { "epoch": 0.20238407507796333, "grad_norm": 1.3754165172576904, "learning_rate": 9.325607784617297e-07, "loss": 0.1479, "step": 8680 }, { "epoch": 0.20261723645478127, "grad_norm": 4.473998546600342, "learning_rate": 9.32483056643661e-07, "loss": 0.1488, "step": 8690 }, { "epoch": 0.2028503978315992, "grad_norm": 3.4315035343170166, "learning_rate": 9.324053348255923e-07, "loss": 0.1506, "step": 8700 }, { "epoch": 0.2030835592084171, "grad_norm": 1.8070228099822998, "learning_rate": 9.323276130075234e-07, "loss": 0.1569, "step": 8710 }, { "epoch": 0.20331672058523506, "grad_norm": 1.6614954471588135, "learning_rate": 9.322498911894547e-07, "loss": 0.1488, "step": 8720 }, { "epoch": 0.20354988196205298, "grad_norm": 2.277071475982666, "learning_rate": 9.321721693713858e-07, "loss": 0.1452, "step": 8730 }, { "epoch": 0.20378304333887093, "grad_norm": 1.3845945596694946, "learning_rate": 9.320944475533171e-07, "loss": 0.1437, "step": 8740 }, { "epoch": 0.20401620471568885, "grad_norm": 1.088711142539978, "learning_rate": 9.320167257352484e-07, "loss": 0.1434, 
"step": 8750 }, { "epoch": 0.20424936609250677, "grad_norm": 2.605365037918091, "learning_rate": 9.319390039171796e-07, "loss": 0.1361, "step": 8760 }, { "epoch": 0.20448252746932472, "grad_norm": 2.295210361480713, "learning_rate": 9.318612820991109e-07, "loss": 0.1465, "step": 8770 }, { "epoch": 0.20471568884614263, "grad_norm": 1.8187580108642578, "learning_rate": 9.317835602810421e-07, "loss": 0.1628, "step": 8780 }, { "epoch": 0.20494885022296055, "grad_norm": 1.2988193035125732, "learning_rate": 9.317058384629732e-07, "loss": 0.1576, "step": 8790 }, { "epoch": 0.2051820115997785, "grad_norm": 1.6089692115783691, "learning_rate": 9.316281166449044e-07, "loss": 0.1696, "step": 8800 }, { "epoch": 0.20541517297659642, "grad_norm": 2.543070077896118, "learning_rate": 9.315503948268357e-07, "loss": 0.1682, "step": 8810 }, { "epoch": 0.20564833435341437, "grad_norm": 2.5402538776397705, "learning_rate": 9.31472673008767e-07, "loss": 0.1392, "step": 8820 }, { "epoch": 0.2058814957302323, "grad_norm": 1.1992591619491577, "learning_rate": 9.313949511906982e-07, "loss": 0.1503, "step": 8830 }, { "epoch": 0.2061146571070502, "grad_norm": 1.3666008710861206, "learning_rate": 9.313172293726295e-07, "loss": 0.1513, "step": 8840 }, { "epoch": 0.20634781848386816, "grad_norm": 2.7348570823669434, "learning_rate": 9.312395075545608e-07, "loss": 0.151, "step": 8850 }, { "epoch": 0.20658097986068608, "grad_norm": 1.347878098487854, "learning_rate": 9.311617857364919e-07, "loss": 0.1485, "step": 8860 }, { "epoch": 0.206814141237504, "grad_norm": 1.3594149351119995, "learning_rate": 9.310840639184231e-07, "loss": 0.1439, "step": 8870 }, { "epoch": 0.20704730261432194, "grad_norm": 1.728659987449646, "learning_rate": 9.310063421003543e-07, "loss": 0.1478, "step": 8880 }, { "epoch": 0.20728046399113986, "grad_norm": 4.519781112670898, "learning_rate": 9.309286202822856e-07, "loss": 0.1675, "step": 8890 }, { "epoch": 0.2075136253679578, "grad_norm": 2.210249662399292, 
"learning_rate": 9.308508984642169e-07, "loss": 0.1494, "step": 8900 }, { "epoch": 0.20774678674477573, "grad_norm": 2.8648438453674316, "learning_rate": 9.307731766461481e-07, "loss": 0.1547, "step": 8910 }, { "epoch": 0.20797994812159365, "grad_norm": 1.5171868801116943, "learning_rate": 9.306954548280793e-07, "loss": 0.1541, "step": 8920 }, { "epoch": 0.2082131094984116, "grad_norm": 2.6116392612457275, "learning_rate": 9.306177330100105e-07, "loss": 0.1524, "step": 8930 }, { "epoch": 0.20844627087522952, "grad_norm": 2.3050506114959717, "learning_rate": 9.305400111919418e-07, "loss": 0.1434, "step": 8940 }, { "epoch": 0.20867943225204744, "grad_norm": 2.830449342727661, "learning_rate": 9.30462289373873e-07, "loss": 0.1386, "step": 8950 }, { "epoch": 0.20891259362886538, "grad_norm": 1.012464165687561, "learning_rate": 9.303845675558042e-07, "loss": 0.1439, "step": 8960 }, { "epoch": 0.2091457550056833, "grad_norm": 3.448075771331787, "learning_rate": 9.303068457377355e-07, "loss": 0.1515, "step": 8970 }, { "epoch": 0.20937891638250125, "grad_norm": 1.9270639419555664, "learning_rate": 9.302291239196666e-07, "loss": 0.1529, "step": 8980 }, { "epoch": 0.20961207775931917, "grad_norm": 1.5684973001480103, "learning_rate": 9.301514021015979e-07, "loss": 0.1405, "step": 8990 }, { "epoch": 0.2098452391361371, "grad_norm": 4.62182092666626, "learning_rate": 9.300736802835292e-07, "loss": 0.1498, "step": 9000 }, { "epoch": 0.21007840051295504, "grad_norm": 2.4910435676574707, "learning_rate": 9.299959584654604e-07, "loss": 0.1427, "step": 9010 }, { "epoch": 0.21031156188977296, "grad_norm": 4.836916446685791, "learning_rate": 9.299182366473917e-07, "loss": 0.1465, "step": 9020 }, { "epoch": 0.21054472326659088, "grad_norm": 2.696051597595215, "learning_rate": 9.298405148293228e-07, "loss": 0.1491, "step": 9030 }, { "epoch": 0.21077788464340882, "grad_norm": 1.8219000101089478, "learning_rate": 9.29762793011254e-07, "loss": 0.1467, "step": 9040 }, { "epoch": 
0.21101104602022674, "grad_norm": 2.5652685165405273, "learning_rate": 9.296850711931853e-07, "loss": 0.1422, "step": 9050 }, { "epoch": 0.2112442073970447, "grad_norm": 2.3747947216033936, "learning_rate": 9.296073493751165e-07, "loss": 0.1457, "step": 9060 }, { "epoch": 0.2114773687738626, "grad_norm": 1.3203353881835938, "learning_rate": 9.295296275570478e-07, "loss": 0.1553, "step": 9070 }, { "epoch": 0.21171053015068053, "grad_norm": 1.3061962127685547, "learning_rate": 9.29451905738979e-07, "loss": 0.1418, "step": 9080 }, { "epoch": 0.21194369152749848, "grad_norm": 4.774388313293457, "learning_rate": 9.293741839209103e-07, "loss": 0.159, "step": 9090 }, { "epoch": 0.2121768529043164, "grad_norm": 2.2395336627960205, "learning_rate": 9.292964621028415e-07, "loss": 0.1554, "step": 9100 }, { "epoch": 0.21241001428113432, "grad_norm": 1.5927256345748901, "learning_rate": 9.292187402847726e-07, "loss": 0.1341, "step": 9110 }, { "epoch": 0.21264317565795227, "grad_norm": 1.3823539018630981, "learning_rate": 9.291410184667039e-07, "loss": 0.1498, "step": 9120 }, { "epoch": 0.21287633703477019, "grad_norm": 2.140763521194458, "learning_rate": 9.290632966486351e-07, "loss": 0.1458, "step": 9130 }, { "epoch": 0.21310949841158813, "grad_norm": 1.892457127571106, "learning_rate": 9.289855748305664e-07, "loss": 0.1442, "step": 9140 }, { "epoch": 0.21334265978840605, "grad_norm": 1.8909987211227417, "learning_rate": 9.289078530124977e-07, "loss": 0.1469, "step": 9150 }, { "epoch": 0.21357582116522397, "grad_norm": 3.575843334197998, "learning_rate": 9.288301311944288e-07, "loss": 0.1487, "step": 9160 }, { "epoch": 0.21380898254204192, "grad_norm": 3.3197789192199707, "learning_rate": 9.287524093763601e-07, "loss": 0.1432, "step": 9170 }, { "epoch": 0.21404214391885984, "grad_norm": 1.3969051837921143, "learning_rate": 9.286746875582914e-07, "loss": 0.1474, "step": 9180 }, { "epoch": 0.21427530529567776, "grad_norm": 1.5786453485488892, "learning_rate": 
9.285969657402225e-07, "loss": 0.1381, "step": 9190 }, { "epoch": 0.2145084666724957, "grad_norm": 2.150561571121216, "learning_rate": 9.285192439221538e-07, "loss": 0.1541, "step": 9200 }, { "epoch": 0.21474162804931363, "grad_norm": 2.0472328662872314, "learning_rate": 9.28441522104085e-07, "loss": 0.1457, "step": 9210 }, { "epoch": 0.21497478942613157, "grad_norm": 2.287034034729004, "learning_rate": 9.283638002860162e-07, "loss": 0.1355, "step": 9220 }, { "epoch": 0.2152079508029495, "grad_norm": 1.176141619682312, "learning_rate": 9.282860784679475e-07, "loss": 0.1255, "step": 9230 }, { "epoch": 0.2154411121797674, "grad_norm": 1.4322068691253662, "learning_rate": 9.282083566498787e-07, "loss": 0.139, "step": 9240 }, { "epoch": 0.21567427355658536, "grad_norm": 1.2216660976409912, "learning_rate": 9.2813063483181e-07, "loss": 0.1586, "step": 9250 }, { "epoch": 0.21590743493340328, "grad_norm": 2.748776435852051, "learning_rate": 9.280529130137412e-07, "loss": 0.1443, "step": 9260 }, { "epoch": 0.2161405963102212, "grad_norm": 1.7435873746871948, "learning_rate": 9.279751911956724e-07, "loss": 0.1599, "step": 9270 }, { "epoch": 0.21637375768703915, "grad_norm": 3.044417381286621, "learning_rate": 9.278974693776037e-07, "loss": 0.1617, "step": 9280 }, { "epoch": 0.21660691906385707, "grad_norm": 2.5143723487854004, "learning_rate": 9.278197475595348e-07, "loss": 0.1537, "step": 9290 }, { "epoch": 0.21684008044067501, "grad_norm": 3.3021161556243896, "learning_rate": 9.277420257414661e-07, "loss": 0.1392, "step": 9300 }, { "epoch": 0.21707324181749293, "grad_norm": 1.6611179113388062, "learning_rate": 9.276643039233973e-07, "loss": 0.1518, "step": 9310 }, { "epoch": 0.21730640319431085, "grad_norm": 2.304503917694092, "learning_rate": 9.275865821053286e-07, "loss": 0.1393, "step": 9320 }, { "epoch": 0.2175395645711288, "grad_norm": 4.874322414398193, "learning_rate": 9.275088602872599e-07, "loss": 0.1669, "step": 9330 }, { "epoch": 0.21777272594794672, 
"grad_norm": 2.411674976348877, "learning_rate": 9.274311384691911e-07, "loss": 0.1501, "step": 9340 }, { "epoch": 0.21800588732476464, "grad_norm": 3.037764072418213, "learning_rate": 9.273534166511222e-07, "loss": 0.148, "step": 9350 }, { "epoch": 0.2182390487015826, "grad_norm": 1.5102736949920654, "learning_rate": 9.272756948330534e-07, "loss": 0.1562, "step": 9360 }, { "epoch": 0.2184722100784005, "grad_norm": 2.0648698806762695, "learning_rate": 9.271979730149847e-07, "loss": 0.153, "step": 9370 }, { "epoch": 0.21870537145521846, "grad_norm": 1.6346042156219482, "learning_rate": 9.27120251196916e-07, "loss": 0.1472, "step": 9380 }, { "epoch": 0.21893853283203638, "grad_norm": 2.679522752761841, "learning_rate": 9.270425293788472e-07, "loss": 0.1557, "step": 9390 }, { "epoch": 0.2191716942088543, "grad_norm": 4.663761615753174, "learning_rate": 9.269648075607785e-07, "loss": 0.1372, "step": 9400 }, { "epoch": 0.21940485558567224, "grad_norm": 1.362194299697876, "learning_rate": 9.268870857427096e-07, "loss": 0.1509, "step": 9410 }, { "epoch": 0.21963801696249016, "grad_norm": 1.5817477703094482, "learning_rate": 9.268093639246409e-07, "loss": 0.1601, "step": 9420 }, { "epoch": 0.21987117833930808, "grad_norm": 1.534424066543579, "learning_rate": 9.267316421065721e-07, "loss": 0.1547, "step": 9430 }, { "epoch": 0.22010433971612603, "grad_norm": 1.3228341341018677, "learning_rate": 9.266539202885033e-07, "loss": 0.1496, "step": 9440 }, { "epoch": 0.22033750109294395, "grad_norm": 2.010911703109741, "learning_rate": 9.265761984704346e-07, "loss": 0.1419, "step": 9450 }, { "epoch": 0.2205706624697619, "grad_norm": 2.899832010269165, "learning_rate": 9.264984766523658e-07, "loss": 0.1449, "step": 9460 }, { "epoch": 0.22080382384657982, "grad_norm": 2.3482682704925537, "learning_rate": 9.26420754834297e-07, "loss": 0.151, "step": 9470 }, { "epoch": 0.22103698522339774, "grad_norm": 2.7036728858947754, "learning_rate": 9.263430330162283e-07, "loss": 0.1553, "step": 
9480 }, { "epoch": 0.22127014660021568, "grad_norm": 2.623563766479492, "learning_rate": 9.262653111981595e-07, "loss": 0.1486, "step": 9490 }, { "epoch": 0.2215033079770336, "grad_norm": 1.708394169807434, "learning_rate": 9.261875893800908e-07, "loss": 0.1525, "step": 9500 }, { "epoch": 0.22173646935385152, "grad_norm": 1.7892268896102905, "learning_rate": 9.261098675620219e-07, "loss": 0.1564, "step": 9510 }, { "epoch": 0.22196963073066947, "grad_norm": 1.5792391300201416, "learning_rate": 9.260321457439532e-07, "loss": 0.1402, "step": 9520 }, { "epoch": 0.2222027921074874, "grad_norm": 2.500957489013672, "learning_rate": 9.259544239258844e-07, "loss": 0.1391, "step": 9530 }, { "epoch": 0.22243595348430534, "grad_norm": 2.0898847579956055, "learning_rate": 9.258767021078156e-07, "loss": 0.1487, "step": 9540 }, { "epoch": 0.22266911486112326, "grad_norm": 1.519158124923706, "learning_rate": 9.257989802897469e-07, "loss": 0.1484, "step": 9550 }, { "epoch": 0.22290227623794118, "grad_norm": 1.6454635858535767, "learning_rate": 9.257212584716781e-07, "loss": 0.1494, "step": 9560 }, { "epoch": 0.22313543761475912, "grad_norm": 2.2545948028564453, "learning_rate": 9.256435366536094e-07, "loss": 0.1445, "step": 9570 }, { "epoch": 0.22336859899157704, "grad_norm": 2.098590850830078, "learning_rate": 9.255658148355407e-07, "loss": 0.1484, "step": 9580 }, { "epoch": 0.22360176036839496, "grad_norm": 1.9312752485275269, "learning_rate": 9.254880930174717e-07, "loss": 0.1516, "step": 9590 }, { "epoch": 0.2238349217452129, "grad_norm": 1.1534717082977295, "learning_rate": 9.25410371199403e-07, "loss": 0.1509, "step": 9600 }, { "epoch": 0.22406808312203083, "grad_norm": 3.699556827545166, "learning_rate": 9.253326493813342e-07, "loss": 0.1396, "step": 9610 }, { "epoch": 0.22430124449884878, "grad_norm": 1.7980389595031738, "learning_rate": 9.252549275632655e-07, "loss": 0.1447, "step": 9620 }, { "epoch": 0.2245344058756667, "grad_norm": 3.442178726196289, "learning_rate": 
9.251772057451968e-07, "loss": 0.1436, "step": 9630 }, { "epoch": 0.22476756725248462, "grad_norm": 1.3087013959884644, "learning_rate": 9.25099483927128e-07, "loss": 0.1494, "step": 9640 }, { "epoch": 0.22500072862930257, "grad_norm": 1.4020062685012817, "learning_rate": 9.250217621090593e-07, "loss": 0.143, "step": 9650 }, { "epoch": 0.22523389000612049, "grad_norm": 2.3507421016693115, "learning_rate": 9.249440402909905e-07, "loss": 0.1358, "step": 9660 }, { "epoch": 0.2254670513829384, "grad_norm": 1.2362852096557617, "learning_rate": 9.248663184729216e-07, "loss": 0.1626, "step": 9670 }, { "epoch": 0.22570021275975635, "grad_norm": 2.4332878589630127, "learning_rate": 9.247885966548529e-07, "loss": 0.1427, "step": 9680 }, { "epoch": 0.22593337413657427, "grad_norm": 2.751542329788208, "learning_rate": 9.247108748367841e-07, "loss": 0.1389, "step": 9690 }, { "epoch": 0.22616653551339222, "grad_norm": 2.7197155952453613, "learning_rate": 9.246331530187154e-07, "loss": 0.1392, "step": 9700 }, { "epoch": 0.22639969689021014, "grad_norm": 2.6909339427948, "learning_rate": 9.245554312006467e-07, "loss": 0.1587, "step": 9710 }, { "epoch": 0.22663285826702806, "grad_norm": 1.3968504667282104, "learning_rate": 9.244777093825778e-07, "loss": 0.1528, "step": 9720 }, { "epoch": 0.226866019643846, "grad_norm": 2.1319854259490967, "learning_rate": 9.243999875645091e-07, "loss": 0.1475, "step": 9730 }, { "epoch": 0.22709918102066393, "grad_norm": 2.1387648582458496, "learning_rate": 9.243222657464403e-07, "loss": 0.1557, "step": 9740 }, { "epoch": 0.22733234239748185, "grad_norm": 3.778343677520752, "learning_rate": 9.242445439283715e-07, "loss": 0.1604, "step": 9750 }, { "epoch": 0.2275655037742998, "grad_norm": 2.462409734725952, "learning_rate": 9.241668221103028e-07, "loss": 0.1403, "step": 9760 }, { "epoch": 0.2277986651511177, "grad_norm": 2.3799638748168945, "learning_rate": 9.24089100292234e-07, "loss": 0.1443, "step": 9770 }, { "epoch": 0.22803182652793566, 
"grad_norm": 5.1316070556640625, "learning_rate": 9.240113784741652e-07, "loss": 0.1668, "step": 9780 }, { "epoch": 0.22826498790475358, "grad_norm": 3.0819292068481445, "learning_rate": 9.239336566560964e-07, "loss": 0.1492, "step": 9790 }, { "epoch": 0.2284981492815715, "grad_norm": 1.8143244981765747, "learning_rate": 9.238559348380277e-07, "loss": 0.1461, "step": 9800 }, { "epoch": 0.22873131065838945, "grad_norm": 1.320322036743164, "learning_rate": 9.23778213019959e-07, "loss": 0.1413, "step": 9810 }, { "epoch": 0.22896447203520737, "grad_norm": 1.8687424659729004, "learning_rate": 9.237004912018902e-07, "loss": 0.1553, "step": 9820 }, { "epoch": 0.2291976334120253, "grad_norm": 1.9774503707885742, "learning_rate": 9.236227693838214e-07, "loss": 0.1484, "step": 9830 }, { "epoch": 0.22943079478884323, "grad_norm": 3.329111099243164, "learning_rate": 9.235450475657525e-07, "loss": 0.1422, "step": 9840 }, { "epoch": 0.22966395616566115, "grad_norm": 1.415888786315918, "learning_rate": 9.234673257476838e-07, "loss": 0.1606, "step": 9850 }, { "epoch": 0.2298971175424791, "grad_norm": 1.636842966079712, "learning_rate": 9.233896039296151e-07, "loss": 0.1401, "step": 9860 }, { "epoch": 0.23013027891929702, "grad_norm": 2.0595996379852295, "learning_rate": 9.233118821115463e-07, "loss": 0.1523, "step": 9870 }, { "epoch": 0.23036344029611494, "grad_norm": 2.80241060256958, "learning_rate": 9.232341602934776e-07, "loss": 0.1497, "step": 9880 }, { "epoch": 0.2305966016729329, "grad_norm": 1.3516069650650024, "learning_rate": 9.231564384754088e-07, "loss": 0.149, "step": 9890 }, { "epoch": 0.2308297630497508, "grad_norm": 2.09682297706604, "learning_rate": 9.2307871665734e-07, "loss": 0.1567, "step": 9900 }, { "epoch": 0.23106292442656873, "grad_norm": 1.3421334028244019, "learning_rate": 9.230009948392712e-07, "loss": 0.1533, "step": 9910 }, { "epoch": 0.23129608580338668, "grad_norm": 1.2486672401428223, "learning_rate": 9.229232730212024e-07, "loss": 0.1399, "step": 
9920 }, { "epoch": 0.2315292471802046, "grad_norm": 1.5388051271438599, "learning_rate": 9.228455512031337e-07, "loss": 0.148, "step": 9930 }, { "epoch": 0.23176240855702254, "grad_norm": 1.9620686769485474, "learning_rate": 9.227678293850649e-07, "loss": 0.1449, "step": 9940 }, { "epoch": 0.23199556993384046, "grad_norm": 2.702714443206787, "learning_rate": 9.226901075669962e-07, "loss": 0.1428, "step": 9950 }, { "epoch": 0.23222873131065838, "grad_norm": 1.932210922241211, "learning_rate": 9.226123857489274e-07, "loss": 0.1426, "step": 9960 }, { "epoch": 0.23246189268747633, "grad_norm": 5.138180732727051, "learning_rate": 9.225346639308586e-07, "loss": 0.152, "step": 9970 }, { "epoch": 0.23269505406429425, "grad_norm": 1.3879714012145996, "learning_rate": 9.224569421127899e-07, "loss": 0.1437, "step": 9980 }, { "epoch": 0.23292821544111217, "grad_norm": 1.2734689712524414, "learning_rate": 9.22379220294721e-07, "loss": 0.1431, "step": 9990 }, { "epoch": 0.23316137681793012, "grad_norm": 5.012506008148193, "learning_rate": 9.223014984766523e-07, "loss": 0.1638, "step": 10000 }, { "epoch": 0.23316137681793012, "eval_accuracy": 0.9280503279381918, "eval_f1": 0.9481795010256887, "eval_loss": 0.18410851061344147, "eval_runtime": 3975.0075, "eval_samples_per_second": 460.356, "eval_steps_per_second": 57.545, "step": 10000 }, { "epoch": 0.23339453819474804, "grad_norm": 3.052962064743042, "learning_rate": 9.222237766585836e-07, "loss": 0.1536, "step": 10010 }, { "epoch": 0.23362769957156598, "grad_norm": 2.5085856914520264, "learning_rate": 9.221460548405147e-07, "loss": 0.149, "step": 10020 }, { "epoch": 0.2338608609483839, "grad_norm": 1.2989205121994019, "learning_rate": 9.22068333022446e-07, "loss": 0.1435, "step": 10030 }, { "epoch": 0.23409402232520182, "grad_norm": 1.2859785556793213, "learning_rate": 9.219906112043773e-07, "loss": 0.1367, "step": 10040 }, { "epoch": 0.23432718370201977, "grad_norm": 1.4703576564788818, "learning_rate": 9.219128893863085e-07, 
"loss": 0.1526, "step": 10050 }, { "epoch": 0.2345603450788377, "grad_norm": 1.5140269994735718, "learning_rate": 9.218351675682398e-07, "loss": 0.1467, "step": 10060 }, { "epoch": 0.2347935064556556, "grad_norm": 2.4527530670166016, "learning_rate": 9.217574457501709e-07, "loss": 0.1474, "step": 10070 }, { "epoch": 0.23502666783247356, "grad_norm": 3.03503155708313, "learning_rate": 9.216797239321021e-07, "loss": 0.1539, "step": 10080 }, { "epoch": 0.23525982920929148, "grad_norm": 1.7020747661590576, "learning_rate": 9.216020021140333e-07, "loss": 0.1423, "step": 10090 }, { "epoch": 0.23549299058610942, "grad_norm": 1.2115970849990845, "learning_rate": 9.215242802959646e-07, "loss": 0.1388, "step": 10100 }, { "epoch": 0.23572615196292734, "grad_norm": 4.052295684814453, "learning_rate": 9.214543306597028e-07, "loss": 0.1466, "step": 10110 }, { "epoch": 0.23595931333974526, "grad_norm": 3.071192741394043, "learning_rate": 9.213766088416339e-07, "loss": 0.1354, "step": 10120 }, { "epoch": 0.2361924747165632, "grad_norm": 3.70544171333313, "learning_rate": 9.212988870235652e-07, "loss": 0.1462, "step": 10130 }, { "epoch": 0.23642563609338113, "grad_norm": 1.5248229503631592, "learning_rate": 9.212211652054964e-07, "loss": 0.1367, "step": 10140 }, { "epoch": 0.23665879747019905, "grad_norm": 1.3339427709579468, "learning_rate": 9.211434433874277e-07, "loss": 0.148, "step": 10150 }, { "epoch": 0.236891958847017, "grad_norm": 3.2058920860290527, "learning_rate": 9.210657215693589e-07, "loss": 0.1517, "step": 10160 }, { "epoch": 0.23712512022383492, "grad_norm": 1.3347463607788086, "learning_rate": 9.209879997512902e-07, "loss": 0.1477, "step": 10170 }, { "epoch": 0.23735828160065287, "grad_norm": 1.4779430627822876, "learning_rate": 9.209102779332214e-07, "loss": 0.1485, "step": 10180 }, { "epoch": 0.23759144297747078, "grad_norm": 1.5279446840286255, "learning_rate": 9.208325561151526e-07, "loss": 0.1375, "step": 10190 }, { "epoch": 0.2378246043542887, "grad_norm": 
1.8613137006759644, "learning_rate": 9.207548342970838e-07, "loss": 0.1416, "step": 10200 }, { "epoch": 0.23805776573110665, "grad_norm": 1.818942666053772, "learning_rate": 9.20677112479015e-07, "loss": 0.1471, "step": 10210 }, { "epoch": 0.23829092710792457, "grad_norm": 1.9770898818969727, "learning_rate": 9.205993906609463e-07, "loss": 0.1505, "step": 10220 }, { "epoch": 0.2385240884847425, "grad_norm": 1.1609891653060913, "learning_rate": 9.205216688428776e-07, "loss": 0.1316, "step": 10230 }, { "epoch": 0.23875724986156044, "grad_norm": 2.520059823989868, "learning_rate": 9.204439470248087e-07, "loss": 0.1513, "step": 10240 }, { "epoch": 0.23899041123837836, "grad_norm": 1.2523597478866577, "learning_rate": 9.2036622520674e-07, "loss": 0.1397, "step": 10250 }, { "epoch": 0.2392235726151963, "grad_norm": 1.4460135698318481, "learning_rate": 9.202885033886712e-07, "loss": 0.1443, "step": 10260 }, { "epoch": 0.23945673399201423, "grad_norm": 1.6885714530944824, "learning_rate": 9.202107815706025e-07, "loss": 0.1365, "step": 10270 }, { "epoch": 0.23968989536883215, "grad_norm": 2.492640733718872, "learning_rate": 9.201330597525337e-07, "loss": 0.1377, "step": 10280 }, { "epoch": 0.2399230567456501, "grad_norm": 1.4798362255096436, "learning_rate": 9.200553379344649e-07, "loss": 0.1483, "step": 10290 }, { "epoch": 0.240156218122468, "grad_norm": 1.6882628202438354, "learning_rate": 9.199776161163961e-07, "loss": 0.1401, "step": 10300 }, { "epoch": 0.24038937949928593, "grad_norm": 1.7754348516464233, "learning_rate": 9.198998942983273e-07, "loss": 0.1353, "step": 10310 }, { "epoch": 0.24062254087610388, "grad_norm": 3.3792107105255127, "learning_rate": 9.198221724802586e-07, "loss": 0.1332, "step": 10320 }, { "epoch": 0.2408557022529218, "grad_norm": 1.8155198097229004, "learning_rate": 9.197444506621899e-07, "loss": 0.1486, "step": 10330 }, { "epoch": 0.24108886362973975, "grad_norm": 1.6190265417099, "learning_rate": 9.196667288441211e-07, "loss": 0.159, "step": 
10340 }, { "epoch": 0.24132202500655767, "grad_norm": 4.1872878074646, "learning_rate": 9.195890070260524e-07, "loss": 0.146, "step": 10350 }, { "epoch": 0.2415551863833756, "grad_norm": 1.6078665256500244, "learning_rate": 9.195112852079834e-07, "loss": 0.1467, "step": 10360 }, { "epoch": 0.24178834776019353, "grad_norm": 1.9358569383621216, "learning_rate": 9.194335633899147e-07, "loss": 0.137, "step": 10370 }, { "epoch": 0.24202150913701145, "grad_norm": 2.342205762863159, "learning_rate": 9.19355841571846e-07, "loss": 0.1574, "step": 10380 }, { "epoch": 0.24225467051382937, "grad_norm": 1.505733847618103, "learning_rate": 9.192781197537772e-07, "loss": 0.1557, "step": 10390 }, { "epoch": 0.24248783189064732, "grad_norm": 1.4578557014465332, "learning_rate": 9.192003979357085e-07, "loss": 0.1377, "step": 10400 }, { "epoch": 0.24272099326746524, "grad_norm": 1.3189812898635864, "learning_rate": 9.191226761176398e-07, "loss": 0.1416, "step": 10410 }, { "epoch": 0.2429541546442832, "grad_norm": 3.3186209201812744, "learning_rate": 9.190449542995709e-07, "loss": 0.1572, "step": 10420 }, { "epoch": 0.2431873160211011, "grad_norm": 1.8088712692260742, "learning_rate": 9.189672324815022e-07, "loss": 0.1589, "step": 10430 }, { "epoch": 0.24342047739791903, "grad_norm": 1.377885341644287, "learning_rate": 9.188895106634333e-07, "loss": 0.1476, "step": 10440 }, { "epoch": 0.24365363877473697, "grad_norm": 1.1496453285217285, "learning_rate": 9.188117888453646e-07, "loss": 0.1389, "step": 10450 }, { "epoch": 0.2438868001515549, "grad_norm": 1.357235312461853, "learning_rate": 9.187340670272959e-07, "loss": 0.1634, "step": 10460 }, { "epoch": 0.24411996152837281, "grad_norm": 1.487939715385437, "learning_rate": 9.186563452092271e-07, "loss": 0.139, "step": 10470 }, { "epoch": 0.24435312290519076, "grad_norm": 3.1384761333465576, "learning_rate": 9.185786233911584e-07, "loss": 0.1581, "step": 10480 }, { "epoch": 0.24458628428200868, "grad_norm": 1.5850224494934082, 
"learning_rate": 9.185009015730895e-07, "loss": 0.1553, "step": 10490 }, { "epoch": 0.24481944565882663, "grad_norm": 1.3270548582077026, "learning_rate": 9.184231797550208e-07, "loss": 0.1459, "step": 10500 }, { "epoch": 0.24505260703564455, "grad_norm": 2.0087718963623047, "learning_rate": 9.183454579369521e-07, "loss": 0.1564, "step": 10510 }, { "epoch": 0.24528576841246247, "grad_norm": 1.6170734167099, "learning_rate": 9.182677361188832e-07, "loss": 0.1465, "step": 10520 }, { "epoch": 0.24551892978928042, "grad_norm": 4.295595645904541, "learning_rate": 9.181900143008145e-07, "loss": 0.1571, "step": 10530 }, { "epoch": 0.24575209116609834, "grad_norm": 1.3371142148971558, "learning_rate": 9.181122924827457e-07, "loss": 0.1477, "step": 10540 }, { "epoch": 0.24598525254291626, "grad_norm": 2.3615007400512695, "learning_rate": 9.180345706646769e-07, "loss": 0.1486, "step": 10550 }, { "epoch": 0.2462184139197342, "grad_norm": 2.668365478515625, "learning_rate": 9.179568488466082e-07, "loss": 0.1429, "step": 10560 }, { "epoch": 0.24645157529655212, "grad_norm": 1.1011155843734741, "learning_rate": 9.178791270285394e-07, "loss": 0.1289, "step": 10570 }, { "epoch": 0.24668473667337007, "grad_norm": 2.1526551246643066, "learning_rate": 9.178014052104707e-07, "loss": 0.1413, "step": 10580 }, { "epoch": 0.246917898050188, "grad_norm": 5.623586177825928, "learning_rate": 9.177236833924019e-07, "loss": 0.147, "step": 10590 }, { "epoch": 0.2471510594270059, "grad_norm": 2.1426808834075928, "learning_rate": 9.176459615743332e-07, "loss": 0.1388, "step": 10600 }, { "epoch": 0.24738422080382386, "grad_norm": 1.3369297981262207, "learning_rate": 9.175682397562643e-07, "loss": 0.131, "step": 10610 }, { "epoch": 0.24761738218064178, "grad_norm": 1.2720515727996826, "learning_rate": 9.174905179381955e-07, "loss": 0.1383, "step": 10620 }, { "epoch": 0.2478505435574597, "grad_norm": 1.9696390628814697, "learning_rate": 9.174127961201268e-07, "loss": 0.1431, "step": 10630 }, { 
"epoch": 0.24808370493427764, "grad_norm": 1.258918046951294, "learning_rate": 9.17335074302058e-07, "loss": 0.1357, "step": 10640 }, { "epoch": 0.24831686631109556, "grad_norm": 2.421710252761841, "learning_rate": 9.172573524839893e-07, "loss": 0.1282, "step": 10650 }, { "epoch": 0.2485500276879135, "grad_norm": 2.490924119949341, "learning_rate": 9.171796306659206e-07, "loss": 0.1466, "step": 10660 }, { "epoch": 0.24878318906473143, "grad_norm": 1.9591745138168335, "learning_rate": 9.171019088478517e-07, "loss": 0.145, "step": 10670 }, { "epoch": 0.24901635044154935, "grad_norm": 2.2763283252716064, "learning_rate": 9.17024187029783e-07, "loss": 0.1326, "step": 10680 }, { "epoch": 0.2492495118183673, "grad_norm": 2.0074377059936523, "learning_rate": 9.169464652117141e-07, "loss": 0.1449, "step": 10690 }, { "epoch": 0.24948267319518522, "grad_norm": 4.994042873382568, "learning_rate": 9.168687433936454e-07, "loss": 0.1433, "step": 10700 }, { "epoch": 0.24971583457200314, "grad_norm": 1.6962251663208008, "learning_rate": 9.167910215755767e-07, "loss": 0.1627, "step": 10710 }, { "epoch": 0.24994899594882108, "grad_norm": 1.3602683544158936, "learning_rate": 9.167132997575079e-07, "loss": 0.1543, "step": 10720 }, { "epoch": 0.25018215732563903, "grad_norm": 2.7574567794799805, "learning_rate": 9.166355779394391e-07, "loss": 0.1394, "step": 10730 }, { "epoch": 0.2504153187024569, "grad_norm": 1.2039580345153809, "learning_rate": 9.165578561213703e-07, "loss": 0.1481, "step": 10740 }, { "epoch": 0.25064848007927487, "grad_norm": 2.024813652038574, "learning_rate": 9.164801343033016e-07, "loss": 0.1381, "step": 10750 }, { "epoch": 0.2508816414560928, "grad_norm": 2.64245343208313, "learning_rate": 9.164024124852329e-07, "loss": 0.1423, "step": 10760 }, { "epoch": 0.2511148028329107, "grad_norm": 2.3835368156433105, "learning_rate": 9.16324690667164e-07, "loss": 0.1434, "step": 10770 }, { "epoch": 0.25134796420972866, "grad_norm": 4.4351983070373535, "learning_rate": 
9.162469688490953e-07, "loss": 0.1479, "step": 10780 }, { "epoch": 0.2515811255865466, "grad_norm": 2.2731869220733643, "learning_rate": 9.161692470310264e-07, "loss": 0.1489, "step": 10790 }, { "epoch": 0.2518142869633645, "grad_norm": 4.456081390380859, "learning_rate": 9.160915252129577e-07, "loss": 0.1459, "step": 10800 }, { "epoch": 0.25204744834018245, "grad_norm": 1.1467233896255493, "learning_rate": 9.16013803394889e-07, "loss": 0.142, "step": 10810 }, { "epoch": 0.2522806097170004, "grad_norm": 1.3448916673660278, "learning_rate": 9.159360815768202e-07, "loss": 0.1407, "step": 10820 }, { "epoch": 0.2525137710938183, "grad_norm": 1.9277260303497314, "learning_rate": 9.158583597587515e-07, "loss": 0.1293, "step": 10830 }, { "epoch": 0.25274693247063623, "grad_norm": 3.0537796020507812, "learning_rate": 9.157806379406828e-07, "loss": 0.1365, "step": 10840 }, { "epoch": 0.2529800938474542, "grad_norm": 1.3687810897827148, "learning_rate": 9.157029161226138e-07, "loss": 0.1379, "step": 10850 }, { "epoch": 0.2532132552242721, "grad_norm": 1.745794415473938, "learning_rate": 9.156251943045451e-07, "loss": 0.1522, "step": 10860 }, { "epoch": 0.25344641660109, "grad_norm": 1.2845479249954224, "learning_rate": 9.155474724864763e-07, "loss": 0.1442, "step": 10870 }, { "epoch": 0.25367957797790797, "grad_norm": 3.9299545288085938, "learning_rate": 9.154697506684076e-07, "loss": 0.1316, "step": 10880 }, { "epoch": 0.2539127393547259, "grad_norm": 1.4491753578186035, "learning_rate": 9.153920288503389e-07, "loss": 0.1317, "step": 10890 }, { "epoch": 0.2541459007315438, "grad_norm": 1.5986779928207397, "learning_rate": 9.153143070322701e-07, "loss": 0.1243, "step": 10900 }, { "epoch": 0.25437906210836175, "grad_norm": 3.976266860961914, "learning_rate": 9.152365852142014e-07, "loss": 0.1457, "step": 10910 }, { "epoch": 0.2546122234851797, "grad_norm": 1.537340760231018, "learning_rate": 9.151588633961325e-07, "loss": 0.137, "step": 10920 }, { "epoch": 0.2548453848619976, 
"grad_norm": 1.872429370880127, "learning_rate": 9.150811415780637e-07, "loss": 0.1303, "step": 10930 }, { "epoch": 0.25507854623881554, "grad_norm": 1.3102537393569946, "learning_rate": 9.15003419759995e-07, "loss": 0.1394, "step": 10940 }, { "epoch": 0.2553117076156335, "grad_norm": 2.6091508865356445, "learning_rate": 9.149256979419262e-07, "loss": 0.1441, "step": 10950 }, { "epoch": 0.2555448689924514, "grad_norm": 3.1084179878234863, "learning_rate": 9.148479761238575e-07, "loss": 0.1564, "step": 10960 }, { "epoch": 0.2557780303692693, "grad_norm": 1.9197086095809937, "learning_rate": 9.147702543057887e-07, "loss": 0.1403, "step": 10970 }, { "epoch": 0.2560111917460873, "grad_norm": 1.8540964126586914, "learning_rate": 9.146925324877199e-07, "loss": 0.1464, "step": 10980 }, { "epoch": 0.25624435312290517, "grad_norm": 1.7522201538085938, "learning_rate": 9.146148106696512e-07, "loss": 0.1477, "step": 10990 }, { "epoch": 0.2564775144997231, "grad_norm": 2.5851011276245117, "learning_rate": 9.145370888515824e-07, "loss": 0.1411, "step": 11000 }, { "epoch": 0.25671067587654106, "grad_norm": 2.4028353691101074, "learning_rate": 9.144593670335136e-07, "loss": 0.1472, "step": 11010 }, { "epoch": 0.256943837253359, "grad_norm": 2.0681445598602295, "learning_rate": 9.143816452154448e-07, "loss": 0.1423, "step": 11020 }, { "epoch": 0.2571769986301769, "grad_norm": 3.2311580181121826, "learning_rate": 9.143039233973761e-07, "loss": 0.148, "step": 11030 }, { "epoch": 0.25741016000699485, "grad_norm": 1.3336422443389893, "learning_rate": 9.142262015793073e-07, "loss": 0.1379, "step": 11040 }, { "epoch": 0.2576433213838128, "grad_norm": 1.440940260887146, "learning_rate": 9.141484797612385e-07, "loss": 0.1332, "step": 11050 }, { "epoch": 0.2578764827606307, "grad_norm": 3.4755232334136963, "learning_rate": 9.140707579431698e-07, "loss": 0.138, "step": 11060 }, { "epoch": 0.25810964413744863, "grad_norm": 3.1064417362213135, "learning_rate": 9.13993036125101e-07, "loss": 
0.1415, "step": 11070 }, { "epoch": 0.2583428055142666, "grad_norm": 1.6762382984161377, "learning_rate": 9.139153143070323e-07, "loss": 0.1311, "step": 11080 }, { "epoch": 0.2585759668910845, "grad_norm": 1.2336044311523438, "learning_rate": 9.138375924889635e-07, "loss": 0.1429, "step": 11090 }, { "epoch": 0.2588091282679024, "grad_norm": 1.7866157293319702, "learning_rate": 9.137598706708946e-07, "loss": 0.1388, "step": 11100 }, { "epoch": 0.25904228964472037, "grad_norm": 2.3250317573547363, "learning_rate": 9.136821488528259e-07, "loss": 0.1471, "step": 11110 }, { "epoch": 0.25927545102153826, "grad_norm": 1.5241590738296509, "learning_rate": 9.136044270347571e-07, "loss": 0.1324, "step": 11120 }, { "epoch": 0.2595086123983562, "grad_norm": 1.6840764284133911, "learning_rate": 9.135267052166884e-07, "loss": 0.1559, "step": 11130 }, { "epoch": 0.25974177377517416, "grad_norm": 1.4315211772918701, "learning_rate": 9.134489833986197e-07, "loss": 0.135, "step": 11140 }, { "epoch": 0.25997493515199205, "grad_norm": 2.558234453201294, "learning_rate": 9.133712615805509e-07, "loss": 0.1498, "step": 11150 }, { "epoch": 0.26020809652881, "grad_norm": 2.1477818489074707, "learning_rate": 9.132935397624821e-07, "loss": 0.1443, "step": 11160 }, { "epoch": 0.26044125790562794, "grad_norm": 2.241983413696289, "learning_rate": 9.132158179444132e-07, "loss": 0.1514, "step": 11170 }, { "epoch": 0.2606744192824459, "grad_norm": 3.4103989601135254, "learning_rate": 9.131380961263445e-07, "loss": 0.1523, "step": 11180 }, { "epoch": 0.2609075806592638, "grad_norm": 1.4832119941711426, "learning_rate": 9.130603743082758e-07, "loss": 0.1557, "step": 11190 }, { "epoch": 0.26114074203608173, "grad_norm": 1.2336223125457764, "learning_rate": 9.12982652490207e-07, "loss": 0.1424, "step": 11200 }, { "epoch": 0.2613739034128997, "grad_norm": 3.8175976276397705, "learning_rate": 9.129049306721383e-07, "loss": 0.1503, "step": 11210 }, { "epoch": 0.26160706478971757, "grad_norm": 
1.239007830619812, "learning_rate": 9.128272088540694e-07, "loss": 0.1372, "step": 11220 }, { "epoch": 0.2618402261665355, "grad_norm": 2.6076478958129883, "learning_rate": 9.127494870360007e-07, "loss": 0.1416, "step": 11230 }, { "epoch": 0.26207338754335346, "grad_norm": 1.1181957721710205, "learning_rate": 9.12671765217932e-07, "loss": 0.1368, "step": 11240 }, { "epoch": 0.26230654892017136, "grad_norm": 2.19626784324646, "learning_rate": 9.125940433998631e-07, "loss": 0.1569, "step": 11250 }, { "epoch": 0.2625397102969893, "grad_norm": 2.4732816219329834, "learning_rate": 9.125163215817944e-07, "loss": 0.1327, "step": 11260 }, { "epoch": 0.26277287167380725, "grad_norm": 3.976492404937744, "learning_rate": 9.124385997637257e-07, "loss": 0.1401, "step": 11270 }, { "epoch": 0.26300603305062514, "grad_norm": 1.4476948976516724, "learning_rate": 9.123608779456568e-07, "loss": 0.1463, "step": 11280 }, { "epoch": 0.2632391944274431, "grad_norm": 2.4067556858062744, "learning_rate": 9.122831561275881e-07, "loss": 0.1333, "step": 11290 }, { "epoch": 0.26347235580426104, "grad_norm": 4.718403339385986, "learning_rate": 9.122054343095193e-07, "loss": 0.145, "step": 11300 }, { "epoch": 0.26370551718107893, "grad_norm": 1.7448599338531494, "learning_rate": 9.121277124914506e-07, "loss": 0.1482, "step": 11310 }, { "epoch": 0.2639386785578969, "grad_norm": 3.436903238296509, "learning_rate": 9.120499906733819e-07, "loss": 0.1414, "step": 11320 }, { "epoch": 0.2641718399347148, "grad_norm": 3.06777286529541, "learning_rate": 9.11972268855313e-07, "loss": 0.1389, "step": 11330 }, { "epoch": 0.2644050013115327, "grad_norm": 1.4213557243347168, "learning_rate": 9.118945470372442e-07, "loss": 0.1397, "step": 11340 }, { "epoch": 0.26463816268835066, "grad_norm": 1.6798162460327148, "learning_rate": 9.118168252191754e-07, "loss": 0.1532, "step": 11350 }, { "epoch": 0.2648713240651686, "grad_norm": 2.034242868423462, "learning_rate": 9.117391034011067e-07, "loss": 0.1333, "step": 
11360 }, { "epoch": 0.26510448544198656, "grad_norm": 2.2621710300445557, "learning_rate": 9.11661381583038e-07, "loss": 0.1547, "step": 11370 }, { "epoch": 0.26533764681880445, "grad_norm": 1.167153000831604, "learning_rate": 9.115836597649692e-07, "loss": 0.1523, "step": 11380 }, { "epoch": 0.2655708081956224, "grad_norm": 1.7284599542617798, "learning_rate": 9.115059379469005e-07, "loss": 0.1411, "step": 11390 }, { "epoch": 0.26580396957244035, "grad_norm": 1.459493637084961, "learning_rate": 9.114282161288317e-07, "loss": 0.1386, "step": 11400 }, { "epoch": 0.26603713094925824, "grad_norm": 1.5548287630081177, "learning_rate": 9.113504943107628e-07, "loss": 0.145, "step": 11410 }, { "epoch": 0.2662702923260762, "grad_norm": 1.4557907581329346, "learning_rate": 9.112727724926941e-07, "loss": 0.1394, "step": 11420 }, { "epoch": 0.26650345370289413, "grad_norm": 2.4454472064971924, "learning_rate": 9.111950506746253e-07, "loss": 0.1515, "step": 11430 }, { "epoch": 0.266736615079712, "grad_norm": 3.277773141860962, "learning_rate": 9.111173288565566e-07, "loss": 0.1375, "step": 11440 }, { "epoch": 0.26696977645653, "grad_norm": 1.380448341369629, "learning_rate": 9.110396070384878e-07, "loss": 0.1336, "step": 11450 }, { "epoch": 0.2672029378333479, "grad_norm": 1.40792977809906, "learning_rate": 9.109618852204191e-07, "loss": 0.1336, "step": 11460 }, { "epoch": 0.2674360992101658, "grad_norm": 2.0717413425445557, "learning_rate": 9.108841634023503e-07, "loss": 0.1545, "step": 11470 }, { "epoch": 0.26766926058698376, "grad_norm": 1.69373619556427, "learning_rate": 9.108064415842815e-07, "loss": 0.1452, "step": 11480 }, { "epoch": 0.2679024219638017, "grad_norm": 1.2469851970672607, "learning_rate": 9.107287197662127e-07, "loss": 0.1453, "step": 11490 }, { "epoch": 0.2681355833406196, "grad_norm": 1.3049758672714233, "learning_rate": 9.106509979481439e-07, "loss": 0.1356, "step": 11500 }, { "epoch": 0.26836874471743755, "grad_norm": 1.5746216773986816, 
"learning_rate": 9.105732761300752e-07, "loss": 0.137, "step": 11510 }, { "epoch": 0.2686019060942555, "grad_norm": 1.2124404907226562, "learning_rate": 9.104955543120065e-07, "loss": 0.1342, "step": 11520 }, { "epoch": 0.26883506747107344, "grad_norm": 1.5993270874023438, "learning_rate": 9.104178324939376e-07, "loss": 0.1358, "step": 11530 }, { "epoch": 0.26906822884789133, "grad_norm": 1.2447624206542969, "learning_rate": 9.103401106758689e-07, "loss": 0.14, "step": 11540 }, { "epoch": 0.2693013902247093, "grad_norm": 3.20639967918396, "learning_rate": 9.102623888578001e-07, "loss": 0.1413, "step": 11550 }, { "epoch": 0.26953455160152723, "grad_norm": 2.6884636878967285, "learning_rate": 9.101846670397314e-07, "loss": 0.1626, "step": 11560 }, { "epoch": 0.2697677129783451, "grad_norm": 2.161407709121704, "learning_rate": 9.101069452216626e-07, "loss": 0.1517, "step": 11570 }, { "epoch": 0.27000087435516307, "grad_norm": 1.2147630453109741, "learning_rate": 9.100292234035938e-07, "loss": 0.1441, "step": 11580 }, { "epoch": 0.270234035731981, "grad_norm": 1.6884632110595703, "learning_rate": 9.09951501585525e-07, "loss": 0.1422, "step": 11590 }, { "epoch": 0.2704671971087989, "grad_norm": 1.1167924404144287, "learning_rate": 9.098737797674562e-07, "loss": 0.1264, "step": 11600 }, { "epoch": 0.27070035848561685, "grad_norm": 1.9887903928756714, "learning_rate": 9.097960579493875e-07, "loss": 0.1368, "step": 11610 }, { "epoch": 0.2709335198624348, "grad_norm": 1.4867738485336304, "learning_rate": 9.097183361313188e-07, "loss": 0.1512, "step": 11620 }, { "epoch": 0.2711666812392527, "grad_norm": 1.4648957252502441, "learning_rate": 9.0964061431325e-07, "loss": 0.1465, "step": 11630 }, { "epoch": 0.27139984261607064, "grad_norm": 1.3650212287902832, "learning_rate": 9.095628924951813e-07, "loss": 0.1468, "step": 11640 }, { "epoch": 0.2716330039928886, "grad_norm": 2.722277879714966, "learning_rate": 9.094851706771123e-07, "loss": 0.1413, "step": 11650 }, { "epoch": 
0.2718661653697065, "grad_norm": 3.7577669620513916, "learning_rate": 9.094074488590436e-07, "loss": 0.1498, "step": 11660 }, { "epoch": 0.27209932674652443, "grad_norm": 3.0034232139587402, "learning_rate": 9.093297270409749e-07, "loss": 0.15, "step": 11670 }, { "epoch": 0.2723324881233424, "grad_norm": 2.87931752204895, "learning_rate": 9.092520052229061e-07, "loss": 0.146, "step": 11680 }, { "epoch": 0.2725656495001603, "grad_norm": 3.594182014465332, "learning_rate": 9.091742834048374e-07, "loss": 0.1365, "step": 11690 }, { "epoch": 0.2727988108769782, "grad_norm": 1.499436378479004, "learning_rate": 9.090965615867687e-07, "loss": 0.1407, "step": 11700 }, { "epoch": 0.27303197225379616, "grad_norm": 3.9509775638580322, "learning_rate": 9.090188397686999e-07, "loss": 0.1368, "step": 11710 }, { "epoch": 0.2732651336306141, "grad_norm": 1.9844970703125, "learning_rate": 9.089411179506311e-07, "loss": 0.14, "step": 11720 }, { "epoch": 0.273498295007432, "grad_norm": 1.2018269300460815, "learning_rate": 9.088633961325622e-07, "loss": 0.1347, "step": 11730 }, { "epoch": 0.27373145638424995, "grad_norm": 5.2834978103637695, "learning_rate": 9.087856743144935e-07, "loss": 0.1423, "step": 11740 }, { "epoch": 0.2739646177610679, "grad_norm": 3.5468525886535645, "learning_rate": 9.087079524964248e-07, "loss": 0.1383, "step": 11750 }, { "epoch": 0.2741977791378858, "grad_norm": 1.8254536390304565, "learning_rate": 9.08630230678356e-07, "loss": 0.1396, "step": 11760 }, { "epoch": 0.27443094051470374, "grad_norm": 1.1966979503631592, "learning_rate": 9.085525088602873e-07, "loss": 0.1273, "step": 11770 }, { "epoch": 0.2746641018915217, "grad_norm": 1.116134524345398, "learning_rate": 9.084747870422184e-07, "loss": 0.1666, "step": 11780 }, { "epoch": 0.2748972632683396, "grad_norm": 1.3259869813919067, "learning_rate": 9.083970652241497e-07, "loss": 0.144, "step": 11790 }, { "epoch": 0.2751304246451575, "grad_norm": 1.5508968830108643, "learning_rate": 9.08319343406081e-07, 
"loss": 0.151, "step": 11800 }, { "epoch": 0.27536358602197547, "grad_norm": 1.6171120405197144, "learning_rate": 9.082416215880121e-07, "loss": 0.1336, "step": 11810 }, { "epoch": 0.27559674739879336, "grad_norm": 3.938976764678955, "learning_rate": 9.081638997699434e-07, "loss": 0.1379, "step": 11820 }, { "epoch": 0.2758299087756113, "grad_norm": 4.354508876800537, "learning_rate": 9.080861779518746e-07, "loss": 0.1467, "step": 11830 }, { "epoch": 0.27606307015242926, "grad_norm": 1.2604446411132812, "learning_rate": 9.080084561338058e-07, "loss": 0.1347, "step": 11840 }, { "epoch": 0.2762962315292472, "grad_norm": 5.7984700202941895, "learning_rate": 9.079307343157371e-07, "loss": 0.1549, "step": 11850 }, { "epoch": 0.2765293929060651, "grad_norm": 3.4094974994659424, "learning_rate": 9.078530124976683e-07, "loss": 0.1467, "step": 11860 }, { "epoch": 0.27676255428288304, "grad_norm": 2.536184072494507, "learning_rate": 9.077752906795996e-07, "loss": 0.1355, "step": 11870 }, { "epoch": 0.276995715659701, "grad_norm": 2.8100392818450928, "learning_rate": 9.076975688615308e-07, "loss": 0.1509, "step": 11880 }, { "epoch": 0.2772288770365189, "grad_norm": 2.2076799869537354, "learning_rate": 9.07619847043462e-07, "loss": 0.1399, "step": 11890 }, { "epoch": 0.27746203841333683, "grad_norm": 3.2657175064086914, "learning_rate": 9.075421252253932e-07, "loss": 0.1372, "step": 11900 }, { "epoch": 0.2776951997901548, "grad_norm": 2.896230459213257, "learning_rate": 9.074644034073244e-07, "loss": 0.1541, "step": 11910 }, { "epoch": 0.27792836116697267, "grad_norm": 1.5972888469696045, "learning_rate": 9.073866815892557e-07, "loss": 0.1463, "step": 11920 }, { "epoch": 0.2781615225437906, "grad_norm": 1.8940038681030273, "learning_rate": 9.073089597711869e-07, "loss": 0.1424, "step": 11930 }, { "epoch": 0.27839468392060857, "grad_norm": 1.3869976997375488, "learning_rate": 9.072312379531182e-07, "loss": 0.15, "step": 11940 }, { "epoch": 0.27862784529742646, "grad_norm": 
1.6464424133300781, "learning_rate": 9.071535161350495e-07, "loss": 0.1404, "step": 11950 }, { "epoch": 0.2788610066742444, "grad_norm": 1.1936028003692627, "learning_rate": 9.070757943169806e-07, "loss": 0.1448, "step": 11960 }, { "epoch": 0.27909416805106235, "grad_norm": 1.7567750215530396, "learning_rate": 9.069980724989118e-07, "loss": 0.1441, "step": 11970 }, { "epoch": 0.27932732942788024, "grad_norm": 1.3455073833465576, "learning_rate": 9.06920350680843e-07, "loss": 0.1388, "step": 11980 }, { "epoch": 0.2795604908046982, "grad_norm": 2.5730912685394287, "learning_rate": 9.068426288627743e-07, "loss": 0.1468, "step": 11990 }, { "epoch": 0.27979365218151614, "grad_norm": 1.5729963779449463, "learning_rate": 9.067649070447056e-07, "loss": 0.1242, "step": 12000 }, { "epoch": 0.2800268135583341, "grad_norm": 1.6741278171539307, "learning_rate": 9.066871852266368e-07, "loss": 0.1305, "step": 12010 }, { "epoch": 0.280259974935152, "grad_norm": 2.21150803565979, "learning_rate": 9.06609463408568e-07, "loss": 0.1351, "step": 12020 }, { "epoch": 0.2804931363119699, "grad_norm": 1.8275911808013916, "learning_rate": 9.065317415904992e-07, "loss": 0.151, "step": 12030 }, { "epoch": 0.2807262976887879, "grad_norm": 1.3990172147750854, "learning_rate": 9.064540197724305e-07, "loss": 0.1398, "step": 12040 }, { "epoch": 0.28095945906560577, "grad_norm": 3.2348310947418213, "learning_rate": 9.063762979543617e-07, "loss": 0.1331, "step": 12050 }, { "epoch": 0.2811926204424237, "grad_norm": 1.3689723014831543, "learning_rate": 9.062985761362929e-07, "loss": 0.135, "step": 12060 }, { "epoch": 0.28142578181924166, "grad_norm": 3.24348783493042, "learning_rate": 9.062208543182242e-07, "loss": 0.1385, "step": 12070 }, { "epoch": 0.28165894319605955, "grad_norm": 1.3253954648971558, "learning_rate": 9.061431325001553e-07, "loss": 0.1445, "step": 12080 }, { "epoch": 0.2818921045728775, "grad_norm": 1.9470670223236084, "learning_rate": 9.060654106820866e-07, "loss": 0.1348, "step": 
12090 }, { "epoch": 0.28212526594969545, "grad_norm": 2.015430212020874, "learning_rate": 9.059876888640179e-07, "loss": 0.131, "step": 12100 }, { "epoch": 0.28235842732651334, "grad_norm": 1.6833577156066895, "learning_rate": 9.059099670459491e-07, "loss": 0.1494, "step": 12110 }, { "epoch": 0.2825915887033313, "grad_norm": 1.6429252624511719, "learning_rate": 9.058400174096872e-07, "loss": 0.1439, "step": 12120 }, { "epoch": 0.28282475008014923, "grad_norm": 1.0694143772125244, "learning_rate": 9.057622955916184e-07, "loss": 0.1348, "step": 12130 }, { "epoch": 0.2830579114569671, "grad_norm": 2.907345771789551, "learning_rate": 9.056845737735497e-07, "loss": 0.1433, "step": 12140 }, { "epoch": 0.2832910728337851, "grad_norm": 2.0747573375701904, "learning_rate": 9.056068519554809e-07, "loss": 0.1347, "step": 12150 }, { "epoch": 0.283524234210603, "grad_norm": 5.031894207000732, "learning_rate": 9.055291301374122e-07, "loss": 0.144, "step": 12160 }, { "epoch": 0.28375739558742097, "grad_norm": 2.591632127761841, "learning_rate": 9.054514083193435e-07, "loss": 0.1476, "step": 12170 }, { "epoch": 0.28399055696423886, "grad_norm": 1.3321609497070312, "learning_rate": 9.053736865012745e-07, "loss": 0.1554, "step": 12180 }, { "epoch": 0.2842237183410568, "grad_norm": 2.185532808303833, "learning_rate": 9.052959646832058e-07, "loss": 0.1552, "step": 12190 }, { "epoch": 0.28445687971787476, "grad_norm": 2.189018487930298, "learning_rate": 9.05218242865137e-07, "loss": 0.1331, "step": 12200 }, { "epoch": 0.28469004109469265, "grad_norm": 1.469828486442566, "learning_rate": 9.051405210470683e-07, "loss": 0.146, "step": 12210 }, { "epoch": 0.2849232024715106, "grad_norm": 4.745151042938232, "learning_rate": 9.050627992289996e-07, "loss": 0.127, "step": 12220 }, { "epoch": 0.28515636384832854, "grad_norm": 3.133021831512451, "learning_rate": 9.049850774109308e-07, "loss": 0.1389, "step": 12230 }, { "epoch": 0.28538952522514643, "grad_norm": 1.5790678262710571, 
"learning_rate": 9.04907355592862e-07, "loss": 0.1503, "step": 12240 }, { "epoch": 0.2856226866019644, "grad_norm": 1.4832017421722412, "learning_rate": 9.048296337747932e-07, "loss": 0.1371, "step": 12250 }, { "epoch": 0.28585584797878233, "grad_norm": 1.3236784934997559, "learning_rate": 9.047519119567244e-07, "loss": 0.1456, "step": 12260 }, { "epoch": 0.2860890093556002, "grad_norm": 2.386317729949951, "learning_rate": 9.046741901386557e-07, "loss": 0.1241, "step": 12270 }, { "epoch": 0.28632217073241817, "grad_norm": 3.087676763534546, "learning_rate": 9.045964683205869e-07, "loss": 0.1248, "step": 12280 }, { "epoch": 0.2865553321092361, "grad_norm": 5.14018440246582, "learning_rate": 9.045187465025182e-07, "loss": 0.1414, "step": 12290 }, { "epoch": 0.286788493486054, "grad_norm": 3.682955741882324, "learning_rate": 9.044410246844493e-07, "loss": 0.1478, "step": 12300 }, { "epoch": 0.28702165486287196, "grad_norm": 1.8950999975204468, "learning_rate": 9.043633028663806e-07, "loss": 0.1382, "step": 12310 }, { "epoch": 0.2872548162396899, "grad_norm": 1.2208623886108398, "learning_rate": 9.042855810483119e-07, "loss": 0.1444, "step": 12320 }, { "epoch": 0.28748797761650785, "grad_norm": 2.019446611404419, "learning_rate": 9.042078592302431e-07, "loss": 0.1469, "step": 12330 }, { "epoch": 0.28772113899332574, "grad_norm": 1.515626072883606, "learning_rate": 9.041301374121743e-07, "loss": 0.1407, "step": 12340 }, { "epoch": 0.2879543003701437, "grad_norm": 1.3947932720184326, "learning_rate": 9.040524155941055e-07, "loss": 0.133, "step": 12350 }, { "epoch": 0.28818746174696164, "grad_norm": 3.7797398567199707, "learning_rate": 9.039746937760367e-07, "loss": 0.1349, "step": 12360 }, { "epoch": 0.28842062312377953, "grad_norm": 1.3020930290222168, "learning_rate": 9.03896971957968e-07, "loss": 0.1177, "step": 12370 }, { "epoch": 0.2886537845005975, "grad_norm": 1.8307514190673828, "learning_rate": 9.038192501398992e-07, "loss": 0.137, "step": 12380 }, { "epoch": 
0.2888869458774154, "grad_norm": 1.5299723148345947, "learning_rate": 9.037415283218305e-07, "loss": 0.1381, "step": 12390 }, { "epoch": 0.2891201072542333, "grad_norm": 1.9986660480499268, "learning_rate": 9.036638065037618e-07, "loss": 0.1393, "step": 12400 }, { "epoch": 0.28935326863105126, "grad_norm": 1.8711748123168945, "learning_rate": 9.03586084685693e-07, "loss": 0.1396, "step": 12410 }, { "epoch": 0.2895864300078692, "grad_norm": 2.8034911155700684, "learning_rate": 9.035083628676241e-07, "loss": 0.1396, "step": 12420 }, { "epoch": 0.2898195913846871, "grad_norm": 1.1173442602157593, "learning_rate": 9.034306410495553e-07, "loss": 0.1396, "step": 12430 }, { "epoch": 0.29005275276150505, "grad_norm": 1.688125729560852, "learning_rate": 9.033529192314866e-07, "loss": 0.1387, "step": 12440 }, { "epoch": 0.290285914138323, "grad_norm": 2.0440003871917725, "learning_rate": 9.032751974134178e-07, "loss": 0.1347, "step": 12450 }, { "epoch": 0.2905190755151409, "grad_norm": 2.2866837978363037, "learning_rate": 9.031974755953491e-07, "loss": 0.1568, "step": 12460 }, { "epoch": 0.29075223689195884, "grad_norm": 1.7586098909378052, "learning_rate": 9.031197537772804e-07, "loss": 0.1461, "step": 12470 }, { "epoch": 0.2909853982687768, "grad_norm": 1.5123730897903442, "learning_rate": 9.030420319592115e-07, "loss": 0.1315, "step": 12480 }, { "epoch": 0.29121855964559473, "grad_norm": 2.647444725036621, "learning_rate": 9.029643101411428e-07, "loss": 0.1493, "step": 12490 }, { "epoch": 0.2914517210224126, "grad_norm": 1.3470712900161743, "learning_rate": 9.02886588323074e-07, "loss": 0.1356, "step": 12500 }, { "epoch": 0.29168488239923057, "grad_norm": 1.200919508934021, "learning_rate": 9.028088665050052e-07, "loss": 0.1329, "step": 12510 }, { "epoch": 0.2919180437760485, "grad_norm": 3.021392583847046, "learning_rate": 9.027311446869365e-07, "loss": 0.1517, "step": 12520 }, { "epoch": 0.2921512051528664, "grad_norm": 2.0969371795654297, "learning_rate": 
9.026534228688677e-07, "loss": 0.1389, "step": 12530 }, { "epoch": 0.29238436652968436, "grad_norm": 2.71535062789917, "learning_rate": 9.025757010507989e-07, "loss": 0.1505, "step": 12540 }, { "epoch": 0.2926175279065023, "grad_norm": 2.9528324604034424, "learning_rate": 9.024979792327302e-07, "loss": 0.1463, "step": 12550 }, { "epoch": 0.2928506892833202, "grad_norm": 1.832571029663086, "learning_rate": 9.024202574146614e-07, "loss": 0.1325, "step": 12560 }, { "epoch": 0.29308385066013815, "grad_norm": 1.1134779453277588, "learning_rate": 9.023425355965927e-07, "loss": 0.1256, "step": 12570 }, { "epoch": 0.2933170120369561, "grad_norm": 1.684539556503296, "learning_rate": 9.022648137785238e-07, "loss": 0.15, "step": 12580 }, { "epoch": 0.293550173413774, "grad_norm": 2.5009686946868896, "learning_rate": 9.021870919604551e-07, "loss": 0.1509, "step": 12590 }, { "epoch": 0.29378333479059193, "grad_norm": 2.447676658630371, "learning_rate": 9.021093701423864e-07, "loss": 0.1412, "step": 12600 }, { "epoch": 0.2940164961674099, "grad_norm": 1.8720282316207886, "learning_rate": 9.020316483243175e-07, "loss": 0.1399, "step": 12610 }, { "epoch": 0.29424965754422777, "grad_norm": 3.154740571975708, "learning_rate": 9.019539265062488e-07, "loss": 0.1309, "step": 12620 }, { "epoch": 0.2944828189210457, "grad_norm": 2.8166613578796387, "learning_rate": 9.0187620468818e-07, "loss": 0.1336, "step": 12630 }, { "epoch": 0.29471598029786367, "grad_norm": 2.710157632827759, "learning_rate": 9.017984828701113e-07, "loss": 0.148, "step": 12640 }, { "epoch": 0.2949491416746816, "grad_norm": 1.4036943912506104, "learning_rate": 9.017207610520426e-07, "loss": 0.1349, "step": 12650 }, { "epoch": 0.2951823030514995, "grad_norm": 2.703144073486328, "learning_rate": 9.016430392339737e-07, "loss": 0.1393, "step": 12660 }, { "epoch": 0.29541546442831745, "grad_norm": 1.7768923044204712, "learning_rate": 9.015653174159049e-07, "loss": 0.1408, "step": 12670 }, { "epoch": 0.2956486258051354, 
"grad_norm": 1.3963549137115479, "learning_rate": 9.014875955978361e-07, "loss": 0.1399, "step": 12680 }, { "epoch": 0.2958817871819533, "grad_norm": 3.0632364749908447, "learning_rate": 9.014098737797674e-07, "loss": 0.149, "step": 12690 }, { "epoch": 0.29611494855877124, "grad_norm": 1.7273012399673462, "learning_rate": 9.013321519616987e-07, "loss": 0.1358, "step": 12700 }, { "epoch": 0.2963481099355892, "grad_norm": 1.9226545095443726, "learning_rate": 9.012544301436299e-07, "loss": 0.1397, "step": 12710 }, { "epoch": 0.2965812713124071, "grad_norm": 1.483575463294983, "learning_rate": 9.011767083255612e-07, "loss": 0.1391, "step": 12720 }, { "epoch": 0.296814432689225, "grad_norm": 2.4057090282440186, "learning_rate": 9.010989865074923e-07, "loss": 0.1265, "step": 12730 }, { "epoch": 0.297047594066043, "grad_norm": 1.0725581645965576, "learning_rate": 9.010212646894235e-07, "loss": 0.1399, "step": 12740 }, { "epoch": 0.29728075544286087, "grad_norm": 1.1692622900009155, "learning_rate": 9.009435428713548e-07, "loss": 0.1402, "step": 12750 }, { "epoch": 0.2975139168196788, "grad_norm": 1.4178802967071533, "learning_rate": 9.00865821053286e-07, "loss": 0.1165, "step": 12760 }, { "epoch": 0.29774707819649676, "grad_norm": 1.3124949932098389, "learning_rate": 9.007880992352173e-07, "loss": 0.1427, "step": 12770 }, { "epoch": 0.29798023957331465, "grad_norm": 3.6629726886749268, "learning_rate": 9.007103774171485e-07, "loss": 0.1414, "step": 12780 }, { "epoch": 0.2982134009501326, "grad_norm": 3.000933885574341, "learning_rate": 9.006326555990797e-07, "loss": 0.1382, "step": 12790 }, { "epoch": 0.29844656232695055, "grad_norm": 1.294832468032837, "learning_rate": 9.00554933781011e-07, "loss": 0.1334, "step": 12800 }, { "epoch": 0.2986797237037685, "grad_norm": 1.7500734329223633, "learning_rate": 9.004772119629422e-07, "loss": 0.1401, "step": 12810 }, { "epoch": 0.2989128850805864, "grad_norm": 1.5178587436676025, "learning_rate": 9.003994901448734e-07, "loss": 
0.137, "step": 12820 }, { "epoch": 0.29914604645740434, "grad_norm": 5.748288631439209, "learning_rate": 9.003217683268046e-07, "loss": 0.1443, "step": 12830 }, { "epoch": 0.2993792078342223, "grad_norm": 1.3450446128845215, "learning_rate": 9.002440465087359e-07, "loss": 0.145, "step": 12840 }, { "epoch": 0.2996123692110402, "grad_norm": 2.2153825759887695, "learning_rate": 9.001663246906671e-07, "loss": 0.1325, "step": 12850 }, { "epoch": 0.2998455305878581, "grad_norm": 1.4884543418884277, "learning_rate": 9.000886028725983e-07, "loss": 0.1431, "step": 12860 }, { "epoch": 0.30007869196467607, "grad_norm": 1.4877405166625977, "learning_rate": 9.000108810545296e-07, "loss": 0.1413, "step": 12870 }, { "epoch": 0.30031185334149396, "grad_norm": 1.3891538381576538, "learning_rate": 8.999331592364609e-07, "loss": 0.1414, "step": 12880 }, { "epoch": 0.3005450147183119, "grad_norm": 1.8877928256988525, "learning_rate": 8.998554374183921e-07, "loss": 0.1602, "step": 12890 }, { "epoch": 0.30077817609512986, "grad_norm": 2.2260537147521973, "learning_rate": 8.997777156003233e-07, "loss": 0.1351, "step": 12900 }, { "epoch": 0.30101133747194775, "grad_norm": 5.591747760772705, "learning_rate": 8.996999937822544e-07, "loss": 0.1398, "step": 12910 }, { "epoch": 0.3012444988487657, "grad_norm": 1.3235862255096436, "learning_rate": 8.996222719641857e-07, "loss": 0.1324, "step": 12920 }, { "epoch": 0.30147766022558364, "grad_norm": 1.1278547048568726, "learning_rate": 8.99544550146117e-07, "loss": 0.129, "step": 12930 }, { "epoch": 0.30171082160240154, "grad_norm": 1.0532299280166626, "learning_rate": 8.994668283280482e-07, "loss": 0.1381, "step": 12940 }, { "epoch": 0.3019439829792195, "grad_norm": 4.698642253875732, "learning_rate": 8.993891065099795e-07, "loss": 0.1377, "step": 12950 }, { "epoch": 0.30217714435603743, "grad_norm": 1.6263331174850464, "learning_rate": 8.993113846919107e-07, "loss": 0.1488, "step": 12960 }, { "epoch": 0.3024103057328554, "grad_norm": 
2.0519449710845947, "learning_rate": 8.99233662873842e-07, "loss": 0.1349, "step": 12970 }, { "epoch": 0.30264346710967327, "grad_norm": 1.2178645133972168, "learning_rate": 8.99155941055773e-07, "loss": 0.1424, "step": 12980 }, { "epoch": 0.3028766284864912, "grad_norm": 2.3055319786071777, "learning_rate": 8.990782192377043e-07, "loss": 0.1459, "step": 12990 }, { "epoch": 0.30310978986330916, "grad_norm": 1.2081538438796997, "learning_rate": 8.990004974196356e-07, "loss": 0.1408, "step": 13000 }, { "epoch": 0.30334295124012706, "grad_norm": 1.4058305025100708, "learning_rate": 8.989227756015668e-07, "loss": 0.1337, "step": 13010 }, { "epoch": 0.303576112616945, "grad_norm": 1.7509889602661133, "learning_rate": 8.988450537834981e-07, "loss": 0.1472, "step": 13020 }, { "epoch": 0.30380927399376295, "grad_norm": 1.5076762437820435, "learning_rate": 8.987673319654294e-07, "loss": 0.1407, "step": 13030 }, { "epoch": 0.30404243537058084, "grad_norm": 1.3990048170089722, "learning_rate": 8.986896101473605e-07, "loss": 0.1395, "step": 13040 }, { "epoch": 0.3042755967473988, "grad_norm": 1.7950727939605713, "learning_rate": 8.986118883292918e-07, "loss": 0.1528, "step": 13050 }, { "epoch": 0.30450875812421674, "grad_norm": 2.429320812225342, "learning_rate": 8.985341665112229e-07, "loss": 0.1376, "step": 13060 }, { "epoch": 0.30474191950103463, "grad_norm": 3.7119297981262207, "learning_rate": 8.984564446931542e-07, "loss": 0.1297, "step": 13070 }, { "epoch": 0.3049750808778526, "grad_norm": 2.27311372756958, "learning_rate": 8.983787228750855e-07, "loss": 0.1289, "step": 13080 }, { "epoch": 0.3052082422546705, "grad_norm": 2.0988306999206543, "learning_rate": 8.983010010570167e-07, "loss": 0.1339, "step": 13090 }, { "epoch": 0.3054414036314884, "grad_norm": 1.1822923421859741, "learning_rate": 8.982232792389479e-07, "loss": 0.1376, "step": 13100 }, { "epoch": 0.30567456500830636, "grad_norm": 1.8677029609680176, "learning_rate": 8.981455574208791e-07, "loss": 0.1446, 
"step": 13110 }, { "epoch": 0.3059077263851243, "grad_norm": 2.005387306213379, "learning_rate": 8.980678356028104e-07, "loss": 0.1352, "step": 13120 }, { "epoch": 0.30614088776194226, "grad_norm": 1.773943543434143, "learning_rate": 8.979901137847417e-07, "loss": 0.1388, "step": 13130 }, { "epoch": 0.30637404913876015, "grad_norm": 1.4666805267333984, "learning_rate": 8.979123919666728e-07, "loss": 0.1406, "step": 13140 }, { "epoch": 0.3066072105155781, "grad_norm": 1.930408239364624, "learning_rate": 8.978346701486041e-07, "loss": 0.1435, "step": 13150 }, { "epoch": 0.30684037189239605, "grad_norm": 4.709733009338379, "learning_rate": 8.977569483305352e-07, "loss": 0.1538, "step": 13160 }, { "epoch": 0.30707353326921394, "grad_norm": 2.814955472946167, "learning_rate": 8.976792265124665e-07, "loss": 0.141, "step": 13170 }, { "epoch": 0.3073066946460319, "grad_norm": 1.7163292169570923, "learning_rate": 8.976015046943978e-07, "loss": 0.151, "step": 13180 }, { "epoch": 0.30753985602284983, "grad_norm": 1.8125691413879395, "learning_rate": 8.97523782876329e-07, "loss": 0.1382, "step": 13190 }, { "epoch": 0.3077730173996677, "grad_norm": 2.2464990615844727, "learning_rate": 8.974460610582603e-07, "loss": 0.1432, "step": 13200 }, { "epoch": 0.3080061787764857, "grad_norm": 1.2426531314849854, "learning_rate": 8.973683392401915e-07, "loss": 0.1489, "step": 13210 }, { "epoch": 0.3082393401533036, "grad_norm": 1.7868356704711914, "learning_rate": 8.972906174221227e-07, "loss": 0.13, "step": 13220 }, { "epoch": 0.3084725015301215, "grad_norm": 3.7698636054992676, "learning_rate": 8.972128956040539e-07, "loss": 0.1282, "step": 13230 }, { "epoch": 0.30870566290693946, "grad_norm": 1.9205694198608398, "learning_rate": 8.971351737859851e-07, "loss": 0.1418, "step": 13240 }, { "epoch": 0.3089388242837574, "grad_norm": 2.6270315647125244, "learning_rate": 8.970574519679164e-07, "loss": 0.1503, "step": 13250 }, { "epoch": 0.3091719856605753, "grad_norm": 1.6375296115875244, 
"learning_rate": 8.969797301498476e-07, "loss": 0.137, "step": 13260 }, { "epoch": 0.30940514703739325, "grad_norm": 1.696530818939209, "learning_rate": 8.969020083317789e-07, "loss": 0.1386, "step": 13270 }, { "epoch": 0.3096383084142112, "grad_norm": 1.710629940032959, "learning_rate": 8.968242865137101e-07, "loss": 0.1333, "step": 13280 }, { "epoch": 0.30987146979102914, "grad_norm": 4.274246692657471, "learning_rate": 8.967465646956413e-07, "loss": 0.1362, "step": 13290 }, { "epoch": 0.31010463116784703, "grad_norm": 1.1897577047348022, "learning_rate": 8.966688428775726e-07, "loss": 0.1467, "step": 13300 }, { "epoch": 0.310337792544665, "grad_norm": 2.0683181285858154, "learning_rate": 8.965911210595037e-07, "loss": 0.1463, "step": 13310 }, { "epoch": 0.31057095392148293, "grad_norm": 2.2112064361572266, "learning_rate": 8.96513399241435e-07, "loss": 0.1386, "step": 13320 }, { "epoch": 0.3108041152983008, "grad_norm": 3.9221763610839844, "learning_rate": 8.964356774233663e-07, "loss": 0.1375, "step": 13330 }, { "epoch": 0.31103727667511877, "grad_norm": 2.02091908454895, "learning_rate": 8.963579556052974e-07, "loss": 0.1394, "step": 13340 }, { "epoch": 0.3112704380519367, "grad_norm": 2.077855348587036, "learning_rate": 8.962802337872287e-07, "loss": 0.1555, "step": 13350 }, { "epoch": 0.3115035994287546, "grad_norm": 1.4958140850067139, "learning_rate": 8.9620251196916e-07, "loss": 0.1424, "step": 13360 }, { "epoch": 0.31173676080557255, "grad_norm": 2.9663407802581787, "learning_rate": 8.961247901510912e-07, "loss": 0.1404, "step": 13370 }, { "epoch": 0.3119699221823905, "grad_norm": 1.494958519935608, "learning_rate": 8.960470683330225e-07, "loss": 0.1474, "step": 13380 }, { "epoch": 0.3122030835592084, "grad_norm": 2.312697172164917, "learning_rate": 8.959693465149536e-07, "loss": 0.1336, "step": 13390 }, { "epoch": 0.31243624493602634, "grad_norm": 2.712702989578247, "learning_rate": 8.958916246968848e-07, "loss": 0.1366, "step": 13400 }, { "epoch": 
0.3126694063128443, "grad_norm": 3.267896890640259, "learning_rate": 8.95813902878816e-07, "loss": 0.1407, "step": 13410 }, { "epoch": 0.3129025676896622, "grad_norm": 3.8953616619110107, "learning_rate": 8.957361810607473e-07, "loss": 0.1434, "step": 13420 }, { "epoch": 0.31313572906648013, "grad_norm": 2.17686128616333, "learning_rate": 8.956584592426786e-07, "loss": 0.1363, "step": 13430 }, { "epoch": 0.3133688904432981, "grad_norm": 4.053796291351318, "learning_rate": 8.955807374246098e-07, "loss": 0.1434, "step": 13440 }, { "epoch": 0.313602051820116, "grad_norm": 1.0221121311187744, "learning_rate": 8.955030156065411e-07, "loss": 0.1234, "step": 13450 }, { "epoch": 0.3138352131969339, "grad_norm": 1.9651106595993042, "learning_rate": 8.954252937884724e-07, "loss": 0.1511, "step": 13460 }, { "epoch": 0.31406837457375186, "grad_norm": 1.9881354570388794, "learning_rate": 8.953475719704034e-07, "loss": 0.1287, "step": 13470 }, { "epoch": 0.3143015359505698, "grad_norm": 1.6858934164047241, "learning_rate": 8.952698501523347e-07, "loss": 0.1389, "step": 13480 }, { "epoch": 0.3145346973273877, "grad_norm": 2.6078743934631348, "learning_rate": 8.951921283342659e-07, "loss": 0.1246, "step": 13490 }, { "epoch": 0.31476785870420565, "grad_norm": 1.7657063007354736, "learning_rate": 8.951144065161972e-07, "loss": 0.1458, "step": 13500 }, { "epoch": 0.3150010200810236, "grad_norm": 1.1663442850112915, "learning_rate": 8.950366846981285e-07, "loss": 0.1351, "step": 13510 }, { "epoch": 0.3152341814578415, "grad_norm": 3.7222812175750732, "learning_rate": 8.949589628800597e-07, "loss": 0.1399, "step": 13520 }, { "epoch": 0.31546734283465944, "grad_norm": 3.166208267211914, "learning_rate": 8.948812410619909e-07, "loss": 0.1345, "step": 13530 }, { "epoch": 0.3157005042114774, "grad_norm": 1.2077481746673584, "learning_rate": 8.948035192439221e-07, "loss": 0.1379, "step": 13540 }, { "epoch": 0.3159336655882953, "grad_norm": 1.618333339691162, "learning_rate": 
8.947257974258533e-07, "loss": 0.1346, "step": 13550 }, { "epoch": 0.3161668269651132, "grad_norm": 3.5193700790405273, "learning_rate": 8.946480756077846e-07, "loss": 0.1324, "step": 13560 }, { "epoch": 0.31639998834193117, "grad_norm": 1.4687329530715942, "learning_rate": 8.945703537897158e-07, "loss": 0.1372, "step": 13570 }, { "epoch": 0.31663314971874906, "grad_norm": 1.630411982536316, "learning_rate": 8.944926319716471e-07, "loss": 0.1469, "step": 13580 }, { "epoch": 0.316866311095567, "grad_norm": 1.715895175933838, "learning_rate": 8.944149101535782e-07, "loss": 0.1359, "step": 13590 }, { "epoch": 0.31709947247238496, "grad_norm": 1.2755076885223389, "learning_rate": 8.943371883355095e-07, "loss": 0.1358, "step": 13600 }, { "epoch": 0.3173326338492029, "grad_norm": 1.7270058393478394, "learning_rate": 8.942594665174408e-07, "loss": 0.1401, "step": 13610 }, { "epoch": 0.3175657952260208, "grad_norm": 2.5899009704589844, "learning_rate": 8.94181744699372e-07, "loss": 0.1254, "step": 13620 }, { "epoch": 0.31779895660283874, "grad_norm": 1.334354043006897, "learning_rate": 8.941040228813032e-07, "loss": 0.1348, "step": 13630 }, { "epoch": 0.3180321179796567, "grad_norm": 2.740208864212036, "learning_rate": 8.940263010632344e-07, "loss": 0.1364, "step": 13640 }, { "epoch": 0.3182652793564746, "grad_norm": 1.3240913152694702, "learning_rate": 8.939485792451656e-07, "loss": 0.1447, "step": 13650 }, { "epoch": 0.31849844073329253, "grad_norm": 1.4488554000854492, "learning_rate": 8.938708574270969e-07, "loss": 0.1375, "step": 13660 }, { "epoch": 0.3187316021101105, "grad_norm": 1.3329739570617676, "learning_rate": 8.937931356090281e-07, "loss": 0.1423, "step": 13670 }, { "epoch": 0.31896476348692837, "grad_norm": 1.833931565284729, "learning_rate": 8.937154137909594e-07, "loss": 0.1487, "step": 13680 }, { "epoch": 0.3191979248637463, "grad_norm": 1.403592824935913, "learning_rate": 8.936376919728907e-07, "loss": 0.1343, "step": 13690 }, { "epoch": 
0.31943108624056427, "grad_norm": 2.0558178424835205, "learning_rate": 8.935599701548219e-07, "loss": 0.1314, "step": 13700 }, { "epoch": 0.31966424761738216, "grad_norm": 2.4965548515319824, "learning_rate": 8.93482248336753e-07, "loss": 0.137, "step": 13710 }, { "epoch": 0.3198974089942001, "grad_norm": 1.436169147491455, "learning_rate": 8.934045265186842e-07, "loss": 0.1319, "step": 13720 }, { "epoch": 0.32013057037101805, "grad_norm": 1.6671514511108398, "learning_rate": 8.933268047006155e-07, "loss": 0.14, "step": 13730 }, { "epoch": 0.32036373174783594, "grad_norm": 1.443210482597351, "learning_rate": 8.932490828825467e-07, "loss": 0.133, "step": 13740 }, { "epoch": 0.3205968931246539, "grad_norm": 3.8140079975128174, "learning_rate": 8.93171361064478e-07, "loss": 0.1387, "step": 13750 }, { "epoch": 0.32083005450147184, "grad_norm": 1.5530126094818115, "learning_rate": 8.930936392464093e-07, "loss": 0.1475, "step": 13760 }, { "epoch": 0.3210632158782898, "grad_norm": 1.543496012687683, "learning_rate": 8.930159174283404e-07, "loss": 0.1323, "step": 13770 }, { "epoch": 0.3212963772551077, "grad_norm": 1.50673246383667, "learning_rate": 8.929381956102717e-07, "loss": 0.1397, "step": 13780 }, { "epoch": 0.3215295386319256, "grad_norm": 1.652678370475769, "learning_rate": 8.928604737922028e-07, "loss": 0.142, "step": 13790 }, { "epoch": 0.3217627000087436, "grad_norm": 1.1997991800308228, "learning_rate": 8.927827519741341e-07, "loss": 0.1417, "step": 13800 }, { "epoch": 0.32199586138556147, "grad_norm": 1.8417531251907349, "learning_rate": 8.927050301560654e-07, "loss": 0.1484, "step": 13810 }, { "epoch": 0.3222290227623794, "grad_norm": 3.4413633346557617, "learning_rate": 8.926273083379966e-07, "loss": 0.1343, "step": 13820 }, { "epoch": 0.32246218413919736, "grad_norm": 1.470099925994873, "learning_rate": 8.925495865199279e-07, "loss": 0.1365, "step": 13830 }, { "epoch": 0.32269534551601525, "grad_norm": 1.9527400732040405, "learning_rate": 
8.92471864701859e-07, "loss": 0.1384, "step": 13840 }, { "epoch": 0.3229285068928332, "grad_norm": 1.373500108718872, "learning_rate": 8.923941428837903e-07, "loss": 0.1477, "step": 13850 }, { "epoch": 0.32316166826965115, "grad_norm": 1.7668076753616333, "learning_rate": 8.923164210657216e-07, "loss": 0.1516, "step": 13860 }, { "epoch": 0.32339482964646904, "grad_norm": 1.3700449466705322, "learning_rate": 8.922386992476527e-07, "loss": 0.1324, "step": 13870 }, { "epoch": 0.323627991023287, "grad_norm": 1.1516938209533691, "learning_rate": 8.92160977429584e-07, "loss": 0.1233, "step": 13880 }, { "epoch": 0.32386115240010493, "grad_norm": 1.6326502561569214, "learning_rate": 8.920832556115153e-07, "loss": 0.1414, "step": 13890 }, { "epoch": 0.3240943137769228, "grad_norm": 1.428101658821106, "learning_rate": 8.920055337934464e-07, "loss": 0.1474, "step": 13900 }, { "epoch": 0.3243274751537408, "grad_norm": 3.2120730876922607, "learning_rate": 8.919278119753777e-07, "loss": 0.1294, "step": 13910 }, { "epoch": 0.3245606365305587, "grad_norm": 1.4468212127685547, "learning_rate": 8.918500901573089e-07, "loss": 0.1285, "step": 13920 }, { "epoch": 0.32479379790737667, "grad_norm": 1.8230328559875488, "learning_rate": 8.917723683392402e-07, "loss": 0.1373, "step": 13930 }, { "epoch": 0.32502695928419456, "grad_norm": 1.5215564966201782, "learning_rate": 8.916946465211715e-07, "loss": 0.1399, "step": 13940 }, { "epoch": 0.3252601206610125, "grad_norm": 2.7089779376983643, "learning_rate": 8.916169247031026e-07, "loss": 0.1305, "step": 13950 }, { "epoch": 0.32549328203783046, "grad_norm": 2.5246407985687256, "learning_rate": 8.915392028850338e-07, "loss": 0.135, "step": 13960 }, { "epoch": 0.32572644341464835, "grad_norm": 2.7754812240600586, "learning_rate": 8.91461481066965e-07, "loss": 0.1381, "step": 13970 }, { "epoch": 0.3259596047914663, "grad_norm": 2.68619441986084, "learning_rate": 8.913837592488963e-07, "loss": 0.1429, "step": 13980 }, { "epoch": 
0.32619276616828424, "grad_norm": 1.7636505365371704, "learning_rate": 8.913060374308276e-07, "loss": 0.1256, "step": 13990 }, { "epoch": 0.32642592754510213, "grad_norm": 1.2019169330596924, "learning_rate": 8.912283156127588e-07, "loss": 0.142, "step": 14000 }, { "epoch": 0.3266590889219201, "grad_norm": 1.4519755840301514, "learning_rate": 8.911505937946901e-07, "loss": 0.1289, "step": 14010 }, { "epoch": 0.32689225029873803, "grad_norm": 3.6495649814605713, "learning_rate": 8.910728719766212e-07, "loss": 0.1323, "step": 14020 }, { "epoch": 0.3271254116755559, "grad_norm": 3.039003849029541, "learning_rate": 8.909951501585524e-07, "loss": 0.1406, "step": 14030 }, { "epoch": 0.32735857305237387, "grad_norm": 1.4119465351104736, "learning_rate": 8.909174283404837e-07, "loss": 0.1481, "step": 14040 }, { "epoch": 0.3275917344291918, "grad_norm": 2.856067180633545, "learning_rate": 8.908397065224149e-07, "loss": 0.1399, "step": 14050 }, { "epoch": 0.3278248958060097, "grad_norm": 2.516979217529297, "learning_rate": 8.907619847043462e-07, "loss": 0.1348, "step": 14060 }, { "epoch": 0.32805805718282766, "grad_norm": 3.0837466716766357, "learning_rate": 8.906842628862774e-07, "loss": 0.1353, "step": 14070 }, { "epoch": 0.3282912185596456, "grad_norm": 2.032292604446411, "learning_rate": 8.906065410682086e-07, "loss": 0.1312, "step": 14080 }, { "epoch": 0.32852437993646355, "grad_norm": 1.2905040979385376, "learning_rate": 8.905288192501399e-07, "loss": 0.1318, "step": 14090 }, { "epoch": 0.32875754131328144, "grad_norm": 3.137739419937134, "learning_rate": 8.904510974320711e-07, "loss": 0.1426, "step": 14100 }, { "epoch": 0.3289907026900994, "grad_norm": 1.624365210533142, "learning_rate": 8.903733756140023e-07, "loss": 0.1406, "step": 14110 }, { "epoch": 0.32922386406691734, "grad_norm": 1.799228310585022, "learning_rate": 8.903034259777404e-07, "loss": 0.1403, "step": 14120 }, { "epoch": 0.32945702544373523, "grad_norm": 4.183528423309326, "learning_rate": 
8.902257041596717e-07, "loss": 0.1289, "step": 14130 }, { "epoch": 0.3296901868205532, "grad_norm": 2.5459606647491455, "learning_rate": 8.901479823416029e-07, "loss": 0.1323, "step": 14140 }, { "epoch": 0.3299233481973711, "grad_norm": 1.4283517599105835, "learning_rate": 8.900702605235342e-07, "loss": 0.146, "step": 14150 }, { "epoch": 0.330156509574189, "grad_norm": 4.80924129486084, "learning_rate": 8.899925387054654e-07, "loss": 0.1432, "step": 14160 }, { "epoch": 0.33038967095100696, "grad_norm": 2.384676933288574, "learning_rate": 8.899148168873965e-07, "loss": 0.1406, "step": 14170 }, { "epoch": 0.3306228323278249, "grad_norm": 1.3992362022399902, "learning_rate": 8.898370950693278e-07, "loss": 0.1358, "step": 14180 }, { "epoch": 0.3308559937046428, "grad_norm": 2.2352499961853027, "learning_rate": 8.89759373251259e-07, "loss": 0.1395, "step": 14190 }, { "epoch": 0.33108915508146075, "grad_norm": 3.2102131843566895, "learning_rate": 8.896816514331903e-07, "loss": 0.1375, "step": 14200 }, { "epoch": 0.3313223164582787, "grad_norm": 1.3674569129943848, "learning_rate": 8.896039296151216e-07, "loss": 0.1423, "step": 14210 }, { "epoch": 0.3315554778350966, "grad_norm": 2.361687183380127, "learning_rate": 8.895262077970528e-07, "loss": 0.1413, "step": 14220 }, { "epoch": 0.33178863921191454, "grad_norm": 2.4387412071228027, "learning_rate": 8.894484859789841e-07, "loss": 0.1386, "step": 14230 }, { "epoch": 0.3320218005887325, "grad_norm": 1.5714573860168457, "learning_rate": 8.893707641609151e-07, "loss": 0.1353, "step": 14240 }, { "epoch": 0.33225496196555043, "grad_norm": 1.503699541091919, "learning_rate": 8.892930423428464e-07, "loss": 0.1533, "step": 14250 }, { "epoch": 0.3324881233423683, "grad_norm": 2.047105550765991, "learning_rate": 8.892153205247777e-07, "loss": 0.1396, "step": 14260 }, { "epoch": 0.33272128471918627, "grad_norm": 1.7269877195358276, "learning_rate": 8.891375987067089e-07, "loss": 0.1405, "step": 14270 }, { "epoch": 
0.3329544460960042, "grad_norm": 2.987769603729248, "learning_rate": 8.890598768886402e-07, "loss": 0.1297, "step": 14280 }, { "epoch": 0.3331876074728221, "grad_norm": 1.3583296537399292, "learning_rate": 8.889821550705714e-07, "loss": 0.1303, "step": 14290 }, { "epoch": 0.33342076884964006, "grad_norm": 4.618428707122803, "learning_rate": 8.889044332525026e-07, "loss": 0.1378, "step": 14300 }, { "epoch": 0.333653930226458, "grad_norm": 2.1984403133392334, "learning_rate": 8.888267114344339e-07, "loss": 0.1378, "step": 14310 }, { "epoch": 0.3338870916032759, "grad_norm": 2.456956148147583, "learning_rate": 8.88748989616365e-07, "loss": 0.1476, "step": 14320 }, { "epoch": 0.33412025298009385, "grad_norm": 1.3345409631729126, "learning_rate": 8.886712677982963e-07, "loss": 0.1358, "step": 14330 }, { "epoch": 0.3343534143569118, "grad_norm": 2.620656967163086, "learning_rate": 8.885935459802275e-07, "loss": 0.1399, "step": 14340 }, { "epoch": 0.3345865757337297, "grad_norm": 2.1172549724578857, "learning_rate": 8.885158241621588e-07, "loss": 0.1301, "step": 14350 }, { "epoch": 0.33481973711054763, "grad_norm": 3.7220993041992188, "learning_rate": 8.8843810234409e-07, "loss": 0.1447, "step": 14360 }, { "epoch": 0.3350528984873656, "grad_norm": 1.690332055091858, "learning_rate": 8.883603805260212e-07, "loss": 0.131, "step": 14370 }, { "epoch": 0.33528605986418347, "grad_norm": 2.5240378379821777, "learning_rate": 8.882826587079525e-07, "loss": 0.1406, "step": 14380 }, { "epoch": 0.3355192212410014, "grad_norm": 2.7495322227478027, "learning_rate": 8.882049368898837e-07, "loss": 0.1406, "step": 14390 }, { "epoch": 0.33575238261781937, "grad_norm": 3.4927186965942383, "learning_rate": 8.881272150718149e-07, "loss": 0.1418, "step": 14400 }, { "epoch": 0.3359855439946373, "grad_norm": 1.4631426334381104, "learning_rate": 8.880494932537462e-07, "loss": 0.135, "step": 14410 }, { "epoch": 0.3362187053714552, "grad_norm": 2.576174736022949, "learning_rate": 
8.879717714356773e-07, "loss": 0.1333, "step": 14420 }, { "epoch": 0.33645186674827315, "grad_norm": 2.787463426589966, "learning_rate": 8.878940496176086e-07, "loss": 0.1403, "step": 14430 }, { "epoch": 0.3366850281250911, "grad_norm": 1.6669033765792847, "learning_rate": 8.878163277995398e-07, "loss": 0.1204, "step": 14440 }, { "epoch": 0.336918189501909, "grad_norm": 1.467349886894226, "learning_rate": 8.877386059814711e-07, "loss": 0.1378, "step": 14450 }, { "epoch": 0.33715135087872694, "grad_norm": 1.1792429685592651, "learning_rate": 8.876608841634024e-07, "loss": 0.1389, "step": 14460 }, { "epoch": 0.3373845122555449, "grad_norm": 1.3339639902114868, "learning_rate": 8.875831623453336e-07, "loss": 0.1429, "step": 14470 }, { "epoch": 0.3376176736323628, "grad_norm": 1.424220085144043, "learning_rate": 8.875054405272647e-07, "loss": 0.1403, "step": 14480 }, { "epoch": 0.3378508350091807, "grad_norm": 2.204131841659546, "learning_rate": 8.874277187091959e-07, "loss": 0.1404, "step": 14490 }, { "epoch": 0.3380839963859987, "grad_norm": 2.0131638050079346, "learning_rate": 8.873499968911272e-07, "loss": 0.1431, "step": 14500 }, { "epoch": 0.33831715776281657, "grad_norm": 1.4078984260559082, "learning_rate": 8.872722750730585e-07, "loss": 0.134, "step": 14510 }, { "epoch": 0.3385503191396345, "grad_norm": 2.2499191761016846, "learning_rate": 8.871945532549897e-07, "loss": 0.1319, "step": 14520 }, { "epoch": 0.33878348051645246, "grad_norm": 1.8657482862472534, "learning_rate": 8.87116831436921e-07, "loss": 0.1367, "step": 14530 }, { "epoch": 0.33901664189327035, "grad_norm": 1.8980034589767456, "learning_rate": 8.870391096188521e-07, "loss": 0.1503, "step": 14540 }, { "epoch": 0.3392498032700883, "grad_norm": 2.465742588043213, "learning_rate": 8.869613878007834e-07, "loss": 0.1218, "step": 14550 }, { "epoch": 0.33948296464690625, "grad_norm": 1.3478240966796875, "learning_rate": 8.868836659827146e-07, "loss": 0.1478, "step": 14560 }, { "epoch": 
0.3397161260237242, "grad_norm": 1.6982805728912354, "learning_rate": 8.868059441646458e-07, "loss": 0.1347, "step": 14570 }, { "epoch": 0.3399492874005421, "grad_norm": 2.7167162895202637, "learning_rate": 8.867282223465771e-07, "loss": 0.1403, "step": 14580 }, { "epoch": 0.34018244877736004, "grad_norm": 3.7537477016448975, "learning_rate": 8.866505005285084e-07, "loss": 0.1347, "step": 14590 }, { "epoch": 0.340415610154178, "grad_norm": 1.4840087890625, "learning_rate": 8.865727787104395e-07, "loss": 0.1319, "step": 14600 }, { "epoch": 0.3406487715309959, "grad_norm": 1.4885210990905762, "learning_rate": 8.864950568923708e-07, "loss": 0.1404, "step": 14610 }, { "epoch": 0.3408819329078138, "grad_norm": 6.491543292999268, "learning_rate": 8.86417335074302e-07, "loss": 0.1497, "step": 14620 }, { "epoch": 0.34111509428463177, "grad_norm": 2.4926955699920654, "learning_rate": 8.863396132562333e-07, "loss": 0.1302, "step": 14630 }, { "epoch": 0.34134825566144966, "grad_norm": 2.3440189361572266, "learning_rate": 8.862618914381645e-07, "loss": 0.1251, "step": 14640 }, { "epoch": 0.3415814170382676, "grad_norm": 2.2642650604248047, "learning_rate": 8.861841696200957e-07, "loss": 0.1359, "step": 14650 }, { "epoch": 0.34181457841508556, "grad_norm": 1.6929441690444946, "learning_rate": 8.861064478020269e-07, "loss": 0.1332, "step": 14660 }, { "epoch": 0.34204773979190345, "grad_norm": 1.916637897491455, "learning_rate": 8.860287259839581e-07, "loss": 0.1352, "step": 14670 }, { "epoch": 0.3422809011687214, "grad_norm": 1.149725317955017, "learning_rate": 8.859510041658894e-07, "loss": 0.1411, "step": 14680 }, { "epoch": 0.34251406254553934, "grad_norm": 1.8737952709197998, "learning_rate": 8.858732823478207e-07, "loss": 0.1405, "step": 14690 }, { "epoch": 0.34274722392235724, "grad_norm": 1.500564455986023, "learning_rate": 8.857955605297519e-07, "loss": 0.1391, "step": 14700 }, { "epoch": 0.3429803852991752, "grad_norm": 2.643176555633545, "learning_rate": 
8.857178387116832e-07, "loss": 0.1414, "step": 14710 }, { "epoch": 0.34321354667599313, "grad_norm": 2.4608232975006104, "learning_rate": 8.856401168936143e-07, "loss": 0.1269, "step": 14720 }, { "epoch": 0.3434467080528111, "grad_norm": 1.7617883682250977, "learning_rate": 8.855623950755455e-07, "loss": 0.1422, "step": 14730 }, { "epoch": 0.34367986942962897, "grad_norm": 2.0196034908294678, "learning_rate": 8.854846732574768e-07, "loss": 0.1382, "step": 14740 }, { "epoch": 0.3439130308064469, "grad_norm": 2.079979419708252, "learning_rate": 8.85406951439408e-07, "loss": 0.136, "step": 14750 }, { "epoch": 0.34414619218326487, "grad_norm": 1.5850732326507568, "learning_rate": 8.853292296213393e-07, "loss": 0.1439, "step": 14760 }, { "epoch": 0.34437935356008276, "grad_norm": 1.5618247985839844, "learning_rate": 8.852515078032705e-07, "loss": 0.1391, "step": 14770 }, { "epoch": 0.3446125149369007, "grad_norm": 2.6714794635772705, "learning_rate": 8.851737859852018e-07, "loss": 0.1397, "step": 14780 }, { "epoch": 0.34484567631371865, "grad_norm": 3.5973095893859863, "learning_rate": 8.85096064167133e-07, "loss": 0.1357, "step": 14790 }, { "epoch": 0.34507883769053654, "grad_norm": 2.2764577865600586, "learning_rate": 8.850183423490641e-07, "loss": 0.1497, "step": 14800 }, { "epoch": 0.3453119990673545, "grad_norm": 1.7504140138626099, "learning_rate": 8.849406205309954e-07, "loss": 0.1342, "step": 14810 }, { "epoch": 0.34554516044417244, "grad_norm": 1.3799431324005127, "learning_rate": 8.848628987129266e-07, "loss": 0.1345, "step": 14820 }, { "epoch": 0.34577832182099033, "grad_norm": 2.8346407413482666, "learning_rate": 8.847851768948579e-07, "loss": 0.1228, "step": 14830 }, { "epoch": 0.3460114831978083, "grad_norm": 1.1810246706008911, "learning_rate": 8.847074550767892e-07, "loss": 0.1387, "step": 14840 }, { "epoch": 0.3462446445746262, "grad_norm": 1.5482118129730225, "learning_rate": 8.846297332587203e-07, "loss": 0.1411, "step": 14850 }, { "epoch": 
0.3464778059514441, "grad_norm": 2.135044574737549, "learning_rate": 8.845520114406516e-07, "loss": 0.1458, "step": 14860 }, { "epoch": 0.34671096732826207, "grad_norm": 3.8938581943511963, "learning_rate": 8.844742896225828e-07, "loss": 0.1231, "step": 14870 }, { "epoch": 0.34694412870508, "grad_norm": 2.133964776992798, "learning_rate": 8.84396567804514e-07, "loss": 0.1296, "step": 14880 }, { "epoch": 0.34717729008189796, "grad_norm": 3.7647223472595215, "learning_rate": 8.843188459864453e-07, "loss": 0.1268, "step": 14890 }, { "epoch": 0.34741045145871585, "grad_norm": 1.2204699516296387, "learning_rate": 8.842411241683765e-07, "loss": 0.1373, "step": 14900 }, { "epoch": 0.3476436128355338, "grad_norm": 1.534919261932373, "learning_rate": 8.841634023503077e-07, "loss": 0.1231, "step": 14910 }, { "epoch": 0.34787677421235175, "grad_norm": 1.0374616384506226, "learning_rate": 8.840856805322389e-07, "loss": 0.1153, "step": 14920 }, { "epoch": 0.34810993558916964, "grad_norm": 2.7382357120513916, "learning_rate": 8.840079587141702e-07, "loss": 0.1327, "step": 14930 }, { "epoch": 0.3483430969659876, "grad_norm": 1.3849025964736938, "learning_rate": 8.839302368961015e-07, "loss": 0.1339, "step": 14940 }, { "epoch": 0.34857625834280553, "grad_norm": 1.899838924407959, "learning_rate": 8.838525150780327e-07, "loss": 0.1354, "step": 14950 }, { "epoch": 0.3488094197196234, "grad_norm": 1.4070786237716675, "learning_rate": 8.837747932599639e-07, "loss": 0.1241, "step": 14960 }, { "epoch": 0.3490425810964414, "grad_norm": 3.283252239227295, "learning_rate": 8.83697071441895e-07, "loss": 0.126, "step": 14970 }, { "epoch": 0.3492757424732593, "grad_norm": 1.2959879636764526, "learning_rate": 8.836193496238263e-07, "loss": 0.1343, "step": 14980 }, { "epoch": 0.3495089038500772, "grad_norm": 1.4706169366836548, "learning_rate": 8.835416278057576e-07, "loss": 0.1401, "step": 14990 }, { "epoch": 0.34974206522689516, "grad_norm": 2.822200298309326, "learning_rate": 
8.834639059876888e-07, "loss": 0.1249, "step": 15000 }, { "epoch": 0.3499752266037131, "grad_norm": 2.3544342517852783, "learning_rate": 8.833861841696201e-07, "loss": 0.1356, "step": 15010 }, { "epoch": 0.350208387980531, "grad_norm": 1.548917531967163, "learning_rate": 8.833084623515514e-07, "loss": 0.1276, "step": 15020 }, { "epoch": 0.35044154935734895, "grad_norm": 1.3580636978149414, "learning_rate": 8.832307405334826e-07, "loss": 0.127, "step": 15030 }, { "epoch": 0.3506747107341669, "grad_norm": 1.538211703300476, "learning_rate": 8.831530187154137e-07, "loss": 0.1387, "step": 15040 }, { "epoch": 0.35090787211098484, "grad_norm": 2.993055820465088, "learning_rate": 8.830752968973449e-07, "loss": 0.1408, "step": 15050 }, { "epoch": 0.35114103348780273, "grad_norm": 1.440277099609375, "learning_rate": 8.829975750792762e-07, "loss": 0.1349, "step": 15060 }, { "epoch": 0.3513741948646207, "grad_norm": 1.2474209070205688, "learning_rate": 8.829198532612075e-07, "loss": 0.1286, "step": 15070 }, { "epoch": 0.35160735624143863, "grad_norm": 1.555381417274475, "learning_rate": 8.828421314431387e-07, "loss": 0.143, "step": 15080 }, { "epoch": 0.3518405176182565, "grad_norm": 2.218867778778076, "learning_rate": 8.8276440962507e-07, "loss": 0.1441, "step": 15090 }, { "epoch": 0.35207367899507447, "grad_norm": 2.999371290206909, "learning_rate": 8.826866878070011e-07, "loss": 0.1388, "step": 15100 }, { "epoch": 0.3523068403718924, "grad_norm": 1.663413643836975, "learning_rate": 8.826089659889324e-07, "loss": 0.1346, "step": 15110 }, { "epoch": 0.3525400017487103, "grad_norm": 2.122267484664917, "learning_rate": 8.825312441708636e-07, "loss": 0.1278, "step": 15120 }, { "epoch": 0.35277316312552826, "grad_norm": 1.2835016250610352, "learning_rate": 8.824535223527948e-07, "loss": 0.1234, "step": 15130 }, { "epoch": 0.3530063245023462, "grad_norm": 1.3494186401367188, "learning_rate": 8.823758005347261e-07, "loss": 0.1198, "step": 15140 }, { "epoch": 0.3532394858791641, 
"grad_norm": 2.4559030532836914, "learning_rate": 8.822980787166573e-07, "loss": 0.1413, "step": 15150 }, { "epoch": 0.35347264725598204, "grad_norm": 1.6807056665420532, "learning_rate": 8.822203568985885e-07, "loss": 0.135, "step": 15160 }, { "epoch": 0.3537058086328, "grad_norm": 1.0854557752609253, "learning_rate": 8.821426350805198e-07, "loss": 0.1241, "step": 15170 }, { "epoch": 0.3539389700096179, "grad_norm": 3.2744979858398438, "learning_rate": 8.82064913262451e-07, "loss": 0.1311, "step": 15180 }, { "epoch": 0.35417213138643583, "grad_norm": 2.9927568435668945, "learning_rate": 8.819871914443823e-07, "loss": 0.1287, "step": 15190 }, { "epoch": 0.3544052927632538, "grad_norm": 3.549748420715332, "learning_rate": 8.819094696263134e-07, "loss": 0.1275, "step": 15200 }, { "epoch": 0.3546384541400717, "grad_norm": 3.1676454544067383, "learning_rate": 8.818317478082447e-07, "loss": 0.1284, "step": 15210 }, { "epoch": 0.3548716155168896, "grad_norm": 1.8554750680923462, "learning_rate": 8.817540259901759e-07, "loss": 0.1321, "step": 15220 }, { "epoch": 0.35510477689370756, "grad_norm": 2.0594730377197266, "learning_rate": 8.816763041721071e-07, "loss": 0.13, "step": 15230 }, { "epoch": 0.3553379382705255, "grad_norm": 2.3340647220611572, "learning_rate": 8.815985823540384e-07, "loss": 0.1395, "step": 15240 }, { "epoch": 0.3555710996473434, "grad_norm": 2.604783535003662, "learning_rate": 8.815208605359696e-07, "loss": 0.1378, "step": 15250 }, { "epoch": 0.35580426102416135, "grad_norm": 1.6030900478363037, "learning_rate": 8.814431387179009e-07, "loss": 0.1553, "step": 15260 }, { "epoch": 0.3560374224009793, "grad_norm": 1.56256103515625, "learning_rate": 8.813654168998322e-07, "loss": 0.1272, "step": 15270 }, { "epoch": 0.3562705837777972, "grad_norm": 2.557343006134033, "learning_rate": 8.812876950817632e-07, "loss": 0.1356, "step": 15280 }, { "epoch": 0.35650374515461514, "grad_norm": 1.7645169496536255, "learning_rate": 8.812099732636945e-07, "loss": 0.1359, 
"step": 15290 }, { "epoch": 0.3567369065314331, "grad_norm": 1.9533673524856567, "learning_rate": 8.811322514456257e-07, "loss": 0.1376, "step": 15300 }, { "epoch": 0.356970067908251, "grad_norm": 1.8062645196914673, "learning_rate": 8.81054529627557e-07, "loss": 0.1311, "step": 15310 }, { "epoch": 0.3572032292850689, "grad_norm": 2.3999476432800293, "learning_rate": 8.809768078094883e-07, "loss": 0.1274, "step": 15320 }, { "epoch": 0.35743639066188687, "grad_norm": 1.7787199020385742, "learning_rate": 8.808990859914195e-07, "loss": 0.146, "step": 15330 }, { "epoch": 0.35766955203870476, "grad_norm": 1.6691211462020874, "learning_rate": 8.808213641733507e-07, "loss": 0.1357, "step": 15340 }, { "epoch": 0.3579027134155227, "grad_norm": 2.158095121383667, "learning_rate": 8.80743642355282e-07, "loss": 0.1349, "step": 15350 }, { "epoch": 0.35813587479234066, "grad_norm": 2.2009928226470947, "learning_rate": 8.806659205372131e-07, "loss": 0.1336, "step": 15360 }, { "epoch": 0.3583690361691586, "grad_norm": 1.3198360204696655, "learning_rate": 8.805881987191444e-07, "loss": 0.1398, "step": 15370 }, { "epoch": 0.3586021975459765, "grad_norm": 2.282410144805908, "learning_rate": 8.805104769010756e-07, "loss": 0.1288, "step": 15380 }, { "epoch": 0.35883535892279445, "grad_norm": 3.367795705795288, "learning_rate": 8.804327550830069e-07, "loss": 0.1357, "step": 15390 }, { "epoch": 0.3590685202996124, "grad_norm": 1.2040061950683594, "learning_rate": 8.80355033264938e-07, "loss": 0.1327, "step": 15400 }, { "epoch": 0.3593016816764303, "grad_norm": 1.6694109439849854, "learning_rate": 8.802773114468693e-07, "loss": 0.1242, "step": 15410 }, { "epoch": 0.35953484305324823, "grad_norm": 1.4711799621582031, "learning_rate": 8.801995896288006e-07, "loss": 0.1364, "step": 15420 }, { "epoch": 0.3597680044300662, "grad_norm": 1.672440528869629, "learning_rate": 8.801218678107318e-07, "loss": 0.1488, "step": 15430 }, { "epoch": 0.36000116580688407, "grad_norm": 2.022994041442871, 
"learning_rate": 8.80044145992663e-07, "loss": 0.1353, "step": 15440 }, { "epoch": 0.360234327183702, "grad_norm": 3.0548360347747803, "learning_rate": 8.799664241745943e-07, "loss": 0.1309, "step": 15450 }, { "epoch": 0.36046748856051997, "grad_norm": 2.8393096923828125, "learning_rate": 8.798887023565254e-07, "loss": 0.1286, "step": 15460 }, { "epoch": 0.36070064993733786, "grad_norm": 1.9888348579406738, "learning_rate": 8.798109805384567e-07, "loss": 0.132, "step": 15470 }, { "epoch": 0.3609338113141558, "grad_norm": 1.2992274761199951, "learning_rate": 8.797332587203879e-07, "loss": 0.1261, "step": 15480 }, { "epoch": 0.36116697269097375, "grad_norm": 1.5036565065383911, "learning_rate": 8.796555369023192e-07, "loss": 0.127, "step": 15490 }, { "epoch": 0.36140013406779165, "grad_norm": 2.3333990573883057, "learning_rate": 8.795778150842505e-07, "loss": 0.1368, "step": 15500 }, { "epoch": 0.3616332954446096, "grad_norm": 1.709687352180481, "learning_rate": 8.795000932661817e-07, "loss": 0.1265, "step": 15510 }, { "epoch": 0.36186645682142754, "grad_norm": 1.439237117767334, "learning_rate": 8.794223714481128e-07, "loss": 0.1305, "step": 15520 }, { "epoch": 0.3620996181982455, "grad_norm": 4.78853178024292, "learning_rate": 8.79344649630044e-07, "loss": 0.1428, "step": 15530 }, { "epoch": 0.3623327795750634, "grad_norm": 1.2218928337097168, "learning_rate": 8.792669278119753e-07, "loss": 0.1326, "step": 15540 }, { "epoch": 0.3625659409518813, "grad_norm": 2.4250152111053467, "learning_rate": 8.791892059939066e-07, "loss": 0.1276, "step": 15550 }, { "epoch": 0.3627991023286993, "grad_norm": 1.3668214082717896, "learning_rate": 8.791114841758378e-07, "loss": 0.1363, "step": 15560 }, { "epoch": 0.36303226370551717, "grad_norm": 1.7724261283874512, "learning_rate": 8.790337623577691e-07, "loss": 0.138, "step": 15570 }, { "epoch": 0.3632654250823351, "grad_norm": 1.5179548263549805, "learning_rate": 8.789560405397003e-07, "loss": 0.1248, "step": 15580 }, { "epoch": 
0.36349858645915306, "grad_norm": 3.6114022731781006, "learning_rate": 8.788783187216315e-07, "loss": 0.1326, "step": 15590 }, { "epoch": 0.36373174783597095, "grad_norm": 1.6900838613510132, "learning_rate": 8.788005969035627e-07, "loss": 0.1408, "step": 15600 }, { "epoch": 0.3639649092127889, "grad_norm": 1.3060518503189087, "learning_rate": 8.787228750854939e-07, "loss": 0.1259, "step": 15610 }, { "epoch": 0.36419807058960685, "grad_norm": 2.126866340637207, "learning_rate": 8.786451532674252e-07, "loss": 0.1446, "step": 15620 }, { "epoch": 0.36443123196642474, "grad_norm": 2.8756866455078125, "learning_rate": 8.785674314493564e-07, "loss": 0.1476, "step": 15630 }, { "epoch": 0.3646643933432427, "grad_norm": 1.3215007781982422, "learning_rate": 8.784897096312877e-07, "loss": 0.1234, "step": 15640 }, { "epoch": 0.36489755472006064, "grad_norm": 2.317068576812744, "learning_rate": 8.784119878132189e-07, "loss": 0.1523, "step": 15650 }, { "epoch": 0.3651307160968785, "grad_norm": 1.6288387775421143, "learning_rate": 8.783342659951501e-07, "loss": 0.1256, "step": 15660 }, { "epoch": 0.3653638774736965, "grad_norm": 1.3623466491699219, "learning_rate": 8.782565441770814e-07, "loss": 0.1256, "step": 15670 }, { "epoch": 0.3655970388505144, "grad_norm": 1.5471745729446411, "learning_rate": 8.781788223590125e-07, "loss": 0.1443, "step": 15680 }, { "epoch": 0.36583020022733237, "grad_norm": 3.6286122798919678, "learning_rate": 8.781011005409438e-07, "loss": 0.1327, "step": 15690 }, { "epoch": 0.36606336160415026, "grad_norm": 1.5544604063034058, "learning_rate": 8.780233787228751e-07, "loss": 0.1522, "step": 15700 }, { "epoch": 0.3662965229809682, "grad_norm": 1.3882954120635986, "learning_rate": 8.779456569048062e-07, "loss": 0.1265, "step": 15710 }, { "epoch": 0.36652968435778616, "grad_norm": 1.8526707887649536, "learning_rate": 8.778679350867375e-07, "loss": 0.1238, "step": 15720 }, { "epoch": 0.36676284573460405, "grad_norm": 1.2960621118545532, "learning_rate": 
8.777902132686687e-07, "loss": 0.1416, "step": 15730 }, { "epoch": 0.366996007111422, "grad_norm": 1.4155279397964478, "learning_rate": 8.777124914506e-07, "loss": 0.1438, "step": 15740 }, { "epoch": 0.36722916848823994, "grad_norm": 1.8601843118667603, "learning_rate": 8.776347696325313e-07, "loss": 0.1363, "step": 15750 }, { "epoch": 0.36746232986505784, "grad_norm": 1.8526866436004639, "learning_rate": 8.775570478144624e-07, "loss": 0.1297, "step": 15760 }, { "epoch": 0.3676954912418758, "grad_norm": 3.2852578163146973, "learning_rate": 8.774793259963936e-07, "loss": 0.1309, "step": 15770 }, { "epoch": 0.36792865261869373, "grad_norm": 2.7516636848449707, "learning_rate": 8.774016041783248e-07, "loss": 0.1368, "step": 15780 }, { "epoch": 0.3681618139955116, "grad_norm": 1.203426480293274, "learning_rate": 8.773238823602561e-07, "loss": 0.1405, "step": 15790 }, { "epoch": 0.36839497537232957, "grad_norm": 1.5263959169387817, "learning_rate": 8.772461605421874e-07, "loss": 0.1364, "step": 15800 }, { "epoch": 0.3686281367491475, "grad_norm": 1.5971925258636475, "learning_rate": 8.771684387241186e-07, "loss": 0.1412, "step": 15810 }, { "epoch": 0.3688612981259654, "grad_norm": 2.6197547912597656, "learning_rate": 8.770907169060499e-07, "loss": 0.1249, "step": 15820 }, { "epoch": 0.36909445950278336, "grad_norm": 2.4797608852386475, "learning_rate": 8.77012995087981e-07, "loss": 0.1277, "step": 15830 }, { "epoch": 0.3693276208796013, "grad_norm": 2.23030686378479, "learning_rate": 8.769352732699122e-07, "loss": 0.1324, "step": 15840 }, { "epoch": 0.36956078225641925, "grad_norm": 3.379575490951538, "learning_rate": 8.768575514518435e-07, "loss": 0.1276, "step": 15850 }, { "epoch": 0.36979394363323714, "grad_norm": 2.2646501064300537, "learning_rate": 8.767798296337747e-07, "loss": 0.1421, "step": 15860 }, { "epoch": 0.3700271050100551, "grad_norm": 2.601970911026001, "learning_rate": 8.76702107815706e-07, "loss": 0.1217, "step": 15870 }, { "epoch": 
0.37026026638687304, "grad_norm": 1.6703084707260132, "learning_rate": 8.766243859976373e-07, "loss": 0.1275, "step": 15880 }, { "epoch": 0.37049342776369093, "grad_norm": 1.19613778591156, "learning_rate": 8.765466641795684e-07, "loss": 0.1384, "step": 15890 }, { "epoch": 0.3707265891405089, "grad_norm": 2.585622787475586, "learning_rate": 8.764689423614997e-07, "loss": 0.1358, "step": 15900 }, { "epoch": 0.3709597505173268, "grad_norm": 2.9158451557159424, "learning_rate": 8.763912205434309e-07, "loss": 0.1206, "step": 15910 }, { "epoch": 0.3711929118941447, "grad_norm": 4.853145599365234, "learning_rate": 8.763134987253622e-07, "loss": 0.1286, "step": 15920 }, { "epoch": 0.37142607327096266, "grad_norm": 3.349980354309082, "learning_rate": 8.762357769072934e-07, "loss": 0.1408, "step": 15930 }, { "epoch": 0.3716592346477806, "grad_norm": 2.0669894218444824, "learning_rate": 8.761580550892246e-07, "loss": 0.1333, "step": 15940 }, { "epoch": 0.3718923960245985, "grad_norm": 1.5181355476379395, "learning_rate": 8.760803332711559e-07, "loss": 0.1399, "step": 15950 }, { "epoch": 0.37212555740141645, "grad_norm": 1.5578700304031372, "learning_rate": 8.76002611453087e-07, "loss": 0.1298, "step": 15960 }, { "epoch": 0.3723587187782344, "grad_norm": 2.236844539642334, "learning_rate": 8.759248896350183e-07, "loss": 0.1434, "step": 15970 }, { "epoch": 0.3725918801550523, "grad_norm": 1.3446286916732788, "learning_rate": 8.758471678169496e-07, "loss": 0.1329, "step": 15980 }, { "epoch": 0.37282504153187024, "grad_norm": 2.0666913986206055, "learning_rate": 8.757694459988808e-07, "loss": 0.1309, "step": 15990 }, { "epoch": 0.3730582029086882, "grad_norm": 1.6176471710205078, "learning_rate": 8.756917241808121e-07, "loss": 0.1433, "step": 16000 }, { "epoch": 0.37329136428550613, "grad_norm": 1.2968028783798218, "learning_rate": 8.756140023627432e-07, "loss": 0.1385, "step": 16010 }, { "epoch": 0.373524525662324, "grad_norm": 1.997534990310669, "learning_rate": 
8.755362805446744e-07, "loss": 0.128, "step": 16020 }, { "epoch": 0.373757687039142, "grad_norm": 1.1280149221420288, "learning_rate": 8.754585587266057e-07, "loss": 0.1239, "step": 16030 }, { "epoch": 0.3739908484159599, "grad_norm": 1.3465979099273682, "learning_rate": 8.753808369085369e-07, "loss": 0.122, "step": 16040 }, { "epoch": 0.3742240097927778, "grad_norm": 1.8605303764343262, "learning_rate": 8.753031150904682e-07, "loss": 0.1235, "step": 16050 }, { "epoch": 0.37445717116959576, "grad_norm": 1.6932133436203003, "learning_rate": 8.752253932723994e-07, "loss": 0.1369, "step": 16060 }, { "epoch": 0.3746903325464137, "grad_norm": 1.8933714628219604, "learning_rate": 8.751476714543307e-07, "loss": 0.1381, "step": 16070 }, { "epoch": 0.3749234939232316, "grad_norm": 1.856406569480896, "learning_rate": 8.750699496362619e-07, "loss": 0.1472, "step": 16080 }, { "epoch": 0.37515665530004955, "grad_norm": 2.652500867843628, "learning_rate": 8.74992227818193e-07, "loss": 0.1498, "step": 16090 }, { "epoch": 0.3753898166768675, "grad_norm": 1.5748658180236816, "learning_rate": 8.749145060001243e-07, "loss": 0.137, "step": 16100 }, { "epoch": 0.3756229780536854, "grad_norm": 1.5493146181106567, "learning_rate": 8.748367841820555e-07, "loss": 0.1358, "step": 16110 }, { "epoch": 0.37585613943050333, "grad_norm": 1.8439927101135254, "learning_rate": 8.747590623639868e-07, "loss": 0.132, "step": 16120 }, { "epoch": 0.3760893008073213, "grad_norm": 3.3467981815338135, "learning_rate": 8.746891127277249e-07, "loss": 0.1395, "step": 16130 }, { "epoch": 0.3763224621841392, "grad_norm": 1.3037248849868774, "learning_rate": 8.746113909096561e-07, "loss": 0.138, "step": 16140 }, { "epoch": 0.3765556235609571, "grad_norm": 3.0726912021636963, "learning_rate": 8.745336690915873e-07, "loss": 0.1246, "step": 16150 }, { "epoch": 0.37678878493777507, "grad_norm": 2.02734375, "learning_rate": 8.744559472735186e-07, "loss": 0.1352, "step": 16160 }, { "epoch": 0.377021946314593, 
"grad_norm": 2.2910289764404297, "learning_rate": 8.743782254554498e-07, "loss": 0.1461, "step": 16170 }, { "epoch": 0.3772551076914109, "grad_norm": 1.509100079536438, "learning_rate": 8.74300503637381e-07, "loss": 0.1396, "step": 16180 }, { "epoch": 0.37748826906822885, "grad_norm": 1.359711766242981, "learning_rate": 8.742227818193123e-07, "loss": 0.1297, "step": 16190 }, { "epoch": 0.3777214304450468, "grad_norm": 1.6400173902511597, "learning_rate": 8.741450600012436e-07, "loss": 0.1308, "step": 16200 }, { "epoch": 0.3779545918218647, "grad_norm": 2.6388649940490723, "learning_rate": 8.740673381831748e-07, "loss": 0.1295, "step": 16210 }, { "epoch": 0.37818775319868264, "grad_norm": 1.476657509803772, "learning_rate": 8.73989616365106e-07, "loss": 0.1381, "step": 16220 }, { "epoch": 0.3784209145755006, "grad_norm": 1.5781691074371338, "learning_rate": 8.739118945470371e-07, "loss": 0.1325, "step": 16230 }, { "epoch": 0.3786540759523185, "grad_norm": 3.361797332763672, "learning_rate": 8.738341727289684e-07, "loss": 0.127, "step": 16240 }, { "epoch": 0.37888723732913643, "grad_norm": 2.584003448486328, "learning_rate": 8.737564509108997e-07, "loss": 0.1273, "step": 16250 }, { "epoch": 0.3791203987059544, "grad_norm": 1.296814203262329, "learning_rate": 8.736787290928309e-07, "loss": 0.119, "step": 16260 }, { "epoch": 0.37935356008277227, "grad_norm": 1.3015645742416382, "learning_rate": 8.736010072747622e-07, "loss": 0.1277, "step": 16270 }, { "epoch": 0.3795867214595902, "grad_norm": 1.4195148944854736, "learning_rate": 8.735232854566934e-07, "loss": 0.1389, "step": 16280 }, { "epoch": 0.37981988283640816, "grad_norm": 1.5117418766021729, "learning_rate": 8.734455636386247e-07, "loss": 0.1323, "step": 16290 }, { "epoch": 0.38005304421322605, "grad_norm": 1.2838040590286255, "learning_rate": 8.733678418205558e-07, "loss": 0.1379, "step": 16300 }, { "epoch": 0.380286205590044, "grad_norm": 1.7798833847045898, "learning_rate": 8.73290120002487e-07, "loss": 
0.1259, "step": 16310 }, { "epoch": 0.38051936696686195, "grad_norm": 1.1842836141586304, "learning_rate": 8.732123981844183e-07, "loss": 0.1279, "step": 16320 }, { "epoch": 0.3807525283436799, "grad_norm": 2.1938812732696533, "learning_rate": 8.731346763663495e-07, "loss": 0.1442, "step": 16330 }, { "epoch": 0.3809856897204978, "grad_norm": 4.196570873260498, "learning_rate": 8.730569545482808e-07, "loss": 0.1455, "step": 16340 }, { "epoch": 0.38121885109731574, "grad_norm": 1.8331167697906494, "learning_rate": 8.729792327302121e-07, "loss": 0.1359, "step": 16350 }, { "epoch": 0.3814520124741337, "grad_norm": 2.038362979888916, "learning_rate": 8.729015109121432e-07, "loss": 0.1379, "step": 16360 }, { "epoch": 0.3816851738509516, "grad_norm": 2.1961169242858887, "learning_rate": 8.728237890940745e-07, "loss": 0.1352, "step": 16370 }, { "epoch": 0.3819183352277695, "grad_norm": 1.6587620973587036, "learning_rate": 8.727460672760056e-07, "loss": 0.1148, "step": 16380 }, { "epoch": 0.38215149660458747, "grad_norm": 2.48441481590271, "learning_rate": 8.726683454579369e-07, "loss": 0.1212, "step": 16390 }, { "epoch": 0.38238465798140536, "grad_norm": 2.2665295600891113, "learning_rate": 8.725906236398682e-07, "loss": 0.1232, "step": 16400 }, { "epoch": 0.3826178193582233, "grad_norm": 2.992354393005371, "learning_rate": 8.725129018217994e-07, "loss": 0.1346, "step": 16410 }, { "epoch": 0.38285098073504126, "grad_norm": 1.459833025932312, "learning_rate": 8.724351800037306e-07, "loss": 0.1417, "step": 16420 }, { "epoch": 0.38308414211185915, "grad_norm": 1.254077434539795, "learning_rate": 8.723574581856618e-07, "loss": 0.1308, "step": 16430 }, { "epoch": 0.3833173034886771, "grad_norm": 1.543615698814392, "learning_rate": 8.722797363675931e-07, "loss": 0.1399, "step": 16440 }, { "epoch": 0.38355046486549504, "grad_norm": 1.511270523071289, "learning_rate": 8.722020145495244e-07, "loss": 0.1244, "step": 16450 }, { "epoch": 0.38378362624231294, "grad_norm": 
1.3233979940414429, "learning_rate": 8.721242927314555e-07, "loss": 0.1399, "step": 16460 }, { "epoch": 0.3840167876191309, "grad_norm": 3.0042755603790283, "learning_rate": 8.720465709133868e-07, "loss": 0.1309, "step": 16470 }, { "epoch": 0.38424994899594883, "grad_norm": 2.3106565475463867, "learning_rate": 8.719688490953179e-07, "loss": 0.129, "step": 16480 }, { "epoch": 0.3844831103727668, "grad_norm": 2.6406309604644775, "learning_rate": 8.718911272772492e-07, "loss": 0.1276, "step": 16490 }, { "epoch": 0.38471627174958467, "grad_norm": 2.5526087284088135, "learning_rate": 8.718134054591805e-07, "loss": 0.1287, "step": 16500 }, { "epoch": 0.3849494331264026, "grad_norm": 2.1722588539123535, "learning_rate": 8.717356836411117e-07, "loss": 0.1454, "step": 16510 }, { "epoch": 0.38518259450322057, "grad_norm": 2.9101998805999756, "learning_rate": 8.71657961823043e-07, "loss": 0.1332, "step": 16520 }, { "epoch": 0.38541575588003846, "grad_norm": 1.2313460111618042, "learning_rate": 8.715802400049743e-07, "loss": 0.13, "step": 16530 }, { "epoch": 0.3856489172568564, "grad_norm": 1.9222431182861328, "learning_rate": 8.715025181869053e-07, "loss": 0.1272, "step": 16540 }, { "epoch": 0.38588207863367435, "grad_norm": 2.1335089206695557, "learning_rate": 8.714247963688366e-07, "loss": 0.1352, "step": 16550 }, { "epoch": 0.38611524001049224, "grad_norm": 3.63944935798645, "learning_rate": 8.713470745507678e-07, "loss": 0.1271, "step": 16560 }, { "epoch": 0.3863484013873102, "grad_norm": 2.2006046772003174, "learning_rate": 8.712693527326991e-07, "loss": 0.1406, "step": 16570 }, { "epoch": 0.38658156276412814, "grad_norm": 1.4729796648025513, "learning_rate": 8.711916309146303e-07, "loss": 0.1462, "step": 16580 }, { "epoch": 0.38681472414094603, "grad_norm": 1.315673828125, "learning_rate": 8.711139090965616e-07, "loss": 0.1322, "step": 16590 }, { "epoch": 0.387047885517764, "grad_norm": 2.1680121421813965, "learning_rate": 8.710361872784928e-07, "loss": 0.1291, "step": 
16600 }, { "epoch": 0.3872810468945819, "grad_norm": 1.2277528047561646, "learning_rate": 8.70958465460424e-07, "loss": 0.1322, "step": 16610 }, { "epoch": 0.3875142082713998, "grad_norm": 1.9034746885299683, "learning_rate": 8.708807436423552e-07, "loss": 0.1394, "step": 16620 }, { "epoch": 0.38774736964821777, "grad_norm": 4.498508930206299, "learning_rate": 8.708030218242864e-07, "loss": 0.1281, "step": 16630 }, { "epoch": 0.3879805310250357, "grad_norm": 2.555044174194336, "learning_rate": 8.707253000062177e-07, "loss": 0.1403, "step": 16640 }, { "epoch": 0.38821369240185366, "grad_norm": 1.6244919300079346, "learning_rate": 8.706553503699558e-07, "loss": 0.1367, "step": 16650 }, { "epoch": 0.38844685377867155, "grad_norm": 1.3371093273162842, "learning_rate": 8.705776285518871e-07, "loss": 0.124, "step": 16660 }, { "epoch": 0.3886800151554895, "grad_norm": 1.617161512374878, "learning_rate": 8.704999067338183e-07, "loss": 0.1253, "step": 16670 }, { "epoch": 0.38891317653230745, "grad_norm": 1.8498241901397705, "learning_rate": 8.704221849157495e-07, "loss": 0.1427, "step": 16680 }, { "epoch": 0.38914633790912534, "grad_norm": 1.9513883590698242, "learning_rate": 8.703444630976807e-07, "loss": 0.1287, "step": 16690 }, { "epoch": 0.3893794992859433, "grad_norm": 3.3291149139404297, "learning_rate": 8.702667412796119e-07, "loss": 0.1332, "step": 16700 }, { "epoch": 0.38961266066276123, "grad_norm": 3.0755393505096436, "learning_rate": 8.701890194615432e-07, "loss": 0.1154, "step": 16710 }, { "epoch": 0.3898458220395791, "grad_norm": 1.5384019613265991, "learning_rate": 8.701112976434745e-07, "loss": 0.1305, "step": 16720 }, { "epoch": 0.3900789834163971, "grad_norm": 2.770692825317383, "learning_rate": 8.700335758254057e-07, "loss": 0.1381, "step": 16730 }, { "epoch": 0.390312144793215, "grad_norm": 2.2039005756378174, "learning_rate": 8.69955854007337e-07, "loss": 0.1369, "step": 16740 }, { "epoch": 0.3905453061700329, "grad_norm": 1.4546325206756592, 
"learning_rate": 8.698781321892681e-07, "loss": 0.1196, "step": 16750 }, { "epoch": 0.39077846754685086, "grad_norm": 1.2050985097885132, "learning_rate": 8.698004103711993e-07, "loss": 0.1265, "step": 16760 }, { "epoch": 0.3910116289236688, "grad_norm": 1.525009036064148, "learning_rate": 8.697226885531306e-07, "loss": 0.132, "step": 16770 }, { "epoch": 0.3912447903004867, "grad_norm": 1.484649658203125, "learning_rate": 8.696449667350618e-07, "loss": 0.1352, "step": 16780 }, { "epoch": 0.39147795167730465, "grad_norm": 1.1263837814331055, "learning_rate": 8.695672449169931e-07, "loss": 0.1293, "step": 16790 }, { "epoch": 0.3917111130541226, "grad_norm": 1.598279595375061, "learning_rate": 8.694895230989243e-07, "loss": 0.1319, "step": 16800 }, { "epoch": 0.39194427443094054, "grad_norm": 1.1120654344558716, "learning_rate": 8.694118012808556e-07, "loss": 0.1355, "step": 16810 }, { "epoch": 0.39217743580775843, "grad_norm": 2.973209857940674, "learning_rate": 8.693340794627868e-07, "loss": 0.1374, "step": 16820 }, { "epoch": 0.3924105971845764, "grad_norm": 2.235090970993042, "learning_rate": 8.692563576447179e-07, "loss": 0.1202, "step": 16830 }, { "epoch": 0.39264375856139433, "grad_norm": 2.3318212032318115, "learning_rate": 8.691786358266492e-07, "loss": 0.1386, "step": 16840 }, { "epoch": 0.3928769199382122, "grad_norm": 4.059518337249756, "learning_rate": 8.691009140085804e-07, "loss": 0.1351, "step": 16850 }, { "epoch": 0.39311008131503017, "grad_norm": 1.7449617385864258, "learning_rate": 8.690231921905117e-07, "loss": 0.1539, "step": 16860 }, { "epoch": 0.3933432426918481, "grad_norm": 1.3198844194412231, "learning_rate": 8.68945470372443e-07, "loss": 0.1311, "step": 16870 }, { "epoch": 0.393576404068666, "grad_norm": 1.857430100440979, "learning_rate": 8.688677485543741e-07, "loss": 0.1228, "step": 16880 }, { "epoch": 0.39380956544548396, "grad_norm": 1.8706423044204712, "learning_rate": 8.687900267363054e-07, "loss": 0.1322, "step": 16890 }, { "epoch": 
0.3940427268223019, "grad_norm": 1.569727897644043, "learning_rate": 8.687123049182366e-07, "loss": 0.1261, "step": 16900 }, { "epoch": 0.3942758881991198, "grad_norm": 3.6822855472564697, "learning_rate": 8.686345831001678e-07, "loss": 0.1449, "step": 16910 }, { "epoch": 0.39450904957593774, "grad_norm": 1.4401839971542358, "learning_rate": 8.685568612820991e-07, "loss": 0.1289, "step": 16920 }, { "epoch": 0.3947422109527557, "grad_norm": 1.9282184839248657, "learning_rate": 8.684791394640303e-07, "loss": 0.1402, "step": 16930 }, { "epoch": 0.3949753723295736, "grad_norm": 1.8620617389678955, "learning_rate": 8.684014176459615e-07, "loss": 0.1295, "step": 16940 }, { "epoch": 0.39520853370639153, "grad_norm": 2.0035018920898438, "learning_rate": 8.683236958278927e-07, "loss": 0.1269, "step": 16950 }, { "epoch": 0.3954416950832095, "grad_norm": 1.1754662990570068, "learning_rate": 8.68245974009824e-07, "loss": 0.1215, "step": 16960 }, { "epoch": 0.39567485646002737, "grad_norm": 1.3146405220031738, "learning_rate": 8.681682521917553e-07, "loss": 0.1256, "step": 16970 }, { "epoch": 0.3959080178368453, "grad_norm": 1.2488282918930054, "learning_rate": 8.680905303736865e-07, "loss": 0.1227, "step": 16980 }, { "epoch": 0.39614117921366326, "grad_norm": 3.2428135871887207, "learning_rate": 8.680128085556177e-07, "loss": 0.1419, "step": 16990 }, { "epoch": 0.3963743405904812, "grad_norm": 2.300042152404785, "learning_rate": 8.679350867375488e-07, "loss": 0.1235, "step": 17000 }, { "epoch": 0.3966075019672991, "grad_norm": 1.4919278621673584, "learning_rate": 8.678573649194801e-07, "loss": 0.1254, "step": 17010 }, { "epoch": 0.39684066334411705, "grad_norm": 1.262893795967102, "learning_rate": 8.677796431014114e-07, "loss": 0.1307, "step": 17020 }, { "epoch": 0.397073824720935, "grad_norm": 2.0724079608917236, "learning_rate": 8.677019212833426e-07, "loss": 0.1514, "step": 17030 }, { "epoch": 0.3973069860977529, "grad_norm": 1.4980350732803345, "learning_rate": 
8.676241994652739e-07, "loss": 0.1248, "step": 17040 }, { "epoch": 0.39754014747457084, "grad_norm": 1.382866621017456, "learning_rate": 8.675464776472052e-07, "loss": 0.1346, "step": 17050 }, { "epoch": 0.3977733088513888, "grad_norm": 1.4530954360961914, "learning_rate": 8.674687558291364e-07, "loss": 0.1261, "step": 17060 }, { "epoch": 0.3980064702282067, "grad_norm": 3.0745842456817627, "learning_rate": 8.673910340110675e-07, "loss": 0.1282, "step": 17070 }, { "epoch": 0.3982396316050246, "grad_norm": 1.4414923191070557, "learning_rate": 8.673133121929987e-07, "loss": 0.1475, "step": 17080 }, { "epoch": 0.39847279298184257, "grad_norm": 1.2089489698410034, "learning_rate": 8.6723559037493e-07, "loss": 0.1396, "step": 17090 }, { "epoch": 0.39870595435866046, "grad_norm": 3.2150115966796875, "learning_rate": 8.671578685568613e-07, "loss": 0.1452, "step": 17100 }, { "epoch": 0.3989391157354784, "grad_norm": 1.377091646194458, "learning_rate": 8.670801467387925e-07, "loss": 0.13, "step": 17110 }, { "epoch": 0.39917227711229636, "grad_norm": 1.818225383758545, "learning_rate": 8.670024249207238e-07, "loss": 0.1245, "step": 17120 }, { "epoch": 0.39940543848911425, "grad_norm": 2.785620927810669, "learning_rate": 8.669247031026549e-07, "loss": 0.1306, "step": 17130 }, { "epoch": 0.3996385998659322, "grad_norm": 1.5101157426834106, "learning_rate": 8.668469812845862e-07, "loss": 0.1407, "step": 17140 }, { "epoch": 0.39987176124275015, "grad_norm": 1.3094860315322876, "learning_rate": 8.667692594665174e-07, "loss": 0.1257, "step": 17150 }, { "epoch": 0.4001049226195681, "grad_norm": 2.7318003177642822, "learning_rate": 8.666915376484486e-07, "loss": 0.1278, "step": 17160 }, { "epoch": 0.400338083996386, "grad_norm": 1.61189603805542, "learning_rate": 8.666138158303799e-07, "loss": 0.1331, "step": 17170 }, { "epoch": 0.40057124537320393, "grad_norm": 2.7682242393493652, "learning_rate": 8.665360940123111e-07, "loss": 0.1223, "step": 17180 }, { "epoch": 
0.4008044067500219, "grad_norm": 1.7517813444137573, "learning_rate": 8.664583721942423e-07, "loss": 0.1367, "step": 17190 }, { "epoch": 0.40103756812683977, "grad_norm": 1.6385728120803833, "learning_rate": 8.663806503761736e-07, "loss": 0.1245, "step": 17200 }, { "epoch": 0.4012707295036577, "grad_norm": 1.45878267288208, "learning_rate": 8.663029285581048e-07, "loss": 0.147, "step": 17210 }, { "epoch": 0.40150389088047567, "grad_norm": 1.7192590236663818, "learning_rate": 8.662252067400361e-07, "loss": 0.1436, "step": 17220 }, { "epoch": 0.40173705225729356, "grad_norm": 1.4733644723892212, "learning_rate": 8.661474849219672e-07, "loss": 0.13, "step": 17230 }, { "epoch": 0.4019702136341115, "grad_norm": 1.7280056476593018, "learning_rate": 8.660697631038985e-07, "loss": 0.1183, "step": 17240 }, { "epoch": 0.40220337501092945, "grad_norm": 1.7128546237945557, "learning_rate": 8.659920412858297e-07, "loss": 0.1276, "step": 17250 }, { "epoch": 0.40243653638774735, "grad_norm": 4.483219623565674, "learning_rate": 8.659143194677609e-07, "loss": 0.1446, "step": 17260 }, { "epoch": 0.4026696977645653, "grad_norm": 1.5383543968200684, "learning_rate": 8.658365976496922e-07, "loss": 0.1328, "step": 17270 }, { "epoch": 0.40290285914138324, "grad_norm": 2.427729368209839, "learning_rate": 8.657588758316234e-07, "loss": 0.1164, "step": 17280 }, { "epoch": 0.40313602051820113, "grad_norm": 1.6254650354385376, "learning_rate": 8.656811540135547e-07, "loss": 0.1319, "step": 17290 }, { "epoch": 0.4033691818950191, "grad_norm": 2.939563035964966, "learning_rate": 8.65603432195486e-07, "loss": 0.1254, "step": 17300 }, { "epoch": 0.403602343271837, "grad_norm": 1.9006409645080566, "learning_rate": 8.65525710377417e-07, "loss": 0.1286, "step": 17310 }, { "epoch": 0.403835504648655, "grad_norm": 2.360701084136963, "learning_rate": 8.654479885593483e-07, "loss": 0.1279, "step": 17320 }, { "epoch": 0.40406866602547287, "grad_norm": 1.3370870351791382, "learning_rate": 
8.653702667412795e-07, "loss": 0.1196, "step": 17330 }, { "epoch": 0.4043018274022908, "grad_norm": 1.6374059915542603, "learning_rate": 8.652925449232108e-07, "loss": 0.1377, "step": 17340 }, { "epoch": 0.40453498877910876, "grad_norm": 1.547142744064331, "learning_rate": 8.652148231051421e-07, "loss": 0.1458, "step": 17350 }, { "epoch": 0.40476815015592665, "grad_norm": 1.4217668771743774, "learning_rate": 8.651371012870733e-07, "loss": 0.1361, "step": 17360 }, { "epoch": 0.4050013115327446, "grad_norm": 3.2540535926818848, "learning_rate": 8.650593794690045e-07, "loss": 0.1326, "step": 17370 }, { "epoch": 0.40523447290956255, "grad_norm": 1.8096650838851929, "learning_rate": 8.649816576509357e-07, "loss": 0.1324, "step": 17380 }, { "epoch": 0.40546763428638044, "grad_norm": 1.9057236909866333, "learning_rate": 8.649039358328669e-07, "loss": 0.1294, "step": 17390 }, { "epoch": 0.4057007956631984, "grad_norm": 2.0553131103515625, "learning_rate": 8.648262140147982e-07, "loss": 0.1254, "step": 17400 }, { "epoch": 0.40593395704001634, "grad_norm": 1.5503429174423218, "learning_rate": 8.647484921967294e-07, "loss": 0.1323, "step": 17410 }, { "epoch": 0.4061671184168342, "grad_norm": 2.6956262588500977, "learning_rate": 8.646707703786607e-07, "loss": 0.1399, "step": 17420 }, { "epoch": 0.4064002797936522, "grad_norm": 3.9386954307556152, "learning_rate": 8.645930485605918e-07, "loss": 0.1278, "step": 17430 }, { "epoch": 0.4066334411704701, "grad_norm": 2.2349462509155273, "learning_rate": 8.645153267425231e-07, "loss": 0.1341, "step": 17440 }, { "epoch": 0.406866602547288, "grad_norm": 3.632215738296509, "learning_rate": 8.644376049244544e-07, "loss": 0.1354, "step": 17450 }, { "epoch": 0.40709976392410596, "grad_norm": 2.7468621730804443, "learning_rate": 8.643598831063856e-07, "loss": 0.1329, "step": 17460 }, { "epoch": 0.4073329253009239, "grad_norm": 5.095355987548828, "learning_rate": 8.642821612883168e-07, "loss": 0.1341, "step": 17470 }, { "epoch": 
0.40756608667774186, "grad_norm": 1.7379709482192993, "learning_rate": 8.642044394702481e-07, "loss": 0.1211, "step": 17480 }, { "epoch": 0.40779924805455975, "grad_norm": 1.4352885484695435, "learning_rate": 8.641267176521792e-07, "loss": 0.1268, "step": 17490 }, { "epoch": 0.4080324094313777, "grad_norm": 1.853758454322815, "learning_rate": 8.640489958341105e-07, "loss": 0.1225, "step": 17500 }, { "epoch": 0.40826557080819564, "grad_norm": 2.2397379875183105, "learning_rate": 8.639712740160417e-07, "loss": 0.1313, "step": 17510 }, { "epoch": 0.40849873218501354, "grad_norm": 1.6583410501480103, "learning_rate": 8.63893552197973e-07, "loss": 0.1094, "step": 17520 }, { "epoch": 0.4087318935618315, "grad_norm": 1.6314542293548584, "learning_rate": 8.638158303799043e-07, "loss": 0.1435, "step": 17530 }, { "epoch": 0.40896505493864943, "grad_norm": 3.04307222366333, "learning_rate": 8.637381085618355e-07, "loss": 0.136, "step": 17540 }, { "epoch": 0.4091982163154673, "grad_norm": 2.6437699794769287, "learning_rate": 8.636603867437666e-07, "loss": 0.1374, "step": 17550 }, { "epoch": 0.40943137769228527, "grad_norm": 2.3659064769744873, "learning_rate": 8.635826649256978e-07, "loss": 0.1386, "step": 17560 }, { "epoch": 0.4096645390691032, "grad_norm": 1.6604009866714478, "learning_rate": 8.635049431076291e-07, "loss": 0.134, "step": 17570 }, { "epoch": 0.4098977004459211, "grad_norm": 1.7262535095214844, "learning_rate": 8.634272212895604e-07, "loss": 0.1311, "step": 17580 }, { "epoch": 0.41013086182273906, "grad_norm": 2.3882787227630615, "learning_rate": 8.633494994714916e-07, "loss": 0.1241, "step": 17590 }, { "epoch": 0.410364023199557, "grad_norm": 3.411562919616699, "learning_rate": 8.632717776534229e-07, "loss": 0.141, "step": 17600 }, { "epoch": 0.4105971845763749, "grad_norm": 1.9570475816726685, "learning_rate": 8.631940558353541e-07, "loss": 0.1307, "step": 17610 }, { "epoch": 0.41083034595319284, "grad_norm": 1.6297876834869385, "learning_rate": 
8.631163340172853e-07, "loss": 0.1298, "step": 17620 }, { "epoch": 0.4110635073300108, "grad_norm": 1.4002907276153564, "learning_rate": 8.630386121992165e-07, "loss": 0.1292, "step": 17630 }, { "epoch": 0.41129666870682874, "grad_norm": 2.6945598125457764, "learning_rate": 8.629608903811477e-07, "loss": 0.1244, "step": 17640 }, { "epoch": 0.41152983008364663, "grad_norm": 1.7311735153198242, "learning_rate": 8.62883168563079e-07, "loss": 0.1357, "step": 17650 }, { "epoch": 0.4117629914604646, "grad_norm": 1.986706018447876, "learning_rate": 8.628054467450102e-07, "loss": 0.1268, "step": 17660 }, { "epoch": 0.4119961528372825, "grad_norm": 1.5399383306503296, "learning_rate": 8.627277249269415e-07, "loss": 0.1255, "step": 17670 }, { "epoch": 0.4122293142141004, "grad_norm": 1.223272442817688, "learning_rate": 8.626500031088727e-07, "loss": 0.1306, "step": 17680 }, { "epoch": 0.41246247559091836, "grad_norm": 2.2904839515686035, "learning_rate": 8.625722812908039e-07, "loss": 0.1265, "step": 17690 }, { "epoch": 0.4126956369677363, "grad_norm": 1.7779359817504883, "learning_rate": 8.624945594727352e-07, "loss": 0.1304, "step": 17700 }, { "epoch": 0.4129287983445542, "grad_norm": 1.3021352291107178, "learning_rate": 8.624168376546663e-07, "loss": 0.1321, "step": 17710 }, { "epoch": 0.41316195972137215, "grad_norm": 1.3626550436019897, "learning_rate": 8.623391158365976e-07, "loss": 0.1303, "step": 17720 }, { "epoch": 0.4133951210981901, "grad_norm": 3.495695114135742, "learning_rate": 8.622613940185289e-07, "loss": 0.1309, "step": 17730 }, { "epoch": 0.413628282475008, "grad_norm": 1.5364856719970703, "learning_rate": 8.6218367220046e-07, "loss": 0.1347, "step": 17740 }, { "epoch": 0.41386144385182594, "grad_norm": 1.448160171508789, "learning_rate": 8.621059503823913e-07, "loss": 0.1384, "step": 17750 }, { "epoch": 0.4140946052286439, "grad_norm": 1.437875509262085, "learning_rate": 8.620282285643225e-07, "loss": 0.1467, "step": 17760 }, { "epoch": 
0.4143277666054618, "grad_norm": 1.151649832725525, "learning_rate": 8.619505067462538e-07, "loss": 0.116, "step": 17770 }, { "epoch": 0.4145609279822797, "grad_norm": 1.2284289598464966, "learning_rate": 8.618727849281851e-07, "loss": 0.133, "step": 17780 }, { "epoch": 0.4147940893590977, "grad_norm": 2.7674291133880615, "learning_rate": 8.617950631101162e-07, "loss": 0.1348, "step": 17790 }, { "epoch": 0.4150272507359156, "grad_norm": 2.063771963119507, "learning_rate": 8.617173412920474e-07, "loss": 0.151, "step": 17800 }, { "epoch": 0.4152604121127335, "grad_norm": 2.7487452030181885, "learning_rate": 8.616396194739786e-07, "loss": 0.1126, "step": 17810 }, { "epoch": 0.41549357348955146, "grad_norm": 4.885368824005127, "learning_rate": 8.615618976559099e-07, "loss": 0.1091, "step": 17820 }, { "epoch": 0.4157267348663694, "grad_norm": 1.6920982599258423, "learning_rate": 8.614841758378412e-07, "loss": 0.144, "step": 17830 }, { "epoch": 0.4159598962431873, "grad_norm": 2.0162248611450195, "learning_rate": 8.614064540197724e-07, "loss": 0.121, "step": 17840 }, { "epoch": 0.41619305762000525, "grad_norm": 1.4927695989608765, "learning_rate": 8.613287322017037e-07, "loss": 0.1285, "step": 17850 }, { "epoch": 0.4164262189968232, "grad_norm": 1.6588174104690552, "learning_rate": 8.612510103836349e-07, "loss": 0.1367, "step": 17860 }, { "epoch": 0.4166593803736411, "grad_norm": 1.2086811065673828, "learning_rate": 8.61173288565566e-07, "loss": 0.1219, "step": 17870 }, { "epoch": 0.41689254175045903, "grad_norm": 1.355463981628418, "learning_rate": 8.610955667474973e-07, "loss": 0.1118, "step": 17880 }, { "epoch": 0.417125703127277, "grad_norm": 1.241012454032898, "learning_rate": 8.610178449294285e-07, "loss": 0.1291, "step": 17890 }, { "epoch": 0.4173588645040949, "grad_norm": 1.9932793378829956, "learning_rate": 8.609401231113598e-07, "loss": 0.127, "step": 17900 }, { "epoch": 0.4175920258809128, "grad_norm": 3.8567230701446533, "learning_rate": 
8.608624012932911e-07, "loss": 0.127, "step": 17910 }, { "epoch": 0.41782518725773077, "grad_norm": 1.9146982431411743, "learning_rate": 8.607846794752222e-07, "loss": 0.1338, "step": 17920 }, { "epoch": 0.41805834863454866, "grad_norm": 1.4641871452331543, "learning_rate": 8.607069576571535e-07, "loss": 0.1361, "step": 17930 }, { "epoch": 0.4182915100113666, "grad_norm": 1.4671638011932373, "learning_rate": 8.606292358390847e-07, "loss": 0.1319, "step": 17940 }, { "epoch": 0.41852467138818455, "grad_norm": 1.755160927772522, "learning_rate": 8.605515140210159e-07, "loss": 0.1307, "step": 17950 }, { "epoch": 0.4187578327650025, "grad_norm": 2.489193916320801, "learning_rate": 8.604737922029472e-07, "loss": 0.1298, "step": 17960 }, { "epoch": 0.4189909941418204, "grad_norm": 1.5646861791610718, "learning_rate": 8.603960703848784e-07, "loss": 0.1266, "step": 17970 }, { "epoch": 0.41922415551863834, "grad_norm": 2.0463786125183105, "learning_rate": 8.603183485668097e-07, "loss": 0.1387, "step": 17980 }, { "epoch": 0.4194573168954563, "grad_norm": 1.4826207160949707, "learning_rate": 8.602406267487408e-07, "loss": 0.1361, "step": 17990 }, { "epoch": 0.4196904782722742, "grad_norm": 1.2928611040115356, "learning_rate": 8.601629049306721e-07, "loss": 0.1319, "step": 18000 }, { "epoch": 0.41992363964909213, "grad_norm": 1.5832942724227905, "learning_rate": 8.600851831126034e-07, "loss": 0.1272, "step": 18010 }, { "epoch": 0.4201568010259101, "grad_norm": 1.491339087486267, "learning_rate": 8.600074612945346e-07, "loss": 0.1465, "step": 18020 }, { "epoch": 0.42038996240272797, "grad_norm": 1.8291674852371216, "learning_rate": 8.599297394764658e-07, "loss": 0.1355, "step": 18030 }, { "epoch": 0.4206231237795459, "grad_norm": 1.3948290348052979, "learning_rate": 8.59852017658397e-07, "loss": 0.1458, "step": 18040 }, { "epoch": 0.42085628515636386, "grad_norm": 1.7375106811523438, "learning_rate": 8.597742958403282e-07, "loss": 0.1262, "step": 18050 }, { "epoch": 
0.42108944653318175, "grad_norm": 1.3703950643539429, "learning_rate": 8.596965740222595e-07, "loss": 0.1377, "step": 18060 }, { "epoch": 0.4213226079099997, "grad_norm": 1.710795521736145, "learning_rate": 8.596188522041907e-07, "loss": 0.1378, "step": 18070 }, { "epoch": 0.42155576928681765, "grad_norm": 1.6188710927963257, "learning_rate": 8.59541130386122e-07, "loss": 0.1215, "step": 18080 }, { "epoch": 0.42178893066363554, "grad_norm": 2.018289566040039, "learning_rate": 8.594634085680532e-07, "loss": 0.1204, "step": 18090 }, { "epoch": 0.4220220920404535, "grad_norm": 1.2991223335266113, "learning_rate": 8.593856867499845e-07, "loss": 0.1303, "step": 18100 }, { "epoch": 0.42225525341727144, "grad_norm": 1.2755961418151855, "learning_rate": 8.593079649319156e-07, "loss": 0.1411, "step": 18110 }, { "epoch": 0.4224884147940894, "grad_norm": 3.0583136081695557, "learning_rate": 8.592302431138468e-07, "loss": 0.1397, "step": 18120 }, { "epoch": 0.4227215761709073, "grad_norm": 4.567086696624756, "learning_rate": 8.591525212957781e-07, "loss": 0.1276, "step": 18130 }, { "epoch": 0.4229547375477252, "grad_norm": 2.7276344299316406, "learning_rate": 8.590747994777093e-07, "loss": 0.1364, "step": 18140 }, { "epoch": 0.42318789892454317, "grad_norm": 2.7520029544830322, "learning_rate": 8.589970776596406e-07, "loss": 0.136, "step": 18150 }, { "epoch": 0.42342106030136106, "grad_norm": 2.3112573623657227, "learning_rate": 8.589193558415719e-07, "loss": 0.1362, "step": 18160 }, { "epoch": 0.423654221678179, "grad_norm": 1.5465625524520874, "learning_rate": 8.58841634023503e-07, "loss": 0.1271, "step": 18170 }, { "epoch": 0.42388738305499696, "grad_norm": 2.3328840732574463, "learning_rate": 8.587639122054343e-07, "loss": 0.1319, "step": 18180 }, { "epoch": 0.42412054443181485, "grad_norm": 1.3270093202590942, "learning_rate": 8.586861903873654e-07, "loss": 0.1273, "step": 18190 }, { "epoch": 0.4243537058086328, "grad_norm": 1.3993785381317139, "learning_rate": 
8.586084685692967e-07, "loss": 0.1305, "step": 18200 }, { "epoch": 0.42458686718545074, "grad_norm": 1.2716072797775269, "learning_rate": 8.58530746751228e-07, "loss": 0.1343, "step": 18210 }, { "epoch": 0.42482002856226864, "grad_norm": 1.3889676332473755, "learning_rate": 8.584530249331592e-07, "loss": 0.1343, "step": 18220 }, { "epoch": 0.4250531899390866, "grad_norm": 3.974015235900879, "learning_rate": 8.583753031150904e-07, "loss": 0.1264, "step": 18230 }, { "epoch": 0.42528635131590453, "grad_norm": 1.9759587049484253, "learning_rate": 8.582975812970216e-07, "loss": 0.125, "step": 18240 }, { "epoch": 0.4255195126927224, "grad_norm": 1.5726109743118286, "learning_rate": 8.582198594789529e-07, "loss": 0.1302, "step": 18250 }, { "epoch": 0.42575267406954037, "grad_norm": 1.667640209197998, "learning_rate": 8.581421376608842e-07, "loss": 0.1339, "step": 18260 }, { "epoch": 0.4259858354463583, "grad_norm": 1.7960747480392456, "learning_rate": 8.580644158428153e-07, "loss": 0.1473, "step": 18270 }, { "epoch": 0.42621899682317627, "grad_norm": 1.5532716512680054, "learning_rate": 8.579866940247466e-07, "loss": 0.1358, "step": 18280 }, { "epoch": 0.42645215819999416, "grad_norm": 3.488325834274292, "learning_rate": 8.579089722066777e-07, "loss": 0.1341, "step": 18290 }, { "epoch": 0.4266853195768121, "grad_norm": 4.148684978485107, "learning_rate": 8.57831250388609e-07, "loss": 0.1451, "step": 18300 }, { "epoch": 0.42691848095363005, "grad_norm": 1.2585899829864502, "learning_rate": 8.577535285705403e-07, "loss": 0.137, "step": 18310 }, { "epoch": 0.42715164233044794, "grad_norm": 1.4854521751403809, "learning_rate": 8.576758067524715e-07, "loss": 0.1289, "step": 18320 }, { "epoch": 0.4273848037072659, "grad_norm": 1.9856929779052734, "learning_rate": 8.575980849344028e-07, "loss": 0.1254, "step": 18330 }, { "epoch": 0.42761796508408384, "grad_norm": 1.615125298500061, "learning_rate": 8.575203631163341e-07, "loss": 0.1352, "step": 18340 }, { "epoch": 
0.42785112646090173, "grad_norm": 1.8688220977783203, "learning_rate": 8.574426412982651e-07, "loss": 0.1165, "step": 18350 }, { "epoch": 0.4280842878377197, "grad_norm": 2.4388444423675537, "learning_rate": 8.573649194801964e-07, "loss": 0.1494, "step": 18360 }, { "epoch": 0.4283174492145376, "grad_norm": 2.674532651901245, "learning_rate": 8.572871976621276e-07, "loss": 0.1311, "step": 18370 }, { "epoch": 0.4285506105913555, "grad_norm": 1.9555230140686035, "learning_rate": 8.572094758440589e-07, "loss": 0.1435, "step": 18380 }, { "epoch": 0.42878377196817347, "grad_norm": 2.442540407180786, "learning_rate": 8.571317540259902e-07, "loss": 0.123, "step": 18390 }, { "epoch": 0.4290169333449914, "grad_norm": 2.807219982147217, "learning_rate": 8.570540322079214e-07, "loss": 0.1362, "step": 18400 }, { "epoch": 0.4292500947218093, "grad_norm": 3.6621391773223877, "learning_rate": 8.569763103898527e-07, "loss": 0.1448, "step": 18410 }, { "epoch": 0.42948325609862725, "grad_norm": 1.7344331741333008, "learning_rate": 8.568985885717838e-07, "loss": 0.1516, "step": 18420 }, { "epoch": 0.4297164174754452, "grad_norm": 2.8541572093963623, "learning_rate": 8.56820866753715e-07, "loss": 0.1315, "step": 18430 }, { "epoch": 0.42994957885226315, "grad_norm": 3.2792162895202637, "learning_rate": 8.567431449356463e-07, "loss": 0.1403, "step": 18440 }, { "epoch": 0.43018274022908104, "grad_norm": 1.818741798400879, "learning_rate": 8.566654231175775e-07, "loss": 0.1241, "step": 18450 }, { "epoch": 0.430415901605899, "grad_norm": 1.3195390701293945, "learning_rate": 8.565877012995088e-07, "loss": 0.1379, "step": 18460 }, { "epoch": 0.43064906298271693, "grad_norm": 1.315929651260376, "learning_rate": 8.5650997948144e-07, "loss": 0.1248, "step": 18470 }, { "epoch": 0.4308822243595348, "grad_norm": 4.471781253814697, "learning_rate": 8.564322576633712e-07, "loss": 0.1289, "step": 18480 }, { "epoch": 0.4311153857363528, "grad_norm": 1.4491485357284546, "learning_rate": 
8.563545358453025e-07, "loss": 0.1244, "step": 18490 }, { "epoch": 0.4313485471131707, "grad_norm": 4.128576278686523, "learning_rate": 8.562768140272337e-07, "loss": 0.1373, "step": 18500 }, { "epoch": 0.4315817084899886, "grad_norm": 1.3633031845092773, "learning_rate": 8.561990922091649e-07, "loss": 0.1305, "step": 18510 }, { "epoch": 0.43181486986680656, "grad_norm": 1.3577522039413452, "learning_rate": 8.561213703910961e-07, "loss": 0.1328, "step": 18520 }, { "epoch": 0.4320480312436245, "grad_norm": 1.357147455215454, "learning_rate": 8.560436485730274e-07, "loss": 0.131, "step": 18530 }, { "epoch": 0.4322811926204424, "grad_norm": 1.7992167472839355, "learning_rate": 8.559659267549586e-07, "loss": 0.1393, "step": 18540 }, { "epoch": 0.43251435399726035, "grad_norm": 1.282057523727417, "learning_rate": 8.558882049368898e-07, "loss": 0.1351, "step": 18550 }, { "epoch": 0.4327475153740783, "grad_norm": 1.3574445247650146, "learning_rate": 8.558104831188211e-07, "loss": 0.1226, "step": 18560 }, { "epoch": 0.4329806767508962, "grad_norm": 2.308297872543335, "learning_rate": 8.557327613007523e-07, "loss": 0.1333, "step": 18570 }, { "epoch": 0.43321383812771413, "grad_norm": 2.0571064949035645, "learning_rate": 8.556550394826836e-07, "loss": 0.1259, "step": 18580 }, { "epoch": 0.4334469995045321, "grad_norm": 3.2217843532562256, "learning_rate": 8.555773176646148e-07, "loss": 0.1292, "step": 18590 }, { "epoch": 0.43368016088135003, "grad_norm": 1.9014643430709839, "learning_rate": 8.554995958465459e-07, "loss": 0.1404, "step": 18600 }, { "epoch": 0.4339133222581679, "grad_norm": 1.411009669303894, "learning_rate": 8.554218740284772e-07, "loss": 0.1444, "step": 18610 }, { "epoch": 0.43414648363498587, "grad_norm": 2.5837886333465576, "learning_rate": 8.553441522104084e-07, "loss": 0.1284, "step": 18620 }, { "epoch": 0.4343796450118038, "grad_norm": 2.7479212284088135, "learning_rate": 8.552664303923397e-07, "loss": 0.1253, "step": 18630 }, { "epoch": 
0.4346128063886217, "grad_norm": 3.2610599994659424, "learning_rate": 8.55188708574271e-07, "loss": 0.1382, "step": 18640 }, { "epoch": 0.43484596776543966, "grad_norm": 1.642403244972229, "learning_rate": 8.551109867562022e-07, "loss": 0.1444, "step": 18650 }, { "epoch": 0.4350791291422576, "grad_norm": 2.2435290813446045, "learning_rate": 8.550332649381334e-07, "loss": 0.1248, "step": 18660 }, { "epoch": 0.4353122905190755, "grad_norm": 2.948007345199585, "learning_rate": 8.549555431200645e-07, "loss": 0.1313, "step": 18670 }, { "epoch": 0.43554545189589344, "grad_norm": 2.006873846054077, "learning_rate": 8.548778213019958e-07, "loss": 0.1368, "step": 18680 }, { "epoch": 0.4357786132727114, "grad_norm": 1.4095194339752197, "learning_rate": 8.548000994839271e-07, "loss": 0.1323, "step": 18690 }, { "epoch": 0.4360117746495293, "grad_norm": 2.225841999053955, "learning_rate": 8.547223776658583e-07, "loss": 0.1314, "step": 18700 }, { "epoch": 0.43624493602634723, "grad_norm": 2.116290330886841, "learning_rate": 8.546446558477896e-07, "loss": 0.1311, "step": 18710 }, { "epoch": 0.4364780974031652, "grad_norm": 1.8982166051864624, "learning_rate": 8.545669340297207e-07, "loss": 0.1394, "step": 18720 }, { "epoch": 0.43671125877998307, "grad_norm": 2.9885261058807373, "learning_rate": 8.54489212211652e-07, "loss": 0.1358, "step": 18730 }, { "epoch": 0.436944420156801, "grad_norm": 1.603020429611206, "learning_rate": 8.544114903935833e-07, "loss": 0.1325, "step": 18740 }, { "epoch": 0.43717758153361896, "grad_norm": 2.6883060932159424, "learning_rate": 8.543337685755145e-07, "loss": 0.1412, "step": 18750 }, { "epoch": 0.4374107429104369, "grad_norm": 2.487018346786499, "learning_rate": 8.542560467574457e-07, "loss": 0.1336, "step": 18760 }, { "epoch": 0.4376439042872548, "grad_norm": 1.9114097356796265, "learning_rate": 8.54178324939377e-07, "loss": 0.1352, "step": 18770 }, { "epoch": 0.43787706566407275, "grad_norm": 1.8074994087219238, "learning_rate": 
8.541006031213081e-07, "loss": 0.1318, "step": 18780 }, { "epoch": 0.4381102270408907, "grad_norm": 3.2768218517303467, "learning_rate": 8.540228813032394e-07, "loss": 0.1263, "step": 18790 }, { "epoch": 0.4383433884177086, "grad_norm": 1.408918857574463, "learning_rate": 8.539451594851706e-07, "loss": 0.1263, "step": 18800 }, { "epoch": 0.43857654979452654, "grad_norm": 1.4012835025787354, "learning_rate": 8.538674376671019e-07, "loss": 0.1174, "step": 18810 }, { "epoch": 0.4388097111713445, "grad_norm": 2.022869110107422, "learning_rate": 8.537897158490332e-07, "loss": 0.1373, "step": 18820 }, { "epoch": 0.4390428725481624, "grad_norm": 1.9078575372695923, "learning_rate": 8.537119940309644e-07, "loss": 0.1274, "step": 18830 }, { "epoch": 0.4392760339249803, "grad_norm": 5.110961437225342, "learning_rate": 8.536342722128955e-07, "loss": 0.1306, "step": 18840 }, { "epoch": 0.43950919530179827, "grad_norm": 1.9113755226135254, "learning_rate": 8.535565503948267e-07, "loss": 0.1355, "step": 18850 }, { "epoch": 0.43974235667861616, "grad_norm": 2.0154521465301514, "learning_rate": 8.53478828576758e-07, "loss": 0.1258, "step": 18860 }, { "epoch": 0.4399755180554341, "grad_norm": 2.802999973297119, "learning_rate": 8.534011067586893e-07, "loss": 0.1343, "step": 18870 }, { "epoch": 0.44020867943225206, "grad_norm": 1.354711890220642, "learning_rate": 8.533233849406205e-07, "loss": 0.1388, "step": 18880 }, { "epoch": 0.44044184080906995, "grad_norm": 1.4652565717697144, "learning_rate": 8.532456631225518e-07, "loss": 0.1257, "step": 18890 }, { "epoch": 0.4406750021858879, "grad_norm": 1.312787652015686, "learning_rate": 8.53167941304483e-07, "loss": 0.1325, "step": 18900 }, { "epoch": 0.44090816356270585, "grad_norm": 2.1601479053497314, "learning_rate": 8.530902194864142e-07, "loss": 0.1356, "step": 18910 }, { "epoch": 0.4411413249395238, "grad_norm": 1.9704049825668335, "learning_rate": 8.530124976683454e-07, "loss": 0.124, "step": 18920 }, { "epoch": 
0.4413744863163417, "grad_norm": 1.234208345413208, "learning_rate": 8.529347758502766e-07, "loss": 0.1333, "step": 18930 }, { "epoch": 0.44160764769315963, "grad_norm": 1.6503177881240845, "learning_rate": 8.528570540322079e-07, "loss": 0.1397, "step": 18940 }, { "epoch": 0.4418408090699776, "grad_norm": 1.8992083072662354, "learning_rate": 8.527793322141391e-07, "loss": 0.1416, "step": 18950 }, { "epoch": 0.4420739704467955, "grad_norm": 1.7053581476211548, "learning_rate": 8.527016103960704e-07, "loss": 0.1198, "step": 18960 }, { "epoch": 0.4423071318236134, "grad_norm": 1.5327047109603882, "learning_rate": 8.526238885780016e-07, "loss": 0.1367, "step": 18970 }, { "epoch": 0.44254029320043137, "grad_norm": 1.3611706495285034, "learning_rate": 8.525461667599328e-07, "loss": 0.1314, "step": 18980 }, { "epoch": 0.44277345457724926, "grad_norm": 1.3309396505355835, "learning_rate": 8.524684449418641e-07, "loss": 0.1337, "step": 18990 }, { "epoch": 0.4430066159540672, "grad_norm": 1.2236382961273193, "learning_rate": 8.523907231237952e-07, "loss": 0.1305, "step": 19000 }, { "epoch": 0.44323977733088515, "grad_norm": 1.73903226852417, "learning_rate": 8.523130013057265e-07, "loss": 0.1346, "step": 19010 }, { "epoch": 0.44347293870770305, "grad_norm": 1.9282547235488892, "learning_rate": 8.522352794876578e-07, "loss": 0.1359, "step": 19020 }, { "epoch": 0.443706100084521, "grad_norm": 1.0809561014175415, "learning_rate": 8.521575576695889e-07, "loss": 0.1212, "step": 19030 }, { "epoch": 0.44393926146133894, "grad_norm": 2.2215445041656494, "learning_rate": 8.520798358515202e-07, "loss": 0.1238, "step": 19040 }, { "epoch": 0.44417242283815683, "grad_norm": 1.2387917041778564, "learning_rate": 8.520021140334514e-07, "loss": 0.1242, "step": 19050 }, { "epoch": 0.4444055842149748, "grad_norm": 2.547684907913208, "learning_rate": 8.519243922153827e-07, "loss": 0.1312, "step": 19060 }, { "epoch": 0.44463874559179273, "grad_norm": 1.2771878242492676, "learning_rate": 
8.51846670397314e-07, "loss": 0.1323, "step": 19070 }, { "epoch": 0.4448719069686107, "grad_norm": 2.582022190093994, "learning_rate": 8.517689485792451e-07, "loss": 0.1322, "step": 19080 }, { "epoch": 0.44510506834542857, "grad_norm": 1.867835521697998, "learning_rate": 8.516912267611763e-07, "loss": 0.1271, "step": 19090 }, { "epoch": 0.4453382297222465, "grad_norm": 1.5403845310211182, "learning_rate": 8.516135049431075e-07, "loss": 0.1251, "step": 19100 }, { "epoch": 0.44557139109906446, "grad_norm": 1.4340780973434448, "learning_rate": 8.515357831250388e-07, "loss": 0.1252, "step": 19110 }, { "epoch": 0.44580455247588235, "grad_norm": 1.7977269887924194, "learning_rate": 8.514580613069701e-07, "loss": 0.1332, "step": 19120 }, { "epoch": 0.4460377138527003, "grad_norm": 2.2701902389526367, "learning_rate": 8.513803394889013e-07, "loss": 0.1256, "step": 19130 }, { "epoch": 0.44627087522951825, "grad_norm": 1.435965657234192, "learning_rate": 8.513026176708326e-07, "loss": 0.1169, "step": 19140 }, { "epoch": 0.44650403660633614, "grad_norm": 2.3719029426574707, "learning_rate": 8.512248958527638e-07, "loss": 0.1301, "step": 19150 }, { "epoch": 0.4467371979831541, "grad_norm": 2.300137996673584, "learning_rate": 8.511471740346949e-07, "loss": 0.1272, "step": 19160 }, { "epoch": 0.44697035935997204, "grad_norm": 2.273984670639038, "learning_rate": 8.510694522166262e-07, "loss": 0.1423, "step": 19170 }, { "epoch": 0.44720352073678993, "grad_norm": 1.4175790548324585, "learning_rate": 8.509917303985574e-07, "loss": 0.1284, "step": 19180 }, { "epoch": 0.4474366821136079, "grad_norm": 1.8984211683273315, "learning_rate": 8.509140085804887e-07, "loss": 0.1294, "step": 19190 }, { "epoch": 0.4476698434904258, "grad_norm": 3.11710262298584, "learning_rate": 8.5083628676242e-07, "loss": 0.1343, "step": 19200 }, { "epoch": 0.4479030048672437, "grad_norm": 2.0327930450439453, "learning_rate": 8.507585649443511e-07, "loss": 0.1335, "step": 19210 }, { "epoch": 
0.44813616624406166, "grad_norm": 1.5054271221160889, "learning_rate": 8.506808431262824e-07, "loss": 0.1328, "step": 19220 }, { "epoch": 0.4483693276208796, "grad_norm": 1.691853642463684, "learning_rate": 8.506031213082136e-07, "loss": 0.1269, "step": 19230 }, { "epoch": 0.44860248899769756, "grad_norm": 1.1175729036331177, "learning_rate": 8.505253994901448e-07, "loss": 0.1159, "step": 19240 }, { "epoch": 0.44883565037451545, "grad_norm": 2.4875526428222656, "learning_rate": 8.504476776720761e-07, "loss": 0.1231, "step": 19250 }, { "epoch": 0.4490688117513334, "grad_norm": 2.2317841053009033, "learning_rate": 8.503699558540073e-07, "loss": 0.1363, "step": 19260 }, { "epoch": 0.44930197312815134, "grad_norm": 1.8003747463226318, "learning_rate": 8.502922340359386e-07, "loss": 0.134, "step": 19270 }, { "epoch": 0.44953513450496924, "grad_norm": 2.4716427326202393, "learning_rate": 8.502145122178697e-07, "loss": 0.1293, "step": 19280 }, { "epoch": 0.4497682958817872, "grad_norm": 1.2379119396209717, "learning_rate": 8.50136790399801e-07, "loss": 0.1349, "step": 19290 }, { "epoch": 0.45000145725860513, "grad_norm": 1.6863573789596558, "learning_rate": 8.500590685817323e-07, "loss": 0.1209, "step": 19300 }, { "epoch": 0.450234618635423, "grad_norm": 2.6782515048980713, "learning_rate": 8.499813467636635e-07, "loss": 0.1394, "step": 19310 }, { "epoch": 0.45046778001224097, "grad_norm": 3.3792214393615723, "learning_rate": 8.499036249455947e-07, "loss": 0.1333, "step": 19320 }, { "epoch": 0.4507009413890589, "grad_norm": 1.4323410987854004, "learning_rate": 8.498259031275259e-07, "loss": 0.1212, "step": 19330 }, { "epoch": 0.4509341027658768, "grad_norm": 1.460108757019043, "learning_rate": 8.497481813094571e-07, "loss": 0.1312, "step": 19340 }, { "epoch": 0.45116726414269476, "grad_norm": 2.275115489959717, "learning_rate": 8.496704594913884e-07, "loss": 0.1227, "step": 19350 }, { "epoch": 0.4514004255195127, "grad_norm": 1.3827472925186157, "learning_rate": 
8.495927376733196e-07, "loss": 0.1292, "step": 19360 }, { "epoch": 0.4516335868963306, "grad_norm": 1.6642682552337646, "learning_rate": 8.495150158552509e-07, "loss": 0.1297, "step": 19370 }, { "epoch": 0.45186674827314854, "grad_norm": 3.457702398300171, "learning_rate": 8.494372940371821e-07, "loss": 0.1333, "step": 19380 }, { "epoch": 0.4520999096499665, "grad_norm": 2.711045980453491, "learning_rate": 8.493595722191134e-07, "loss": 0.1335, "step": 19390 }, { "epoch": 0.45233307102678444, "grad_norm": 1.508978009223938, "learning_rate": 8.492818504010445e-07, "loss": 0.1285, "step": 19400 }, { "epoch": 0.45256623240360233, "grad_norm": 1.6787710189819336, "learning_rate": 8.492041285829757e-07, "loss": 0.1309, "step": 19410 }, { "epoch": 0.4527993937804203, "grad_norm": 1.7573025226593018, "learning_rate": 8.49126406764907e-07, "loss": 0.124, "step": 19420 }, { "epoch": 0.4530325551572382, "grad_norm": 1.253368854522705, "learning_rate": 8.490486849468382e-07, "loss": 0.1361, "step": 19430 }, { "epoch": 0.4532657165340561, "grad_norm": 2.6866743564605713, "learning_rate": 8.489709631287695e-07, "loss": 0.1266, "step": 19440 }, { "epoch": 0.45349887791087407, "grad_norm": 1.3412193059921265, "learning_rate": 8.488932413107008e-07, "loss": 0.1328, "step": 19450 }, { "epoch": 0.453732039287692, "grad_norm": 2.655066728591919, "learning_rate": 8.488155194926319e-07, "loss": 0.1352, "step": 19460 }, { "epoch": 0.4539652006645099, "grad_norm": 1.4389981031417847, "learning_rate": 8.487377976745632e-07, "loss": 0.1158, "step": 19470 }, { "epoch": 0.45419836204132785, "grad_norm": 1.9705294370651245, "learning_rate": 8.486600758564943e-07, "loss": 0.1244, "step": 19480 }, { "epoch": 0.4544315234181458, "grad_norm": 1.5982942581176758, "learning_rate": 8.485823540384256e-07, "loss": 0.1331, "step": 19490 }, { "epoch": 0.4546646847949637, "grad_norm": 4.73193883895874, "learning_rate": 8.485046322203569e-07, "loss": 0.128, "step": 19500 }, { "epoch": 0.45489784617178164, 
"grad_norm": 2.404348134994507, "learning_rate": 8.484269104022881e-07, "loss": 0.1385, "step": 19510 }, { "epoch": 0.4551310075485996, "grad_norm": 2.445657730102539, "learning_rate": 8.483491885842193e-07, "loss": 0.1187, "step": 19520 }, { "epoch": 0.4553641689254175, "grad_norm": 4.468086242675781, "learning_rate": 8.482714667661505e-07, "loss": 0.147, "step": 19530 }, { "epoch": 0.4555973303022354, "grad_norm": 1.3298039436340332, "learning_rate": 8.481937449480818e-07, "loss": 0.1217, "step": 19540 }, { "epoch": 0.4558304916790534, "grad_norm": 1.8743112087249756, "learning_rate": 8.481160231300131e-07, "loss": 0.1264, "step": 19550 }, { "epoch": 0.4560636530558713, "grad_norm": 2.04825496673584, "learning_rate": 8.480383013119442e-07, "loss": 0.1401, "step": 19560 }, { "epoch": 0.4562968144326892, "grad_norm": 1.079911231994629, "learning_rate": 8.479605794938755e-07, "loss": 0.1297, "step": 19570 }, { "epoch": 0.45652997580950716, "grad_norm": 1.9914913177490234, "learning_rate": 8.478828576758066e-07, "loss": 0.13, "step": 19580 }, { "epoch": 0.4567631371863251, "grad_norm": 1.222763180732727, "learning_rate": 8.478051358577379e-07, "loss": 0.1348, "step": 19590 }, { "epoch": 0.456996298563143, "grad_norm": 2.3687641620635986, "learning_rate": 8.477274140396692e-07, "loss": 0.1358, "step": 19600 }, { "epoch": 0.45722945993996095, "grad_norm": 1.399918794631958, "learning_rate": 8.476496922216004e-07, "loss": 0.1206, "step": 19610 }, { "epoch": 0.4574626213167789, "grad_norm": 2.731076717376709, "learning_rate": 8.475719704035317e-07, "loss": 0.131, "step": 19620 }, { "epoch": 0.4576957826935968, "grad_norm": 3.8922040462493896, "learning_rate": 8.47494248585463e-07, "loss": 0.1343, "step": 19630 }, { "epoch": 0.45792894407041473, "grad_norm": 2.2190985679626465, "learning_rate": 8.47416526767394e-07, "loss": 0.1459, "step": 19640 }, { "epoch": 0.4581621054472327, "grad_norm": 3.270641803741455, "learning_rate": 8.473388049493253e-07, "loss": 0.1444, 
"step": 19650 }, { "epoch": 0.4583952668240506, "grad_norm": 2.077671527862549, "learning_rate": 8.472610831312565e-07, "loss": 0.1305, "step": 19660 }, { "epoch": 0.4586284282008685, "grad_norm": 2.3069305419921875, "learning_rate": 8.471833613131878e-07, "loss": 0.1405, "step": 19670 }, { "epoch": 0.45886158957768647, "grad_norm": 2.0643279552459717, "learning_rate": 8.471056394951191e-07, "loss": 0.1337, "step": 19680 }, { "epoch": 0.45909475095450436, "grad_norm": 1.3620728254318237, "learning_rate": 8.470279176770503e-07, "loss": 0.1259, "step": 19690 }, { "epoch": 0.4593279123313223, "grad_norm": 1.5658994913101196, "learning_rate": 8.469501958589816e-07, "loss": 0.1289, "step": 19700 }, { "epoch": 0.45956107370814026, "grad_norm": 2.0320422649383545, "learning_rate": 8.468724740409127e-07, "loss": 0.1302, "step": 19710 }, { "epoch": 0.4597942350849582, "grad_norm": 1.7412046194076538, "learning_rate": 8.467947522228439e-07, "loss": 0.1157, "step": 19720 }, { "epoch": 0.4600273964617761, "grad_norm": 1.5165817737579346, "learning_rate": 8.467170304047752e-07, "loss": 0.1293, "step": 19730 }, { "epoch": 0.46026055783859404, "grad_norm": 1.6878763437271118, "learning_rate": 8.466393085867064e-07, "loss": 0.1249, "step": 19740 }, { "epoch": 0.460493719215412, "grad_norm": 3.5220651626586914, "learning_rate": 8.465615867686377e-07, "loss": 0.135, "step": 19750 }, { "epoch": 0.4607268805922299, "grad_norm": 2.1653833389282227, "learning_rate": 8.464838649505689e-07, "loss": 0.1216, "step": 19760 }, { "epoch": 0.46096004196904783, "grad_norm": 1.4733048677444458, "learning_rate": 8.464061431325001e-07, "loss": 0.1526, "step": 19770 }, { "epoch": 0.4611932033458658, "grad_norm": 3.1356217861175537, "learning_rate": 8.463284213144314e-07, "loss": 0.1151, "step": 19780 }, { "epoch": 0.46142636472268367, "grad_norm": 1.2777503728866577, "learning_rate": 8.462506994963626e-07, "loss": 0.132, "step": 19790 }, { "epoch": 0.4616595260995016, "grad_norm": 2.131112575531006, 
"learning_rate": 8.461729776782938e-07, "loss": 0.1316, "step": 19800 }, { "epoch": 0.46189268747631956, "grad_norm": 1.5103076696395874, "learning_rate": 8.46095255860225e-07, "loss": 0.1247, "step": 19810 }, { "epoch": 0.46212584885313746, "grad_norm": 1.9304636716842651, "learning_rate": 8.460175340421563e-07, "loss": 0.1317, "step": 19820 }, { "epoch": 0.4623590102299554, "grad_norm": 1.4198373556137085, "learning_rate": 8.459398122240875e-07, "loss": 0.1239, "step": 19830 }, { "epoch": 0.46259217160677335, "grad_norm": 1.7874321937561035, "learning_rate": 8.458620904060187e-07, "loss": 0.1375, "step": 19840 }, { "epoch": 0.46282533298359124, "grad_norm": 1.948537826538086, "learning_rate": 8.4578436858795e-07, "loss": 0.1341, "step": 19850 }, { "epoch": 0.4630584943604092, "grad_norm": 1.9977257251739502, "learning_rate": 8.457066467698812e-07, "loss": 0.1308, "step": 19860 }, { "epoch": 0.46329165573722714, "grad_norm": 2.3444228172302246, "learning_rate": 8.456289249518125e-07, "loss": 0.1401, "step": 19870 }, { "epoch": 0.4635248171140451, "grad_norm": 3.178436517715454, "learning_rate": 8.455512031337437e-07, "loss": 0.1309, "step": 19880 }, { "epoch": 0.463757978490863, "grad_norm": 2.954336643218994, "learning_rate": 8.454734813156748e-07, "loss": 0.1271, "step": 19890 }, { "epoch": 0.4639911398676809, "grad_norm": 4.1659321784973145, "learning_rate": 8.453957594976061e-07, "loss": 0.1354, "step": 19900 }, { "epoch": 0.46422430124449887, "grad_norm": 1.9610459804534912, "learning_rate": 8.453180376795373e-07, "loss": 0.1199, "step": 19910 }, { "epoch": 0.46445746262131676, "grad_norm": 1.527250051498413, "learning_rate": 8.452403158614686e-07, "loss": 0.1408, "step": 19920 }, { "epoch": 0.4646906239981347, "grad_norm": 3.155945062637329, "learning_rate": 8.451625940433999e-07, "loss": 0.1334, "step": 19930 }, { "epoch": 0.46492378537495266, "grad_norm": 2.2764341831207275, "learning_rate": 8.450848722253311e-07, "loss": 0.1327, "step": 19940 }, { 
"epoch": 0.46515694675177055, "grad_norm": 1.5692837238311768, "learning_rate": 8.450071504072623e-07, "loss": 0.1192, "step": 19950 }, { "epoch": 0.4653901081285885, "grad_norm": 1.1716152429580688, "learning_rate": 8.449294285891934e-07, "loss": 0.1282, "step": 19960 }, { "epoch": 0.46562326950540645, "grad_norm": 1.7464990615844727, "learning_rate": 8.448517067711247e-07, "loss": 0.1274, "step": 19970 }, { "epoch": 0.46585643088222434, "grad_norm": 1.7187268733978271, "learning_rate": 8.44773984953056e-07, "loss": 0.1333, "step": 19980 }, { "epoch": 0.4660895922590423, "grad_norm": 3.8566577434539795, "learning_rate": 8.446962631349872e-07, "loss": 0.1159, "step": 19990 }, { "epoch": 0.46632275363586023, "grad_norm": 2.5352652072906494, "learning_rate": 8.446185413169185e-07, "loss": 0.1316, "step": 20000 }, { "epoch": 0.46632275363586023, "eval_accuracy": 0.9359310089304548, "eval_f1": 0.9539849357289955, "eval_loss": 0.16536282002925873, "eval_runtime": 3953.9122, "eval_samples_per_second": 462.812, "eval_steps_per_second": 57.852, "step": 20000 }, { "epoch": 0.4665559150126781, "grad_norm": 1.8460683822631836, "learning_rate": 8.445408194988496e-07, "loss": 0.1408, "step": 20010 }, { "epoch": 0.46678907638949607, "grad_norm": 1.5195164680480957, "learning_rate": 8.444630976807809e-07, "loss": 0.1359, "step": 20020 }, { "epoch": 0.467022237766314, "grad_norm": 1.8220893144607544, "learning_rate": 8.443853758627122e-07, "loss": 0.1292, "step": 20030 }, { "epoch": 0.46725539914313197, "grad_norm": 2.269089460372925, "learning_rate": 8.443076540446433e-07, "loss": 0.1249, "step": 20040 }, { "epoch": 0.46748856051994986, "grad_norm": 1.41002357006073, "learning_rate": 8.442299322265746e-07, "loss": 0.1291, "step": 20050 }, { "epoch": 0.4677217218967678, "grad_norm": 1.6403591632843018, "learning_rate": 8.441522104085059e-07, "loss": 0.1197, "step": 20060 }, { "epoch": 0.46795488327358575, "grad_norm": 2.472461700439453, "learning_rate": 8.44074488590437e-07, 
"loss": 0.13, "step": 20070 }, { "epoch": 0.46818804465040365, "grad_norm": 1.3747100830078125, "learning_rate": 8.439967667723683e-07, "loss": 0.1302, "step": 20080 }, { "epoch": 0.4684212060272216, "grad_norm": 1.438865303993225, "learning_rate": 8.439190449542995e-07, "loss": 0.1266, "step": 20090 }, { "epoch": 0.46865436740403954, "grad_norm": 2.515681505203247, "learning_rate": 8.438413231362308e-07, "loss": 0.1285, "step": 20100 }, { "epoch": 0.46888752878085743, "grad_norm": 2.3855462074279785, "learning_rate": 8.437636013181621e-07, "loss": 0.1274, "step": 20110 }, { "epoch": 0.4691206901576754, "grad_norm": 1.6737693548202515, "learning_rate": 8.436858795000932e-07, "loss": 0.1258, "step": 20120 }, { "epoch": 0.4693538515344933, "grad_norm": 1.4331638813018799, "learning_rate": 8.436081576820244e-07, "loss": 0.1347, "step": 20130 }, { "epoch": 0.4695870129113112, "grad_norm": 1.2215746641159058, "learning_rate": 8.435304358639556e-07, "loss": 0.1308, "step": 20140 }, { "epoch": 0.46982017428812917, "grad_norm": 3.9724040031433105, "learning_rate": 8.434527140458869e-07, "loss": 0.1231, "step": 20150 }, { "epoch": 0.4700533356649471, "grad_norm": 1.2731586694717407, "learning_rate": 8.433749922278182e-07, "loss": 0.1174, "step": 20160 }, { "epoch": 0.470286497041765, "grad_norm": 3.345296859741211, "learning_rate": 8.432972704097494e-07, "loss": 0.1275, "step": 20170 }, { "epoch": 0.47051965841858295, "grad_norm": 1.433951735496521, "learning_rate": 8.432195485916807e-07, "loss": 0.131, "step": 20180 }, { "epoch": 0.4707528197954009, "grad_norm": 2.467684268951416, "learning_rate": 8.431418267736119e-07, "loss": 0.1222, "step": 20190 }, { "epoch": 0.47098598117221885, "grad_norm": 3.070120334625244, "learning_rate": 8.43064104955543e-07, "loss": 0.1248, "step": 20200 }, { "epoch": 0.47121914254903674, "grad_norm": 1.4740465879440308, "learning_rate": 8.429863831374743e-07, "loss": 0.1166, "step": 20210 }, { "epoch": 0.4714523039258547, "grad_norm": 
1.5479782819747925, "learning_rate": 8.429086613194055e-07, "loss": 0.1343, "step": 20220 }, { "epoch": 0.47168546530267264, "grad_norm": 1.89780592918396, "learning_rate": 8.428309395013368e-07, "loss": 0.1303, "step": 20230 }, { "epoch": 0.4719186266794905, "grad_norm": 1.6381566524505615, "learning_rate": 8.42753217683268e-07, "loss": 0.1381, "step": 20240 }, { "epoch": 0.4721517880563085, "grad_norm": 3.192000150680542, "learning_rate": 8.426754958651993e-07, "loss": 0.134, "step": 20250 }, { "epoch": 0.4723849494331264, "grad_norm": 1.9155235290527344, "learning_rate": 8.425977740471305e-07, "loss": 0.1274, "step": 20260 }, { "epoch": 0.4726181108099443, "grad_norm": 1.9539934396743774, "learning_rate": 8.425200522290617e-07, "loss": 0.1181, "step": 20270 }, { "epoch": 0.47285127218676226, "grad_norm": 2.0275652408599854, "learning_rate": 8.424423304109929e-07, "loss": 0.1296, "step": 20280 }, { "epoch": 0.4730844335635802, "grad_norm": 2.6832127571105957, "learning_rate": 8.423646085929241e-07, "loss": 0.1207, "step": 20290 }, { "epoch": 0.4733175949403981, "grad_norm": 2.773383140563965, "learning_rate": 8.422868867748554e-07, "loss": 0.1339, "step": 20300 }, { "epoch": 0.47355075631721605, "grad_norm": 1.9133456945419312, "learning_rate": 8.422091649567867e-07, "loss": 0.1338, "step": 20310 }, { "epoch": 0.473783917694034, "grad_norm": 2.3806815147399902, "learning_rate": 8.421314431387178e-07, "loss": 0.1359, "step": 20320 }, { "epoch": 0.4740170790708519, "grad_norm": 1.6426963806152344, "learning_rate": 8.420537213206491e-07, "loss": 0.1259, "step": 20330 }, { "epoch": 0.47425024044766984, "grad_norm": 1.9305709600448608, "learning_rate": 8.419759995025803e-07, "loss": 0.1372, "step": 20340 }, { "epoch": 0.4744834018244878, "grad_norm": 2.2709898948669434, "learning_rate": 8.418982776845116e-07, "loss": 0.1259, "step": 20350 }, { "epoch": 0.47471656320130573, "grad_norm": 1.8522928953170776, "learning_rate": 8.418205558664428e-07, "loss": 0.1355, "step": 
20360 }, { "epoch": 0.4749497245781236, "grad_norm": 1.3493374586105347, "learning_rate": 8.41742834048374e-07, "loss": 0.1266, "step": 20370 }, { "epoch": 0.47518288595494157, "grad_norm": 1.2510796785354614, "learning_rate": 8.416651122303052e-07, "loss": 0.1272, "step": 20380 }, { "epoch": 0.4754160473317595, "grad_norm": 1.5591704845428467, "learning_rate": 8.415873904122364e-07, "loss": 0.1365, "step": 20390 }, { "epoch": 0.4756492087085774, "grad_norm": 2.1219639778137207, "learning_rate": 8.415096685941677e-07, "loss": 0.1579, "step": 20400 }, { "epoch": 0.47588237008539536, "grad_norm": 1.510105848312378, "learning_rate": 8.41431946776099e-07, "loss": 0.1255, "step": 20410 }, { "epoch": 0.4761155314622133, "grad_norm": 1.2631494998931885, "learning_rate": 8.413542249580302e-07, "loss": 0.1279, "step": 20420 }, { "epoch": 0.4763486928390312, "grad_norm": 5.9193572998046875, "learning_rate": 8.412765031399615e-07, "loss": 0.1258, "step": 20430 }, { "epoch": 0.47658185421584914, "grad_norm": 1.4707579612731934, "learning_rate": 8.411987813218925e-07, "loss": 0.1319, "step": 20440 }, { "epoch": 0.4768150155926671, "grad_norm": 1.2576663494110107, "learning_rate": 8.411210595038238e-07, "loss": 0.1237, "step": 20450 }, { "epoch": 0.477048176969485, "grad_norm": 2.77142333984375, "learning_rate": 8.410433376857551e-07, "loss": 0.129, "step": 20460 }, { "epoch": 0.47728133834630293, "grad_norm": 1.5697789192199707, "learning_rate": 8.409656158676863e-07, "loss": 0.1235, "step": 20470 }, { "epoch": 0.4775144997231209, "grad_norm": 1.2722837924957275, "learning_rate": 8.408878940496176e-07, "loss": 0.1268, "step": 20480 }, { "epoch": 0.47774766109993877, "grad_norm": 1.4593607187271118, "learning_rate": 8.408101722315489e-07, "loss": 0.1331, "step": 20490 }, { "epoch": 0.4779808224767567, "grad_norm": 1.1792677640914917, "learning_rate": 8.407324504134801e-07, "loss": 0.1277, "step": 20500 }, { "epoch": 0.47821398385357466, "grad_norm": 1.803439736366272, 
"learning_rate": 8.406547285954113e-07, "loss": 0.1339, "step": 20510 }, { "epoch": 0.4784471452303926, "grad_norm": 1.8629831075668335, "learning_rate": 8.405770067773424e-07, "loss": 0.1201, "step": 20520 }, { "epoch": 0.4786803066072105, "grad_norm": 3.122999429702759, "learning_rate": 8.404992849592737e-07, "loss": 0.1172, "step": 20530 }, { "epoch": 0.47891346798402845, "grad_norm": 1.2951018810272217, "learning_rate": 8.40421563141205e-07, "loss": 0.1278, "step": 20540 }, { "epoch": 0.4791466293608464, "grad_norm": 1.3256354331970215, "learning_rate": 8.403438413231362e-07, "loss": 0.1209, "step": 20550 }, { "epoch": 0.4793797907376643, "grad_norm": 1.3957687616348267, "learning_rate": 8.402661195050675e-07, "loss": 0.1264, "step": 20560 }, { "epoch": 0.47961295211448224, "grad_norm": 4.781678199768066, "learning_rate": 8.401883976869986e-07, "loss": 0.1174, "step": 20570 }, { "epoch": 0.4798461134913002, "grad_norm": 4.019649982452393, "learning_rate": 8.401106758689299e-07, "loss": 0.1344, "step": 20580 }, { "epoch": 0.4800792748681181, "grad_norm": 1.0879679918289185, "learning_rate": 8.400329540508612e-07, "loss": 0.1274, "step": 20590 }, { "epoch": 0.480312436244936, "grad_norm": 1.7530466318130493, "learning_rate": 8.399552322327923e-07, "loss": 0.1297, "step": 20600 }, { "epoch": 0.480545597621754, "grad_norm": 1.3184597492218018, "learning_rate": 8.398775104147236e-07, "loss": 0.1274, "step": 20610 }, { "epoch": 0.48077875899857186, "grad_norm": 2.3299849033355713, "learning_rate": 8.397997885966548e-07, "loss": 0.1266, "step": 20620 }, { "epoch": 0.4810119203753898, "grad_norm": 2.5177817344665527, "learning_rate": 8.39722066778586e-07, "loss": 0.126, "step": 20630 }, { "epoch": 0.48124508175220776, "grad_norm": 2.8914573192596436, "learning_rate": 8.396443449605173e-07, "loss": 0.1347, "step": 20640 }, { "epoch": 0.48147824312902565, "grad_norm": 1.638247013092041, "learning_rate": 8.395666231424485e-07, "loss": 0.1291, "step": 20650 }, { "epoch": 
0.4817114045058436, "grad_norm": 2.2865891456604004, "learning_rate": 8.394966735061865e-07, "loss": 0.1245, "step": 20660 }, { "epoch": 0.48194456588266155, "grad_norm": 1.164203405380249, "learning_rate": 8.394189516881178e-07, "loss": 0.1316, "step": 20670 }, { "epoch": 0.4821777272594795, "grad_norm": 1.178735613822937, "learning_rate": 8.393412298700491e-07, "loss": 0.1211, "step": 20680 }, { "epoch": 0.4824108886362974, "grad_norm": 4.504310131072998, "learning_rate": 8.392635080519803e-07, "loss": 0.1329, "step": 20690 }, { "epoch": 0.48264405001311533, "grad_norm": 1.2757624387741089, "learning_rate": 8.391857862339116e-07, "loss": 0.1307, "step": 20700 }, { "epoch": 0.4828772113899333, "grad_norm": 1.1967114210128784, "learning_rate": 8.391080644158429e-07, "loss": 0.1213, "step": 20710 }, { "epoch": 0.4831103727667512, "grad_norm": 2.624696969985962, "learning_rate": 8.39030342597774e-07, "loss": 0.1259, "step": 20720 }, { "epoch": 0.4833435341435691, "grad_norm": 1.4068843126296997, "learning_rate": 8.389526207797052e-07, "loss": 0.13, "step": 20730 }, { "epoch": 0.48357669552038707, "grad_norm": 2.005720853805542, "learning_rate": 8.388748989616364e-07, "loss": 0.1279, "step": 20740 }, { "epoch": 0.48380985689720496, "grad_norm": 3.1830039024353027, "learning_rate": 8.387971771435677e-07, "loss": 0.1338, "step": 20750 }, { "epoch": 0.4840430182740229, "grad_norm": 2.4902327060699463, "learning_rate": 8.38719455325499e-07, "loss": 0.1344, "step": 20760 }, { "epoch": 0.48427617965084085, "grad_norm": 4.5719218254089355, "learning_rate": 8.386417335074302e-07, "loss": 0.1249, "step": 20770 }, { "epoch": 0.48450934102765875, "grad_norm": 1.2409528493881226, "learning_rate": 8.385640116893614e-07, "loss": 0.1143, "step": 20780 }, { "epoch": 0.4847425024044767, "grad_norm": 1.9577546119689941, "learning_rate": 8.384862898712926e-07, "loss": 0.1459, "step": 20790 }, { "epoch": 0.48497566378129464, "grad_norm": 1.5230985879898071, "learning_rate": 
8.384085680532239e-07, "loss": 0.115, "step": 20800 }, { "epoch": 0.48520882515811253, "grad_norm": 2.520401954650879, "learning_rate": 8.38330846235155e-07, "loss": 0.1346, "step": 20810 }, { "epoch": 0.4854419865349305, "grad_norm": 1.4270868301391602, "learning_rate": 8.382531244170863e-07, "loss": 0.1202, "step": 20820 }, { "epoch": 0.48567514791174843, "grad_norm": 1.294369101524353, "learning_rate": 8.381754025990176e-07, "loss": 0.132, "step": 20830 }, { "epoch": 0.4859083092885664, "grad_norm": 2.8860740661621094, "learning_rate": 8.380976807809487e-07, "loss": 0.1355, "step": 20840 }, { "epoch": 0.48614147066538427, "grad_norm": 1.435876488685608, "learning_rate": 8.3801995896288e-07, "loss": 0.1267, "step": 20850 }, { "epoch": 0.4863746320422022, "grad_norm": 1.9917433261871338, "learning_rate": 8.379422371448113e-07, "loss": 0.1273, "step": 20860 }, { "epoch": 0.48660779341902016, "grad_norm": 2.7482197284698486, "learning_rate": 8.378645153267425e-07, "loss": 0.1363, "step": 20870 }, { "epoch": 0.48684095479583805, "grad_norm": 2.4748027324676514, "learning_rate": 8.377867935086738e-07, "loss": 0.1289, "step": 20880 }, { "epoch": 0.487074116172656, "grad_norm": 1.1440248489379883, "learning_rate": 8.377090716906049e-07, "loss": 0.1266, "step": 20890 }, { "epoch": 0.48730727754947395, "grad_norm": 1.6609928607940674, "learning_rate": 8.376313498725361e-07, "loss": 0.136, "step": 20900 }, { "epoch": 0.48754043892629184, "grad_norm": 1.6898146867752075, "learning_rate": 8.375536280544674e-07, "loss": 0.1277, "step": 20910 }, { "epoch": 0.4877736003031098, "grad_norm": 2.037558078765869, "learning_rate": 8.374759062363986e-07, "loss": 0.13, "step": 20920 }, { "epoch": 0.48800676167992774, "grad_norm": 1.106569528579712, "learning_rate": 8.373981844183299e-07, "loss": 0.1228, "step": 20930 }, { "epoch": 0.48823992305674563, "grad_norm": 1.3121051788330078, "learning_rate": 8.373204626002611e-07, "loss": 0.1355, "step": 20940 }, { "epoch": 0.4884730844335636, 
"grad_norm": 1.9649745225906372, "learning_rate": 8.372427407821924e-07, "loss": 0.1196, "step": 20950 }, { "epoch": 0.4887062458103815, "grad_norm": 1.3725224733352661, "learning_rate": 8.371650189641237e-07, "loss": 0.1293, "step": 20960 }, { "epoch": 0.4889394071871994, "grad_norm": 1.241304874420166, "learning_rate": 8.370872971460547e-07, "loss": 0.13, "step": 20970 }, { "epoch": 0.48917256856401736, "grad_norm": 1.5113403797149658, "learning_rate": 8.37009575327986e-07, "loss": 0.1269, "step": 20980 }, { "epoch": 0.4894057299408353, "grad_norm": 1.370584487915039, "learning_rate": 8.369318535099172e-07, "loss": 0.1324, "step": 20990 }, { "epoch": 0.48963889131765326, "grad_norm": 1.3429780006408691, "learning_rate": 8.368541316918485e-07, "loss": 0.1256, "step": 21000 }, { "epoch": 0.48987205269447115, "grad_norm": 2.753114938735962, "learning_rate": 8.367764098737798e-07, "loss": 0.1253, "step": 21010 }, { "epoch": 0.4901052140712891, "grad_norm": 2.7693560123443604, "learning_rate": 8.36698688055711e-07, "loss": 0.1184, "step": 21020 }, { "epoch": 0.49033837544810704, "grad_norm": 3.215080738067627, "learning_rate": 8.366209662376422e-07, "loss": 0.1263, "step": 21030 }, { "epoch": 0.49057153682492494, "grad_norm": 1.2303367853164673, "learning_rate": 8.365432444195734e-07, "loss": 0.1339, "step": 21040 }, { "epoch": 0.4908046982017429, "grad_norm": 1.5861575603485107, "learning_rate": 8.364655226015046e-07, "loss": 0.1288, "step": 21050 }, { "epoch": 0.49103785957856083, "grad_norm": 1.7302842140197754, "learning_rate": 8.363878007834359e-07, "loss": 0.1221, "step": 21060 }, { "epoch": 0.4912710209553787, "grad_norm": 1.8254520893096924, "learning_rate": 8.363100789653671e-07, "loss": 0.1211, "step": 21070 }, { "epoch": 0.49150418233219667, "grad_norm": 1.1252317428588867, "learning_rate": 8.362323571472984e-07, "loss": 0.1242, "step": 21080 }, { "epoch": 0.4917373437090146, "grad_norm": 2.909825563430786, "learning_rate": 8.361546353292295e-07, "loss": 
0.1223, "step": 21090 }, { "epoch": 0.4919705050858325, "grad_norm": 1.5532315969467163, "learning_rate": 8.360769135111608e-07, "loss": 0.1286, "step": 21100 }, { "epoch": 0.49220366646265046, "grad_norm": 1.5361275672912598, "learning_rate": 8.359991916930921e-07, "loss": 0.129, "step": 21110 }, { "epoch": 0.4924368278394684, "grad_norm": 1.5791687965393066, "learning_rate": 8.359214698750233e-07, "loss": 0.1292, "step": 21120 }, { "epoch": 0.4926699892162863, "grad_norm": 1.4787017107009888, "learning_rate": 8.358437480569545e-07, "loss": 0.1279, "step": 21130 }, { "epoch": 0.49290315059310424, "grad_norm": 2.5545814037323, "learning_rate": 8.357660262388857e-07, "loss": 0.1092, "step": 21140 }, { "epoch": 0.4931363119699222, "grad_norm": 1.5042848587036133, "learning_rate": 8.356883044208169e-07, "loss": 0.126, "step": 21150 }, { "epoch": 0.49336947334674014, "grad_norm": 2.095512866973877, "learning_rate": 8.356105826027482e-07, "loss": 0.1328, "step": 21160 }, { "epoch": 0.49360263472355803, "grad_norm": 1.9433083534240723, "learning_rate": 8.355328607846794e-07, "loss": 0.1275, "step": 21170 }, { "epoch": 0.493835796100376, "grad_norm": 1.940142273902893, "learning_rate": 8.354551389666107e-07, "loss": 0.1376, "step": 21180 }, { "epoch": 0.4940689574771939, "grad_norm": 2.156810760498047, "learning_rate": 8.35377417148542e-07, "loss": 0.143, "step": 21190 }, { "epoch": 0.4943021188540118, "grad_norm": 1.962701439857483, "learning_rate": 8.352996953304732e-07, "loss": 0.1215, "step": 21200 }, { "epoch": 0.49453528023082977, "grad_norm": 1.4575930833816528, "learning_rate": 8.352219735124043e-07, "loss": 0.1286, "step": 21210 }, { "epoch": 0.4947684416076477, "grad_norm": 2.352045774459839, "learning_rate": 8.351442516943355e-07, "loss": 0.125, "step": 21220 }, { "epoch": 0.4950016029844656, "grad_norm": 1.020195722579956, "learning_rate": 8.350665298762668e-07, "loss": 0.1188, "step": 21230 }, { "epoch": 0.49523476436128355, "grad_norm": 1.438582420349121, 
"learning_rate": 8.34988808058198e-07, "loss": 0.1206, "step": 21240 }, { "epoch": 0.4954679257381015, "grad_norm": 1.3208593130111694, "learning_rate": 8.349110862401293e-07, "loss": 0.1226, "step": 21250 }, { "epoch": 0.4957010871149194, "grad_norm": 1.674843668937683, "learning_rate": 8.348333644220606e-07, "loss": 0.1223, "step": 21260 }, { "epoch": 0.49593424849173734, "grad_norm": 2.463418960571289, "learning_rate": 8.347556426039917e-07, "loss": 0.1326, "step": 21270 }, { "epoch": 0.4961674098685553, "grad_norm": 1.7285377979278564, "learning_rate": 8.34677920785923e-07, "loss": 0.1238, "step": 21280 }, { "epoch": 0.4964005712453732, "grad_norm": 1.7342774868011475, "learning_rate": 8.346001989678541e-07, "loss": 0.1214, "step": 21290 }, { "epoch": 0.4966337326221911, "grad_norm": 2.075880765914917, "learning_rate": 8.345224771497854e-07, "loss": 0.1377, "step": 21300 }, { "epoch": 0.4968668939990091, "grad_norm": 1.3335037231445312, "learning_rate": 8.344447553317167e-07, "loss": 0.1211, "step": 21310 }, { "epoch": 0.497100055375827, "grad_norm": 1.440301775932312, "learning_rate": 8.343670335136479e-07, "loss": 0.1219, "step": 21320 }, { "epoch": 0.4973332167526449, "grad_norm": 1.810006856918335, "learning_rate": 8.342893116955792e-07, "loss": 0.1294, "step": 21330 }, { "epoch": 0.49756637812946286, "grad_norm": 2.119765520095825, "learning_rate": 8.342115898775104e-07, "loss": 0.1297, "step": 21340 }, { "epoch": 0.4977995395062808, "grad_norm": 2.9094090461730957, "learning_rate": 8.341338680594416e-07, "loss": 0.1409, "step": 21350 }, { "epoch": 0.4980327008830987, "grad_norm": 1.4784271717071533, "learning_rate": 8.340561462413729e-07, "loss": 0.1335, "step": 21360 }, { "epoch": 0.49826586225991665, "grad_norm": 1.9297759532928467, "learning_rate": 8.33978424423304e-07, "loss": 0.1253, "step": 21370 }, { "epoch": 0.4984990236367346, "grad_norm": 2.052114963531494, "learning_rate": 8.339007026052353e-07, "loss": 0.1373, "step": 21380 }, { "epoch": 
0.4987321850135525, "grad_norm": 1.7115739583969116, "learning_rate": 8.338229807871666e-07, "loss": 0.1312, "step": 21390 }, { "epoch": 0.49896534639037043, "grad_norm": 1.8911221027374268, "learning_rate": 8.337452589690977e-07, "loss": 0.1334, "step": 21400 }, { "epoch": 0.4991985077671884, "grad_norm": 1.706508994102478, "learning_rate": 8.33667537151029e-07, "loss": 0.1308, "step": 21410 }, { "epoch": 0.4994316691440063, "grad_norm": 1.551406979560852, "learning_rate": 8.335898153329602e-07, "loss": 0.1324, "step": 21420 }, { "epoch": 0.4996648305208242, "grad_norm": 1.5026063919067383, "learning_rate": 8.335120935148915e-07, "loss": 0.1179, "step": 21430 }, { "epoch": 0.49989799189764217, "grad_norm": 1.4411370754241943, "learning_rate": 8.334343716968228e-07, "loss": 0.1139, "step": 21440 }, { "epoch": 0.5001311532744601, "grad_norm": 3.5878586769104004, "learning_rate": 8.33356649878754e-07, "loss": 0.1297, "step": 21450 }, { "epoch": 0.5003643146512781, "grad_norm": 1.3914203643798828, "learning_rate": 8.332789280606851e-07, "loss": 0.1264, "step": 21460 }, { "epoch": 0.5005974760280959, "grad_norm": 1.768888235092163, "learning_rate": 8.332012062426163e-07, "loss": 0.1427, "step": 21470 }, { "epoch": 0.5008306374049138, "grad_norm": 1.6664551496505737, "learning_rate": 8.331234844245476e-07, "loss": 0.1176, "step": 21480 }, { "epoch": 0.5010637987817318, "grad_norm": 2.2097997665405273, "learning_rate": 8.330457626064789e-07, "loss": 0.1325, "step": 21490 }, { "epoch": 0.5012969601585497, "grad_norm": 1.2098329067230225, "learning_rate": 8.329680407884101e-07, "loss": 0.1126, "step": 21500 }, { "epoch": 0.5015301215353677, "grad_norm": 3.8510091304779053, "learning_rate": 8.328903189703414e-07, "loss": 0.1214, "step": 21510 }, { "epoch": 0.5017632829121856, "grad_norm": 3.781320571899414, "learning_rate": 8.328125971522725e-07, "loss": 0.1254, "step": 21520 }, { "epoch": 0.5019964442890035, "grad_norm": 1.4929929971694946, "learning_rate": 
8.327348753342038e-07, "loss": 0.1314, "step": 21530 }, { "epoch": 0.5022296056658214, "grad_norm": 2.441821336746216, "learning_rate": 8.32657153516135e-07, "loss": 0.1209, "step": 21540 }, { "epoch": 0.5024627670426394, "grad_norm": 1.8204002380371094, "learning_rate": 8.325794316980662e-07, "loss": 0.1215, "step": 21550 }, { "epoch": 0.5026959284194573, "grad_norm": 1.8176003694534302, "learning_rate": 8.325017098799975e-07, "loss": 0.1124, "step": 21560 }, { "epoch": 0.5029290897962753, "grad_norm": 1.5023193359375, "learning_rate": 8.324239880619287e-07, "loss": 0.1337, "step": 21570 }, { "epoch": 0.5031622511730932, "grad_norm": 2.4639368057250977, "learning_rate": 8.323462662438599e-07, "loss": 0.1268, "step": 21580 }, { "epoch": 0.5033954125499112, "grad_norm": 1.5848349332809448, "learning_rate": 8.322685444257912e-07, "loss": 0.1455, "step": 21590 }, { "epoch": 0.503628573926729, "grad_norm": 1.7865599393844604, "learning_rate": 8.321908226077224e-07, "loss": 0.1203, "step": 21600 }, { "epoch": 0.5038617353035469, "grad_norm": 1.2442675828933716, "learning_rate": 8.321131007896537e-07, "loss": 0.1269, "step": 21610 }, { "epoch": 0.5040948966803649, "grad_norm": 2.0788145065307617, "learning_rate": 8.320353789715848e-07, "loss": 0.1206, "step": 21620 }, { "epoch": 0.5043280580571828, "grad_norm": 1.93287992477417, "learning_rate": 8.319576571535161e-07, "loss": 0.1486, "step": 21630 }, { "epoch": 0.5045612194340008, "grad_norm": 2.252885341644287, "learning_rate": 8.318799353354473e-07, "loss": 0.1319, "step": 21640 }, { "epoch": 0.5047943808108187, "grad_norm": 2.1544814109802246, "learning_rate": 8.318022135173785e-07, "loss": 0.1264, "step": 21650 }, { "epoch": 0.5050275421876366, "grad_norm": 1.9670062065124512, "learning_rate": 8.317244916993098e-07, "loss": 0.1383, "step": 21660 }, { "epoch": 0.5052607035644545, "grad_norm": 1.2472879886627197, "learning_rate": 8.31646769881241e-07, "loss": 0.1241, "step": 21670 }, { "epoch": 0.5054938649412725, 
"grad_norm": 2.504549026489258, "learning_rate": 8.315690480631723e-07, "loss": 0.1343, "step": 21680 }, { "epoch": 0.5057270263180904, "grad_norm": 1.24889075756073, "learning_rate": 8.314913262451036e-07, "loss": 0.1215, "step": 21690 }, { "epoch": 0.5059601876949084, "grad_norm": 2.576904296875, "learning_rate": 8.314136044270346e-07, "loss": 0.1346, "step": 21700 }, { "epoch": 0.5061933490717263, "grad_norm": 1.4697399139404297, "learning_rate": 8.313358826089659e-07, "loss": 0.1373, "step": 21710 }, { "epoch": 0.5064265104485443, "grad_norm": 2.1599009037017822, "learning_rate": 8.312581607908972e-07, "loss": 0.1272, "step": 21720 }, { "epoch": 0.5066596718253621, "grad_norm": 3.2394185066223145, "learning_rate": 8.311804389728284e-07, "loss": 0.1337, "step": 21730 }, { "epoch": 0.50689283320218, "grad_norm": 1.3317068815231323, "learning_rate": 8.311027171547597e-07, "loss": 0.1369, "step": 21740 }, { "epoch": 0.507125994578998, "grad_norm": 1.4011521339416504, "learning_rate": 8.310249953366909e-07, "loss": 0.1498, "step": 21750 }, { "epoch": 0.5073591559558159, "grad_norm": 2.4989163875579834, "learning_rate": 8.309472735186222e-07, "loss": 0.1268, "step": 21760 }, { "epoch": 0.5075923173326339, "grad_norm": 1.7494663000106812, "learning_rate": 8.308695517005534e-07, "loss": 0.1249, "step": 21770 }, { "epoch": 0.5078254787094518, "grad_norm": 1.7315900325775146, "learning_rate": 8.307918298824845e-07, "loss": 0.1252, "step": 21780 }, { "epoch": 0.5080586400862697, "grad_norm": 2.650188684463501, "learning_rate": 8.307141080644158e-07, "loss": 0.1183, "step": 21790 }, { "epoch": 0.5082918014630876, "grad_norm": 1.4787408113479614, "learning_rate": 8.30636386246347e-07, "loss": 0.1189, "step": 21800 }, { "epoch": 0.5085249628399056, "grad_norm": 3.1204187870025635, "learning_rate": 8.305586644282783e-07, "loss": 0.1085, "step": 21810 }, { "epoch": 0.5087581242167235, "grad_norm": 3.6875810623168945, "learning_rate": 8.304809426102096e-07, "loss": 0.1178, 
"step": 21820 }, { "epoch": 0.5089912855935415, "grad_norm": 1.6456385850906372, "learning_rate": 8.304032207921407e-07, "loss": 0.1264, "step": 21830 }, { "epoch": 0.5092244469703594, "grad_norm": 1.3485101461410522, "learning_rate": 8.30325498974072e-07, "loss": 0.1207, "step": 21840 }, { "epoch": 0.5094576083471772, "grad_norm": 1.6566025018692017, "learning_rate": 8.302477771560032e-07, "loss": 0.1269, "step": 21850 }, { "epoch": 0.5096907697239952, "grad_norm": 1.5516446828842163, "learning_rate": 8.301700553379344e-07, "loss": 0.1244, "step": 21860 }, { "epoch": 0.5099239311008131, "grad_norm": 1.5630277395248413, "learning_rate": 8.300923335198657e-07, "loss": 0.1237, "step": 21870 }, { "epoch": 0.5101570924776311, "grad_norm": 1.3655178546905518, "learning_rate": 8.300146117017969e-07, "loss": 0.1179, "step": 21880 }, { "epoch": 0.510390253854449, "grad_norm": 1.7931239604949951, "learning_rate": 8.299368898837281e-07, "loss": 0.1349, "step": 21890 }, { "epoch": 0.510623415231267, "grad_norm": 1.448595404624939, "learning_rate": 8.298591680656593e-07, "loss": 0.1219, "step": 21900 }, { "epoch": 0.5108565766080849, "grad_norm": 1.5933582782745361, "learning_rate": 8.297814462475906e-07, "loss": 0.133, "step": 21910 }, { "epoch": 0.5110897379849028, "grad_norm": 3.4577877521514893, "learning_rate": 8.297037244295219e-07, "loss": 0.1286, "step": 21920 }, { "epoch": 0.5113228993617207, "grad_norm": 1.2522382736206055, "learning_rate": 8.296260026114531e-07, "loss": 0.1235, "step": 21930 }, { "epoch": 0.5115560607385387, "grad_norm": 1.383453130722046, "learning_rate": 8.295482807933843e-07, "loss": 0.1191, "step": 21940 }, { "epoch": 0.5117892221153566, "grad_norm": 1.5258543491363525, "learning_rate": 8.294705589753154e-07, "loss": 0.1328, "step": 21950 }, { "epoch": 0.5120223834921745, "grad_norm": 2.291264533996582, "learning_rate": 8.293928371572467e-07, "loss": 0.1235, "step": 21960 }, { "epoch": 0.5122555448689925, "grad_norm": 1.1360750198364258, 
"learning_rate": 8.29315115339178e-07, "loss": 0.132, "step": 21970 }, { "epoch": 0.5124887062458103, "grad_norm": 2.4223155975341797, "learning_rate": 8.292373935211092e-07, "loss": 0.1309, "step": 21980 }, { "epoch": 0.5127218676226283, "grad_norm": 1.9386653900146484, "learning_rate": 8.291596717030405e-07, "loss": 0.14, "step": 21990 }, { "epoch": 0.5129550289994462, "grad_norm": 2.039977788925171, "learning_rate": 8.290819498849718e-07, "loss": 0.1334, "step": 22000 }, { "epoch": 0.5131881903762642, "grad_norm": 3.4593698978424072, "learning_rate": 8.290042280669029e-07, "loss": 0.1451, "step": 22010 }, { "epoch": 0.5134213517530821, "grad_norm": 1.3921164274215698, "learning_rate": 8.289265062488341e-07, "loss": 0.1261, "step": 22020 }, { "epoch": 0.5136545131299001, "grad_norm": 1.2240359783172607, "learning_rate": 8.288487844307653e-07, "loss": 0.1226, "step": 22030 }, { "epoch": 0.513887674506718, "grad_norm": 1.5827043056488037, "learning_rate": 8.287710626126966e-07, "loss": 0.1194, "step": 22040 }, { "epoch": 0.5141208358835359, "grad_norm": 1.6027922630310059, "learning_rate": 8.286933407946278e-07, "loss": 0.1233, "step": 22050 }, { "epoch": 0.5143539972603538, "grad_norm": 1.320663571357727, "learning_rate": 8.286156189765591e-07, "loss": 0.1277, "step": 22060 }, { "epoch": 0.5145871586371717, "grad_norm": 2.377835988998413, "learning_rate": 8.285378971584903e-07, "loss": 0.1328, "step": 22070 }, { "epoch": 0.5148203200139897, "grad_norm": 1.3724974393844604, "learning_rate": 8.284601753404215e-07, "loss": 0.1363, "step": 22080 }, { "epoch": 0.5150534813908076, "grad_norm": 2.290247678756714, "learning_rate": 8.283824535223528e-07, "loss": 0.132, "step": 22090 }, { "epoch": 0.5152866427676256, "grad_norm": 1.6334372758865356, "learning_rate": 8.28304731704284e-07, "loss": 0.1323, "step": 22100 }, { "epoch": 0.5155198041444434, "grad_norm": 3.0750646591186523, "learning_rate": 8.282270098862152e-07, "loss": 0.134, "step": 22110 }, { "epoch": 
0.5157529655212614, "grad_norm": 3.1351118087768555, "learning_rate": 8.281492880681465e-07, "loss": 0.1263, "step": 22120 }, { "epoch": 0.5159861268980793, "grad_norm": 1.6079703569412231, "learning_rate": 8.280715662500776e-07, "loss": 0.109, "step": 22130 }, { "epoch": 0.5162192882748973, "grad_norm": 3.3739075660705566, "learning_rate": 8.279938444320089e-07, "loss": 0.1174, "step": 22140 }, { "epoch": 0.5164524496517152, "grad_norm": 1.9263346195220947, "learning_rate": 8.279161226139402e-07, "loss": 0.1359, "step": 22150 }, { "epoch": 0.5166856110285332, "grad_norm": 1.8761056661605835, "learning_rate": 8.278384007958714e-07, "loss": 0.1194, "step": 22160 }, { "epoch": 0.516918772405351, "grad_norm": 1.3998554944992065, "learning_rate": 8.277606789778027e-07, "loss": 0.1245, "step": 22170 }, { "epoch": 0.517151933782169, "grad_norm": 1.8969961404800415, "learning_rate": 8.276829571597338e-07, "loss": 0.1229, "step": 22180 }, { "epoch": 0.5173850951589869, "grad_norm": 1.4227430820465088, "learning_rate": 8.27605235341665e-07, "loss": 0.1166, "step": 22190 }, { "epoch": 0.5176182565358048, "grad_norm": 1.4533779621124268, "learning_rate": 8.275275135235963e-07, "loss": 0.1207, "step": 22200 }, { "epoch": 0.5178514179126228, "grad_norm": 1.2055459022521973, "learning_rate": 8.274497917055275e-07, "loss": 0.1165, "step": 22210 }, { "epoch": 0.5180845792894407, "grad_norm": 4.2553229331970215, "learning_rate": 8.273720698874588e-07, "loss": 0.1176, "step": 22220 }, { "epoch": 0.5183177406662587, "grad_norm": 1.7255173921585083, "learning_rate": 8.2729434806939e-07, "loss": 0.1257, "step": 22230 }, { "epoch": 0.5185509020430765, "grad_norm": 1.9623571634292603, "learning_rate": 8.272166262513213e-07, "loss": 0.1224, "step": 22240 }, { "epoch": 0.5187840634198945, "grad_norm": 1.2914024591445923, "learning_rate": 8.271389044332526e-07, "loss": 0.1165, "step": 22250 }, { "epoch": 0.5190172247967124, "grad_norm": 2.7821364402770996, "learning_rate": 
8.270611826151836e-07, "loss": 0.1233, "step": 22260 }, { "epoch": 0.5192503861735304, "grad_norm": 1.1652735471725464, "learning_rate": 8.269834607971149e-07, "loss": 0.1304, "step": 22270 }, { "epoch": 0.5194835475503483, "grad_norm": 1.923224925994873, "learning_rate": 8.269057389790461e-07, "loss": 0.1188, "step": 22280 }, { "epoch": 0.5197167089271663, "grad_norm": 2.5299971103668213, "learning_rate": 8.268280171609774e-07, "loss": 0.1321, "step": 22290 }, { "epoch": 0.5199498703039841, "grad_norm": 2.4627110958099365, "learning_rate": 8.267502953429087e-07, "loss": 0.1244, "step": 22300 }, { "epoch": 0.520183031680802, "grad_norm": 1.183025598526001, "learning_rate": 8.266725735248399e-07, "loss": 0.1249, "step": 22310 }, { "epoch": 0.52041619305762, "grad_norm": 3.006518840789795, "learning_rate": 8.265948517067711e-07, "loss": 0.1233, "step": 22320 }, { "epoch": 0.5206493544344379, "grad_norm": 1.5197983980178833, "learning_rate": 8.265171298887023e-07, "loss": 0.1287, "step": 22330 }, { "epoch": 0.5208825158112559, "grad_norm": 1.7807471752166748, "learning_rate": 8.264394080706335e-07, "loss": 0.1329, "step": 22340 }, { "epoch": 0.5211156771880738, "grad_norm": 2.266185760498047, "learning_rate": 8.263616862525648e-07, "loss": 0.1236, "step": 22350 }, { "epoch": 0.5213488385648918, "grad_norm": 1.8248158693313599, "learning_rate": 8.26283964434496e-07, "loss": 0.1398, "step": 22360 }, { "epoch": 0.5215819999417096, "grad_norm": 1.5573277473449707, "learning_rate": 8.262062426164273e-07, "loss": 0.1215, "step": 22370 }, { "epoch": 0.5218151613185276, "grad_norm": 2.449381113052368, "learning_rate": 8.261285207983584e-07, "loss": 0.13, "step": 22380 }, { "epoch": 0.5220483226953455, "grad_norm": 1.5501075983047485, "learning_rate": 8.260507989802897e-07, "loss": 0.1309, "step": 22390 }, { "epoch": 0.5222814840721635, "grad_norm": 1.9193555116653442, "learning_rate": 8.25973077162221e-07, "loss": 0.1167, "step": 22400 }, { "epoch": 0.5225146454489814, 
"grad_norm": 2.6887404918670654, "learning_rate": 8.258953553441522e-07, "loss": 0.1204, "step": 22410 }, { "epoch": 0.5227478068257994, "grad_norm": 2.177759885787964, "learning_rate": 8.258176335260834e-07, "loss": 0.131, "step": 22420 }, { "epoch": 0.5229809682026172, "grad_norm": 3.113032817840576, "learning_rate": 8.257399117080146e-07, "loss": 0.115, "step": 22430 }, { "epoch": 0.5232141295794351, "grad_norm": 1.6057581901550293, "learning_rate": 8.256621898899458e-07, "loss": 0.1266, "step": 22440 }, { "epoch": 0.5234472909562531, "grad_norm": 3.3441944122314453, "learning_rate": 8.255844680718771e-07, "loss": 0.1185, "step": 22450 }, { "epoch": 0.523680452333071, "grad_norm": 1.6623482704162598, "learning_rate": 8.255067462538083e-07, "loss": 0.1194, "step": 22460 }, { "epoch": 0.523913613709889, "grad_norm": 1.4077507257461548, "learning_rate": 8.254290244357396e-07, "loss": 0.1244, "step": 22470 }, { "epoch": 0.5241467750867069, "grad_norm": 1.6218230724334717, "learning_rate": 8.253513026176709e-07, "loss": 0.13, "step": 22480 }, { "epoch": 0.5243799364635248, "grad_norm": 1.1145514249801636, "learning_rate": 8.252735807996021e-07, "loss": 0.1345, "step": 22490 }, { "epoch": 0.5246130978403427, "grad_norm": 1.9744139909744263, "learning_rate": 8.251958589815332e-07, "loss": 0.1228, "step": 22500 }, { "epoch": 0.5248462592171607, "grad_norm": 2.756617307662964, "learning_rate": 8.251181371634644e-07, "loss": 0.138, "step": 22510 }, { "epoch": 0.5250794205939786, "grad_norm": 1.2873464822769165, "learning_rate": 8.250404153453957e-07, "loss": 0.1145, "step": 22520 }, { "epoch": 0.5253125819707966, "grad_norm": 1.5757509469985962, "learning_rate": 8.24962693527327e-07, "loss": 0.1351, "step": 22530 }, { "epoch": 0.5255457433476145, "grad_norm": 2.7235217094421387, "learning_rate": 8.248849717092582e-07, "loss": 0.1277, "step": 22540 }, { "epoch": 0.5257789047244324, "grad_norm": 1.5028947591781616, "learning_rate": 8.248072498911895e-07, "loss": 0.1171, 
"step": 22550 }, { "epoch": 0.5260120661012503, "grad_norm": 1.7433838844299316, "learning_rate": 8.247295280731206e-07, "loss": 0.1064, "step": 22560 }, { "epoch": 0.5262452274780682, "grad_norm": 3.2150683403015137, "learning_rate": 8.246518062550519e-07, "loss": 0.1338, "step": 22570 }, { "epoch": 0.5264783888548862, "grad_norm": 2.781367063522339, "learning_rate": 8.24574084436983e-07, "loss": 0.123, "step": 22580 }, { "epoch": 0.5267115502317041, "grad_norm": 1.3580901622772217, "learning_rate": 8.244963626189143e-07, "loss": 0.132, "step": 22590 }, { "epoch": 0.5269447116085221, "grad_norm": 1.2757551670074463, "learning_rate": 8.244186408008456e-07, "loss": 0.1196, "step": 22600 }, { "epoch": 0.52717787298534, "grad_norm": 3.1361629962921143, "learning_rate": 8.243409189827768e-07, "loss": 0.1373, "step": 22610 }, { "epoch": 0.5274110343621579, "grad_norm": 1.4986242055892944, "learning_rate": 8.242631971647081e-07, "loss": 0.1229, "step": 22620 }, { "epoch": 0.5276441957389758, "grad_norm": 2.4701790809631348, "learning_rate": 8.241854753466393e-07, "loss": 0.1256, "step": 22630 }, { "epoch": 0.5278773571157938, "grad_norm": 4.059034824371338, "learning_rate": 8.241077535285705e-07, "loss": 0.1498, "step": 22640 }, { "epoch": 0.5281105184926117, "grad_norm": 1.3524978160858154, "learning_rate": 8.240300317105018e-07, "loss": 0.123, "step": 22650 }, { "epoch": 0.5283436798694296, "grad_norm": 1.969784140586853, "learning_rate": 8.239523098924329e-07, "loss": 0.1235, "step": 22660 }, { "epoch": 0.5285768412462476, "grad_norm": 3.4231297969818115, "learning_rate": 8.238745880743642e-07, "loss": 0.1223, "step": 22670 }, { "epoch": 0.5288100026230654, "grad_norm": 2.1345229148864746, "learning_rate": 8.238046384381023e-07, "loss": 0.1449, "step": 22680 }, { "epoch": 0.5290431639998834, "grad_norm": 1.8768846988677979, "learning_rate": 8.237269166200336e-07, "loss": 0.1304, "step": 22690 }, { "epoch": 0.5292763253767013, "grad_norm": 1.2699986696243286, 
"learning_rate": 8.236491948019648e-07, "loss": 0.1237, "step": 22700 }, { "epoch": 0.5295094867535193, "grad_norm": 1.7022615671157837, "learning_rate": 8.23571472983896e-07, "loss": 0.14, "step": 22710 }, { "epoch": 0.5297426481303372, "grad_norm": 1.047471523284912, "learning_rate": 8.234937511658272e-07, "loss": 0.1281, "step": 22720 }, { "epoch": 0.5299758095071552, "grad_norm": 3.540414333343506, "learning_rate": 8.234160293477584e-07, "loss": 0.1372, "step": 22730 }, { "epoch": 0.5302089708839731, "grad_norm": 1.3576087951660156, "learning_rate": 8.233383075296897e-07, "loss": 0.1249, "step": 22740 }, { "epoch": 0.530442132260791, "grad_norm": 4.675361156463623, "learning_rate": 8.232605857116209e-07, "loss": 0.1249, "step": 22750 }, { "epoch": 0.5306752936376089, "grad_norm": 2.8892085552215576, "learning_rate": 8.231828638935522e-07, "loss": 0.1343, "step": 22760 }, { "epoch": 0.5309084550144268, "grad_norm": 1.361635684967041, "learning_rate": 8.231051420754835e-07, "loss": 0.1182, "step": 22770 }, { "epoch": 0.5311416163912448, "grad_norm": 1.3769659996032715, "learning_rate": 8.230274202574146e-07, "loss": 0.1357, "step": 22780 }, { "epoch": 0.5313747777680627, "grad_norm": 3.679626941680908, "learning_rate": 8.229496984393458e-07, "loss": 0.1132, "step": 22790 }, { "epoch": 0.5316079391448807, "grad_norm": 2.896420955657959, "learning_rate": 8.22871976621277e-07, "loss": 0.1266, "step": 22800 }, { "epoch": 0.5318411005216985, "grad_norm": 1.636715054512024, "learning_rate": 8.227942548032083e-07, "loss": 0.114, "step": 22810 }, { "epoch": 0.5320742618985165, "grad_norm": 1.3522619009017944, "learning_rate": 8.227165329851396e-07, "loss": 0.1288, "step": 22820 }, { "epoch": 0.5323074232753344, "grad_norm": 1.2705423831939697, "learning_rate": 8.226388111670708e-07, "loss": 0.1337, "step": 22830 }, { "epoch": 0.5325405846521524, "grad_norm": 4.337090015411377, "learning_rate": 8.22561089349002e-07, "loss": 0.1285, "step": 22840 }, { "epoch": 
0.5327737460289703, "grad_norm": 2.6833367347717285, "learning_rate": 8.224833675309332e-07, "loss": 0.1334, "step": 22850 }, { "epoch": 0.5330069074057883, "grad_norm": 1.5908411741256714, "learning_rate": 8.224056457128645e-07, "loss": 0.134, "step": 22860 }, { "epoch": 0.5332400687826062, "grad_norm": 1.6649872064590454, "learning_rate": 8.223279238947957e-07, "loss": 0.1336, "step": 22870 }, { "epoch": 0.533473230159424, "grad_norm": 2.9533793926239014, "learning_rate": 8.222502020767269e-07, "loss": 0.1197, "step": 22880 }, { "epoch": 0.533706391536242, "grad_norm": 1.5918611288070679, "learning_rate": 8.221724802586582e-07, "loss": 0.1097, "step": 22890 }, { "epoch": 0.53393955291306, "grad_norm": 1.3670610189437866, "learning_rate": 8.220947584405893e-07, "loss": 0.1244, "step": 22900 }, { "epoch": 0.5341727142898779, "grad_norm": 1.7467800378799438, "learning_rate": 8.220170366225206e-07, "loss": 0.1199, "step": 22910 }, { "epoch": 0.5344058756666958, "grad_norm": 1.121644377708435, "learning_rate": 8.219393148044519e-07, "loss": 0.1165, "step": 22920 }, { "epoch": 0.5346390370435138, "grad_norm": 1.1821573972702026, "learning_rate": 8.218615929863831e-07, "loss": 0.1287, "step": 22930 }, { "epoch": 0.5348721984203316, "grad_norm": 1.9825356006622314, "learning_rate": 8.217838711683144e-07, "loss": 0.1209, "step": 22940 }, { "epoch": 0.5351053597971496, "grad_norm": 2.4757630825042725, "learning_rate": 8.217061493502456e-07, "loss": 0.1395, "step": 22950 }, { "epoch": 0.5353385211739675, "grad_norm": 1.4488638639450073, "learning_rate": 8.216284275321767e-07, "loss": 0.1144, "step": 22960 }, { "epoch": 0.5355716825507855, "grad_norm": 1.5226280689239502, "learning_rate": 8.21550705714108e-07, "loss": 0.1329, "step": 22970 }, { "epoch": 0.5358048439276034, "grad_norm": 1.4106051921844482, "learning_rate": 8.214729838960392e-07, "loss": 0.1304, "step": 22980 }, { "epoch": 0.5360380053044214, "grad_norm": 2.461071252822876, "learning_rate": 
8.213952620779705e-07, "loss": 0.1178, "step": 22990 }, { "epoch": 0.5362711666812392, "grad_norm": 1.144471287727356, "learning_rate": 8.213175402599018e-07, "loss": 0.1183, "step": 23000 }, { "epoch": 0.5365043280580571, "grad_norm": 1.437994122505188, "learning_rate": 8.21239818441833e-07, "loss": 0.1264, "step": 23010 }, { "epoch": 0.5367374894348751, "grad_norm": 1.2498295307159424, "learning_rate": 8.211620966237643e-07, "loss": 0.1284, "step": 23020 }, { "epoch": 0.536970650811693, "grad_norm": 2.6744298934936523, "learning_rate": 8.210843748056953e-07, "loss": 0.1345, "step": 23030 }, { "epoch": 0.537203812188511, "grad_norm": 1.3113751411437988, "learning_rate": 8.210066529876266e-07, "loss": 0.1242, "step": 23040 }, { "epoch": 0.5374369735653289, "grad_norm": 1.408484697341919, "learning_rate": 8.209289311695579e-07, "loss": 0.1312, "step": 23050 }, { "epoch": 0.5376701349421469, "grad_norm": 2.771350622177124, "learning_rate": 8.208512093514891e-07, "loss": 0.127, "step": 23060 }, { "epoch": 0.5379032963189647, "grad_norm": 1.9287402629852295, "learning_rate": 8.207734875334204e-07, "loss": 0.1302, "step": 23070 }, { "epoch": 0.5381364576957827, "grad_norm": 2.821913003921509, "learning_rate": 8.206957657153516e-07, "loss": 0.1282, "step": 23080 }, { "epoch": 0.5383696190726006, "grad_norm": 2.865629196166992, "learning_rate": 8.206180438972828e-07, "loss": 0.1252, "step": 23090 }, { "epoch": 0.5386027804494186, "grad_norm": 1.4961310625076294, "learning_rate": 8.205403220792141e-07, "loss": 0.1134, "step": 23100 }, { "epoch": 0.5388359418262365, "grad_norm": 1.3260258436203003, "learning_rate": 8.204626002611452e-07, "loss": 0.117, "step": 23110 }, { "epoch": 0.5390691032030545, "grad_norm": 2.344412088394165, "learning_rate": 8.203848784430765e-07, "loss": 0.1305, "step": 23120 }, { "epoch": 0.5393022645798723, "grad_norm": 1.8058950901031494, "learning_rate": 8.203071566250077e-07, "loss": 0.1173, "step": 23130 }, { "epoch": 0.5395354259566902, 
"grad_norm": 2.4079549312591553, "learning_rate": 8.20229434806939e-07, "loss": 0.1309, "step": 23140 }, { "epoch": 0.5397685873335082, "grad_norm": 3.199129104614258, "learning_rate": 8.201517129888702e-07, "loss": 0.1389, "step": 23150 }, { "epoch": 0.5400017487103261, "grad_norm": 3.1715707778930664, "learning_rate": 8.200739911708014e-07, "loss": 0.1241, "step": 23160 }, { "epoch": 0.5402349100871441, "grad_norm": 1.315953254699707, "learning_rate": 8.199962693527327e-07, "loss": 0.1313, "step": 23170 }, { "epoch": 0.540468071463962, "grad_norm": 1.3561760187149048, "learning_rate": 8.19918547534664e-07, "loss": 0.1167, "step": 23180 }, { "epoch": 0.54070123284078, "grad_norm": 2.6183249950408936, "learning_rate": 8.198408257165951e-07, "loss": 0.1437, "step": 23190 }, { "epoch": 0.5409343942175978, "grad_norm": 2.1626784801483154, "learning_rate": 8.197631038985264e-07, "loss": 0.1209, "step": 23200 }, { "epoch": 0.5411675555944158, "grad_norm": 1.2278202772140503, "learning_rate": 8.196853820804575e-07, "loss": 0.1193, "step": 23210 }, { "epoch": 0.5414007169712337, "grad_norm": 2.5852739810943604, "learning_rate": 8.196076602623888e-07, "loss": 0.1192, "step": 23220 }, { "epoch": 0.5416338783480517, "grad_norm": 1.962585687637329, "learning_rate": 8.1952993844432e-07, "loss": 0.1294, "step": 23230 }, { "epoch": 0.5418670397248696, "grad_norm": 2.642951011657715, "learning_rate": 8.194522166262513e-07, "loss": 0.1298, "step": 23240 }, { "epoch": 0.5421002011016876, "grad_norm": 1.5924725532531738, "learning_rate": 8.193744948081826e-07, "loss": 0.1351, "step": 23250 }, { "epoch": 0.5423333624785054, "grad_norm": 3.535335063934326, "learning_rate": 8.192967729901138e-07, "loss": 0.1266, "step": 23260 }, { "epoch": 0.5425665238553233, "grad_norm": 2.862778663635254, "learning_rate": 8.192190511720449e-07, "loss": 0.1339, "step": 23270 }, { "epoch": 0.5427996852321413, "grad_norm": 1.201092004776001, "learning_rate": 8.191413293539761e-07, "loss": 0.1306, 
"step": 23280 }, { "epoch": 0.5430328466089592, "grad_norm": 1.7555756568908691, "learning_rate": 8.190636075359074e-07, "loss": 0.1334, "step": 23290 }, { "epoch": 0.5432660079857772, "grad_norm": 4.084972381591797, "learning_rate": 8.189858857178387e-07, "loss": 0.1297, "step": 23300 }, { "epoch": 0.5434991693625951, "grad_norm": 1.6490159034729004, "learning_rate": 8.189081638997699e-07, "loss": 0.1215, "step": 23310 }, { "epoch": 0.543732330739413, "grad_norm": 2.2488479614257812, "learning_rate": 8.188304420817012e-07, "loss": 0.1294, "step": 23320 }, { "epoch": 0.5439654921162309, "grad_norm": 1.3956836462020874, "learning_rate": 8.187527202636324e-07, "loss": 0.1324, "step": 23330 }, { "epoch": 0.5441986534930489, "grad_norm": 1.0723967552185059, "learning_rate": 8.186749984455636e-07, "loss": 0.1234, "step": 23340 }, { "epoch": 0.5444318148698668, "grad_norm": 1.4178904294967651, "learning_rate": 8.185972766274948e-07, "loss": 0.1264, "step": 23350 }, { "epoch": 0.5446649762466848, "grad_norm": 2.3933467864990234, "learning_rate": 8.18519554809426e-07, "loss": 0.1406, "step": 23360 }, { "epoch": 0.5448981376235027, "grad_norm": 1.7381515502929688, "learning_rate": 8.184418329913573e-07, "loss": 0.1218, "step": 23370 }, { "epoch": 0.5451312990003206, "grad_norm": 2.139936685562134, "learning_rate": 8.183641111732886e-07, "loss": 0.121, "step": 23380 }, { "epoch": 0.5453644603771385, "grad_norm": 1.0594931840896606, "learning_rate": 8.182863893552197e-07, "loss": 0.1249, "step": 23390 }, { "epoch": 0.5455976217539564, "grad_norm": 3.5597429275512695, "learning_rate": 8.18208667537151e-07, "loss": 0.1333, "step": 23400 }, { "epoch": 0.5458307831307744, "grad_norm": 2.870252847671509, "learning_rate": 8.181309457190822e-07, "loss": 0.1279, "step": 23410 }, { "epoch": 0.5460639445075923, "grad_norm": 1.173078179359436, "learning_rate": 8.180532239010135e-07, "loss": 0.1438, "step": 23420 }, { "epoch": 0.5462971058844103, "grad_norm": 1.4986624717712402, 
"learning_rate": 8.179755020829447e-07, "loss": 0.1255, "step": 23430 }, { "epoch": 0.5465302672612282, "grad_norm": 1.6651873588562012, "learning_rate": 8.178977802648759e-07, "loss": 0.1342, "step": 23440 }, { "epoch": 0.5467634286380461, "grad_norm": 1.2878241539001465, "learning_rate": 8.178200584468072e-07, "loss": 0.1283, "step": 23450 }, { "epoch": 0.546996590014864, "grad_norm": 2.251814365386963, "learning_rate": 8.177423366287383e-07, "loss": 0.1202, "step": 23460 }, { "epoch": 0.547229751391682, "grad_norm": 1.7903753519058228, "learning_rate": 8.176646148106696e-07, "loss": 0.1296, "step": 23470 }, { "epoch": 0.5474629127684999, "grad_norm": 2.618098020553589, "learning_rate": 8.175868929926009e-07, "loss": 0.1291, "step": 23480 }, { "epoch": 0.5476960741453178, "grad_norm": 1.5144972801208496, "learning_rate": 8.175091711745321e-07, "loss": 0.1249, "step": 23490 }, { "epoch": 0.5479292355221358, "grad_norm": 2.290508270263672, "learning_rate": 8.174314493564634e-07, "loss": 0.1193, "step": 23500 }, { "epoch": 0.5481623968989537, "grad_norm": 1.740647554397583, "learning_rate": 8.173537275383945e-07, "loss": 0.1317, "step": 23510 }, { "epoch": 0.5483955582757716, "grad_norm": 2.833183765411377, "learning_rate": 8.172760057203257e-07, "loss": 0.1322, "step": 23520 }, { "epoch": 0.5486287196525895, "grad_norm": 1.237043023109436, "learning_rate": 8.17198283902257e-07, "loss": 0.1297, "step": 23530 }, { "epoch": 0.5488618810294075, "grad_norm": 1.4128801822662354, "learning_rate": 8.171205620841882e-07, "loss": 0.1123, "step": 23540 }, { "epoch": 0.5490950424062254, "grad_norm": 1.9018300771713257, "learning_rate": 8.170428402661195e-07, "loss": 0.128, "step": 23550 }, { "epoch": 0.5493282037830434, "grad_norm": 1.9312565326690674, "learning_rate": 8.169651184480507e-07, "loss": 0.111, "step": 23560 }, { "epoch": 0.5495613651598613, "grad_norm": 2.335942506790161, "learning_rate": 8.16887396629982e-07, "loss": 0.1352, "step": 23570 }, { "epoch": 
0.5497945265366792, "grad_norm": 1.4398480653762817, "learning_rate": 8.168096748119132e-07, "loss": 0.1107, "step": 23580 }, { "epoch": 0.5500276879134971, "grad_norm": 2.160726308822632, "learning_rate": 8.167319529938443e-07, "loss": 0.1283, "step": 23590 }, { "epoch": 0.550260849290315, "grad_norm": 1.4372597932815552, "learning_rate": 8.166542311757756e-07, "loss": 0.1083, "step": 23600 }, { "epoch": 0.550494010667133, "grad_norm": 1.4080713987350464, "learning_rate": 8.165765093577068e-07, "loss": 0.13, "step": 23610 }, { "epoch": 0.5507271720439509, "grad_norm": 1.5650595426559448, "learning_rate": 8.164987875396381e-07, "loss": 0.1262, "step": 23620 }, { "epoch": 0.5509603334207689, "grad_norm": 1.4357627630233765, "learning_rate": 8.164210657215694e-07, "loss": 0.1263, "step": 23630 }, { "epoch": 0.5511934947975867, "grad_norm": 2.3720624446868896, "learning_rate": 8.163433439035005e-07, "loss": 0.1362, "step": 23640 }, { "epoch": 0.5514266561744047, "grad_norm": 1.0998618602752686, "learning_rate": 8.162656220854318e-07, "loss": 0.121, "step": 23650 }, { "epoch": 0.5516598175512226, "grad_norm": 1.4071333408355713, "learning_rate": 8.16187900267363e-07, "loss": 0.1179, "step": 23660 }, { "epoch": 0.5518929789280406, "grad_norm": 1.3499128818511963, "learning_rate": 8.161101784492942e-07, "loss": 0.1208, "step": 23670 }, { "epoch": 0.5521261403048585, "grad_norm": 1.456547498703003, "learning_rate": 8.160324566312255e-07, "loss": 0.1224, "step": 23680 }, { "epoch": 0.5523593016816765, "grad_norm": 1.4705359935760498, "learning_rate": 8.159547348131567e-07, "loss": 0.1334, "step": 23690 }, { "epoch": 0.5525924630584944, "grad_norm": 1.1387742757797241, "learning_rate": 8.158770129950879e-07, "loss": 0.1313, "step": 23700 }, { "epoch": 0.5528256244353122, "grad_norm": 1.415285587310791, "learning_rate": 8.157992911770191e-07, "loss": 0.118, "step": 23710 }, { "epoch": 0.5530587858121302, "grad_norm": 1.2377961874008179, "learning_rate": 
8.157215693589504e-07, "loss": 0.1235, "step": 23720 }, { "epoch": 0.5532919471889481, "grad_norm": 2.968824625015259, "learning_rate": 8.156438475408817e-07, "loss": 0.1166, "step": 23730 }, { "epoch": 0.5535251085657661, "grad_norm": 2.183737277984619, "learning_rate": 8.155661257228129e-07, "loss": 0.1245, "step": 23740 }, { "epoch": 0.553758269942584, "grad_norm": 1.484630823135376, "learning_rate": 8.154884039047441e-07, "loss": 0.1288, "step": 23750 }, { "epoch": 0.553991431319402, "grad_norm": 1.1246342658996582, "learning_rate": 8.154106820866752e-07, "loss": 0.1187, "step": 23760 }, { "epoch": 0.5542245926962198, "grad_norm": 2.6674070358276367, "learning_rate": 8.153329602686065e-07, "loss": 0.1321, "step": 23770 }, { "epoch": 0.5544577540730378, "grad_norm": 3.252545118331909, "learning_rate": 8.152552384505378e-07, "loss": 0.1222, "step": 23780 }, { "epoch": 0.5546909154498557, "grad_norm": 1.3610163927078247, "learning_rate": 8.15177516632469e-07, "loss": 0.1371, "step": 23790 }, { "epoch": 0.5549240768266737, "grad_norm": 1.2479833364486694, "learning_rate": 8.150997948144003e-07, "loss": 0.1201, "step": 23800 }, { "epoch": 0.5551572382034916, "grad_norm": 1.4350253343582153, "learning_rate": 8.150220729963316e-07, "loss": 0.1192, "step": 23810 }, { "epoch": 0.5553903995803096, "grad_norm": 1.592720627784729, "learning_rate": 8.149443511782628e-07, "loss": 0.1245, "step": 23820 }, { "epoch": 0.5556235609571275, "grad_norm": 1.4343043565750122, "learning_rate": 8.148666293601939e-07, "loss": 0.12, "step": 23830 }, { "epoch": 0.5558567223339453, "grad_norm": 1.4043222665786743, "learning_rate": 8.147889075421251e-07, "loss": 0.1355, "step": 23840 }, { "epoch": 0.5560898837107633, "grad_norm": 2.894827365875244, "learning_rate": 8.147111857240564e-07, "loss": 0.1442, "step": 23850 }, { "epoch": 0.5563230450875812, "grad_norm": 1.5112224817276, "learning_rate": 8.146334639059877e-07, "loss": 0.1347, "step": 23860 }, { "epoch": 0.5565562064643992, 
"grad_norm": 2.8139896392822266, "learning_rate": 8.145557420879189e-07, "loss": 0.1254, "step": 23870 }, { "epoch": 0.5567893678412171, "grad_norm": 1.6828118562698364, "learning_rate": 8.144780202698502e-07, "loss": 0.1228, "step": 23880 }, { "epoch": 0.5570225292180351, "grad_norm": 3.7476251125335693, "learning_rate": 8.144002984517813e-07, "loss": 0.1244, "step": 23890 }, { "epoch": 0.5572556905948529, "grad_norm": 2.085965156555176, "learning_rate": 8.143225766337126e-07, "loss": 0.1277, "step": 23900 }, { "epoch": 0.5574888519716709, "grad_norm": 1.1396937370300293, "learning_rate": 8.142448548156438e-07, "loss": 0.1269, "step": 23910 }, { "epoch": 0.5577220133484888, "grad_norm": 1.556832194328308, "learning_rate": 8.14167132997575e-07, "loss": 0.1221, "step": 23920 }, { "epoch": 0.5579551747253068, "grad_norm": 1.3904298543930054, "learning_rate": 8.140894111795063e-07, "loss": 0.1211, "step": 23930 }, { "epoch": 0.5581883361021247, "grad_norm": 1.4594526290893555, "learning_rate": 8.140116893614375e-07, "loss": 0.1099, "step": 23940 }, { "epoch": 0.5584214974789427, "grad_norm": 2.0661568641662598, "learning_rate": 8.139339675433687e-07, "loss": 0.1327, "step": 23950 }, { "epoch": 0.5586546588557605, "grad_norm": 1.8315457105636597, "learning_rate": 8.138562457253e-07, "loss": 0.1317, "step": 23960 }, { "epoch": 0.5588878202325784, "grad_norm": 1.3287235498428345, "learning_rate": 8.137785239072312e-07, "loss": 0.1246, "step": 23970 }, { "epoch": 0.5591209816093964, "grad_norm": 1.5337454080581665, "learning_rate": 8.137008020891625e-07, "loss": 0.1264, "step": 23980 }, { "epoch": 0.5593541429862143, "grad_norm": 1.7352895736694336, "learning_rate": 8.136230802710936e-07, "loss": 0.1237, "step": 23990 }, { "epoch": 0.5595873043630323, "grad_norm": 1.686029314994812, "learning_rate": 8.135453584530249e-07, "loss": 0.1353, "step": 24000 }, { "epoch": 0.5598204657398502, "grad_norm": 1.1729248762130737, "learning_rate": 8.134676366349561e-07, "loss": 0.1216, 
"step": 24010 }, { "epoch": 0.5600536271166682, "grad_norm": 1.6111642122268677, "learning_rate": 8.133899148168873e-07, "loss": 0.1269, "step": 24020 }, { "epoch": 0.560286788493486, "grad_norm": 1.2782241106033325, "learning_rate": 8.133121929988186e-07, "loss": 0.1211, "step": 24030 }, { "epoch": 0.560519949870304, "grad_norm": 1.293053388595581, "learning_rate": 8.132344711807498e-07, "loss": 0.1204, "step": 24040 }, { "epoch": 0.5607531112471219, "grad_norm": 1.3371658325195312, "learning_rate": 8.131567493626811e-07, "loss": 0.1274, "step": 24050 }, { "epoch": 0.5609862726239399, "grad_norm": 1.2781243324279785, "learning_rate": 8.130790275446124e-07, "loss": 0.1213, "step": 24060 }, { "epoch": 0.5612194340007578, "grad_norm": 2.2057604789733887, "learning_rate": 8.130013057265435e-07, "loss": 0.1201, "step": 24070 }, { "epoch": 0.5614525953775757, "grad_norm": 1.2555017471313477, "learning_rate": 8.129235839084747e-07, "loss": 0.1288, "step": 24080 }, { "epoch": 0.5616857567543936, "grad_norm": 1.4361501932144165, "learning_rate": 8.128458620904059e-07, "loss": 0.1215, "step": 24090 }, { "epoch": 0.5619189181312115, "grad_norm": 1.0896259546279907, "learning_rate": 8.127681402723372e-07, "loss": 0.1274, "step": 24100 }, { "epoch": 0.5621520795080295, "grad_norm": 3.90000581741333, "learning_rate": 8.126904184542685e-07, "loss": 0.1251, "step": 24110 }, { "epoch": 0.5623852408848474, "grad_norm": 1.4064956903457642, "learning_rate": 8.126126966361997e-07, "loss": 0.118, "step": 24120 }, { "epoch": 0.5626184022616654, "grad_norm": 1.4810826778411865, "learning_rate": 8.125349748181309e-07, "loss": 0.1306, "step": 24130 }, { "epoch": 0.5628515636384833, "grad_norm": 2.5606040954589844, "learning_rate": 8.124572530000621e-07, "loss": 0.1226, "step": 24140 }, { "epoch": 0.5630847250153013, "grad_norm": 2.3678975105285645, "learning_rate": 8.123795311819934e-07, "loss": 0.1231, "step": 24150 }, { "epoch": 0.5633178863921191, "grad_norm": 2.7214722633361816, 
"learning_rate": 8.123018093639246e-07, "loss": 0.127, "step": 24160 }, { "epoch": 0.563551047768937, "grad_norm": 1.9406142234802246, "learning_rate": 8.122240875458558e-07, "loss": 0.1315, "step": 24170 }, { "epoch": 0.563784209145755, "grad_norm": 1.2139357328414917, "learning_rate": 8.121463657277871e-07, "loss": 0.1226, "step": 24180 }, { "epoch": 0.564017370522573, "grad_norm": 1.124192476272583, "learning_rate": 8.120686439097182e-07, "loss": 0.135, "step": 24190 }, { "epoch": 0.5642505318993909, "grad_norm": 1.6052943468093872, "learning_rate": 8.119909220916495e-07, "loss": 0.1259, "step": 24200 }, { "epoch": 0.5644836932762088, "grad_norm": 1.813039779663086, "learning_rate": 8.119132002735808e-07, "loss": 0.1283, "step": 24210 }, { "epoch": 0.5647168546530267, "grad_norm": 2.597559928894043, "learning_rate": 8.11835478455512e-07, "loss": 0.1182, "step": 24220 }, { "epoch": 0.5649500160298446, "grad_norm": 1.356108546257019, "learning_rate": 8.117577566374433e-07, "loss": 0.1278, "step": 24230 }, { "epoch": 0.5651831774066626, "grad_norm": 1.4941706657409668, "learning_rate": 8.116800348193745e-07, "loss": 0.1142, "step": 24240 }, { "epoch": 0.5654163387834805, "grad_norm": 1.7754881381988525, "learning_rate": 8.116023130013056e-07, "loss": 0.1213, "step": 24250 }, { "epoch": 0.5656495001602985, "grad_norm": 1.4006999731063843, "learning_rate": 8.115245911832369e-07, "loss": 0.1231, "step": 24260 }, { "epoch": 0.5658826615371164, "grad_norm": 1.5025970935821533, "learning_rate": 8.114468693651681e-07, "loss": 0.124, "step": 24270 }, { "epoch": 0.5661158229139343, "grad_norm": 2.284609794616699, "learning_rate": 8.113691475470994e-07, "loss": 0.1489, "step": 24280 }, { "epoch": 0.5663489842907522, "grad_norm": 1.1194974184036255, "learning_rate": 8.112914257290307e-07, "loss": 0.1254, "step": 24290 }, { "epoch": 0.5665821456675701, "grad_norm": 3.0875155925750732, "learning_rate": 8.112137039109619e-07, "loss": 0.1205, "step": 24300 }, { "epoch": 
0.5668153070443881, "grad_norm": 1.2138422727584839, "learning_rate": 8.111359820928932e-07, "loss": 0.1171, "step": 24310 }, { "epoch": 0.567048468421206, "grad_norm": 1.377011775970459, "learning_rate": 8.110582602748242e-07, "loss": 0.1141, "step": 24320 }, { "epoch": 0.567281629798024, "grad_norm": 2.430323839187622, "learning_rate": 8.109805384567555e-07, "loss": 0.1232, "step": 24330 }, { "epoch": 0.5675147911748419, "grad_norm": 2.0369584560394287, "learning_rate": 8.109028166386868e-07, "loss": 0.129, "step": 24340 }, { "epoch": 0.5677479525516598, "grad_norm": 1.9480113983154297, "learning_rate": 8.10825094820618e-07, "loss": 0.1366, "step": 24350 }, { "epoch": 0.5679811139284777, "grad_norm": 1.7135975360870361, "learning_rate": 8.107473730025493e-07, "loss": 0.1201, "step": 24360 }, { "epoch": 0.5682142753052957, "grad_norm": 1.7658190727233887, "learning_rate": 8.106696511844805e-07, "loss": 0.1274, "step": 24370 }, { "epoch": 0.5684474366821136, "grad_norm": 1.1084659099578857, "learning_rate": 8.105919293664117e-07, "loss": 0.1246, "step": 24380 }, { "epoch": 0.5686805980589316, "grad_norm": 1.4111989736557007, "learning_rate": 8.10514207548343e-07, "loss": 0.118, "step": 24390 }, { "epoch": 0.5689137594357495, "grad_norm": 1.6389182806015015, "learning_rate": 8.104364857302741e-07, "loss": 0.1308, "step": 24400 }, { "epoch": 0.5691469208125673, "grad_norm": 1.3611245155334473, "learning_rate": 8.103587639122054e-07, "loss": 0.1267, "step": 24410 }, { "epoch": 0.5693800821893853, "grad_norm": 1.2189968824386597, "learning_rate": 8.102810420941366e-07, "loss": 0.1228, "step": 24420 }, { "epoch": 0.5696132435662032, "grad_norm": 1.734165906906128, "learning_rate": 8.102033202760679e-07, "loss": 0.1251, "step": 24430 }, { "epoch": 0.5698464049430212, "grad_norm": 2.582963228225708, "learning_rate": 8.101255984579991e-07, "loss": 0.1298, "step": 24440 }, { "epoch": 0.5700795663198391, "grad_norm": 1.2448731660842896, "learning_rate": 
8.100478766399303e-07, "loss": 0.1189, "step": 24450 }, { "epoch": 0.5703127276966571, "grad_norm": 1.4114547967910767, "learning_rate": 8.099701548218616e-07, "loss": 0.1278, "step": 24460 }, { "epoch": 0.570545889073475, "grad_norm": 2.8974084854125977, "learning_rate": 8.098924330037928e-07, "loss": 0.1322, "step": 24470 }, { "epoch": 0.5707790504502929, "grad_norm": 2.012505531311035, "learning_rate": 8.09814711185724e-07, "loss": 0.1239, "step": 24480 }, { "epoch": 0.5710122118271108, "grad_norm": 2.9342072010040283, "learning_rate": 8.097369893676553e-07, "loss": 0.129, "step": 24490 }, { "epoch": 0.5712453732039288, "grad_norm": 1.8834666013717651, "learning_rate": 8.096592675495864e-07, "loss": 0.1162, "step": 24500 }, { "epoch": 0.5714785345807467, "grad_norm": 2.3787574768066406, "learning_rate": 8.095815457315177e-07, "loss": 0.1212, "step": 24510 }, { "epoch": 0.5717116959575647, "grad_norm": 1.3290812969207764, "learning_rate": 8.095038239134489e-07, "loss": 0.1329, "step": 24520 }, { "epoch": 0.5719448573343826, "grad_norm": 1.2225595712661743, "learning_rate": 8.094261020953802e-07, "loss": 0.1251, "step": 24530 }, { "epoch": 0.5721780187112004, "grad_norm": 2.3560314178466797, "learning_rate": 8.093483802773115e-07, "loss": 0.1288, "step": 24540 }, { "epoch": 0.5724111800880184, "grad_norm": 1.3389390707015991, "learning_rate": 8.092706584592427e-07, "loss": 0.1323, "step": 24550 }, { "epoch": 0.5726443414648363, "grad_norm": 3.1883952617645264, "learning_rate": 8.091929366411738e-07, "loss": 0.1326, "step": 24560 }, { "epoch": 0.5728775028416543, "grad_norm": 1.796858310699463, "learning_rate": 8.09115214823105e-07, "loss": 0.13, "step": 24570 }, { "epoch": 0.5731106642184722, "grad_norm": 1.3316973447799683, "learning_rate": 8.090374930050363e-07, "loss": 0.1355, "step": 24580 }, { "epoch": 0.5733438255952902, "grad_norm": 2.262301445007324, "learning_rate": 8.089597711869676e-07, "loss": 0.1189, "step": 24590 }, { "epoch": 0.573576986972108, 
"grad_norm": 2.5937416553497314, "learning_rate": 8.088820493688988e-07, "loss": 0.1282, "step": 24600 }, { "epoch": 0.573810148348926, "grad_norm": 3.472034454345703, "learning_rate": 8.088043275508301e-07, "loss": 0.1224, "step": 24610 }, { "epoch": 0.5740433097257439, "grad_norm": 2.2722132205963135, "learning_rate": 8.087266057327613e-07, "loss": 0.118, "step": 24620 }, { "epoch": 0.5742764711025619, "grad_norm": 1.4830944538116455, "learning_rate": 8.086488839146925e-07, "loss": 0.1155, "step": 24630 }, { "epoch": 0.5745096324793798, "grad_norm": 4.4480977058410645, "learning_rate": 8.085711620966237e-07, "loss": 0.1243, "step": 24640 }, { "epoch": 0.5747427938561978, "grad_norm": 1.2353365421295166, "learning_rate": 8.084934402785549e-07, "loss": 0.1111, "step": 24650 }, { "epoch": 0.5749759552330157, "grad_norm": 1.940393090248108, "learning_rate": 8.084157184604862e-07, "loss": 0.1139, "step": 24660 }, { "epoch": 0.5752091166098335, "grad_norm": 3.007514238357544, "learning_rate": 8.083379966424175e-07, "loss": 0.1161, "step": 24670 }, { "epoch": 0.5754422779866515, "grad_norm": 1.6098415851593018, "learning_rate": 8.082602748243487e-07, "loss": 0.1398, "step": 24680 }, { "epoch": 0.5756754393634694, "grad_norm": 1.694928765296936, "learning_rate": 8.081903251880867e-07, "loss": 0.1381, "step": 24690 }, { "epoch": 0.5759086007402874, "grad_norm": 5.53736686706543, "learning_rate": 8.08112603370018e-07, "loss": 0.1207, "step": 24700 }, { "epoch": 0.5761417621171053, "grad_norm": 1.5379555225372314, "learning_rate": 8.080348815519493e-07, "loss": 0.1231, "step": 24710 }, { "epoch": 0.5763749234939233, "grad_norm": 2.129347562789917, "learning_rate": 8.079571597338804e-07, "loss": 0.1262, "step": 24720 }, { "epoch": 0.5766080848707411, "grad_norm": 2.0987284183502197, "learning_rate": 8.078794379158117e-07, "loss": 0.1326, "step": 24730 }, { "epoch": 0.5768412462475591, "grad_norm": 1.6175793409347534, "learning_rate": 8.078017160977429e-07, "loss": 0.1239, 
"step": 24740 }, { "epoch": 0.577074407624377, "grad_norm": 1.6865267753601074, "learning_rate": 8.077239942796742e-07, "loss": 0.1392, "step": 24750 }, { "epoch": 0.577307569001195, "grad_norm": 1.7762188911437988, "learning_rate": 8.076462724616055e-07, "loss": 0.138, "step": 24760 }, { "epoch": 0.5775407303780129, "grad_norm": 1.9401910305023193, "learning_rate": 8.075685506435366e-07, "loss": 0.1249, "step": 24770 }, { "epoch": 0.5777738917548308, "grad_norm": 1.6038011312484741, "learning_rate": 8.074908288254678e-07, "loss": 0.1216, "step": 24780 }, { "epoch": 0.5780070531316488, "grad_norm": 1.2965031862258911, "learning_rate": 8.07413107007399e-07, "loss": 0.129, "step": 24790 }, { "epoch": 0.5782402145084666, "grad_norm": 1.6734857559204102, "learning_rate": 8.073353851893303e-07, "loss": 0.1172, "step": 24800 }, { "epoch": 0.5784733758852846, "grad_norm": 1.6323140859603882, "learning_rate": 8.072576633712616e-07, "loss": 0.1283, "step": 24810 }, { "epoch": 0.5787065372621025, "grad_norm": 1.580662488937378, "learning_rate": 8.071799415531928e-07, "loss": 0.1341, "step": 24820 }, { "epoch": 0.5789396986389205, "grad_norm": 3.345900774002075, "learning_rate": 8.071022197351241e-07, "loss": 0.1279, "step": 24830 }, { "epoch": 0.5791728600157384, "grad_norm": 1.3400720357894897, "learning_rate": 8.070244979170552e-07, "loss": 0.1353, "step": 24840 }, { "epoch": 0.5794060213925564, "grad_norm": 1.3199633359909058, "learning_rate": 8.069467760989864e-07, "loss": 0.1232, "step": 24850 }, { "epoch": 0.5796391827693742, "grad_norm": 1.7488160133361816, "learning_rate": 8.068690542809177e-07, "loss": 0.1228, "step": 24860 }, { "epoch": 0.5798723441461922, "grad_norm": 5.772220134735107, "learning_rate": 8.067913324628489e-07, "loss": 0.1188, "step": 24870 }, { "epoch": 0.5801055055230101, "grad_norm": 1.4352964162826538, "learning_rate": 8.067136106447802e-07, "loss": 0.1234, "step": 24880 }, { "epoch": 0.580338666899828, "grad_norm": 1.3338398933410645, 
"learning_rate": 8.066358888267115e-07, "loss": 0.1281, "step": 24890 }, { "epoch": 0.580571828276646, "grad_norm": 3.654017210006714, "learning_rate": 8.065581670086426e-07, "loss": 0.1238, "step": 24900 }, { "epoch": 0.5808049896534639, "grad_norm": 1.573182225227356, "learning_rate": 8.064804451905739e-07, "loss": 0.1101, "step": 24910 }, { "epoch": 0.5810381510302818, "grad_norm": 1.4270505905151367, "learning_rate": 8.064027233725051e-07, "loss": 0.1221, "step": 24920 }, { "epoch": 0.5812713124070997, "grad_norm": 1.4365227222442627, "learning_rate": 8.063250015544363e-07, "loss": 0.1227, "step": 24930 }, { "epoch": 0.5815044737839177, "grad_norm": 4.8236236572265625, "learning_rate": 8.062472797363675e-07, "loss": 0.1296, "step": 24940 }, { "epoch": 0.5817376351607356, "grad_norm": 2.2621910572052, "learning_rate": 8.061695579182988e-07, "loss": 0.1334, "step": 24950 }, { "epoch": 0.5819707965375536, "grad_norm": 2.484020471572876, "learning_rate": 8.0609183610023e-07, "loss": 0.1317, "step": 24960 }, { "epoch": 0.5822039579143715, "grad_norm": 1.3711472749710083, "learning_rate": 8.060141142821612e-07, "loss": 0.1257, "step": 24970 }, { "epoch": 0.5824371192911895, "grad_norm": 1.3176003694534302, "learning_rate": 8.059363924640925e-07, "loss": 0.1024, "step": 24980 }, { "epoch": 0.5826702806680073, "grad_norm": 2.2733373641967773, "learning_rate": 8.058586706460238e-07, "loss": 0.1282, "step": 24990 }, { "epoch": 0.5829034420448252, "grad_norm": 2.4825804233551025, "learning_rate": 8.05780948827955e-07, "loss": 0.1226, "step": 25000 }, { "epoch": 0.5831366034216432, "grad_norm": 1.5205929279327393, "learning_rate": 8.057032270098862e-07, "loss": 0.1411, "step": 25010 }, { "epoch": 0.5833697647984611, "grad_norm": 1.5840017795562744, "learning_rate": 8.056255051918173e-07, "loss": 0.1411, "step": 25020 }, { "epoch": 0.5836029261752791, "grad_norm": 1.7537071704864502, "learning_rate": 8.055477833737486e-07, "loss": 0.1152, "step": 25030 }, { "epoch": 
0.583836087552097, "grad_norm": 1.7435808181762695, "learning_rate": 8.054700615556799e-07, "loss": 0.1364, "step": 25040 }, { "epoch": 0.5840692489289149, "grad_norm": 3.945981025695801, "learning_rate": 8.053923397376111e-07, "loss": 0.1284, "step": 25050 }, { "epoch": 0.5843024103057328, "grad_norm": 2.9259140491485596, "learning_rate": 8.053146179195424e-07, "loss": 0.1462, "step": 25060 }, { "epoch": 0.5845355716825508, "grad_norm": 4.813845157623291, "learning_rate": 8.052368961014736e-07, "loss": 0.118, "step": 25070 }, { "epoch": 0.5847687330593687, "grad_norm": 1.5247938632965088, "learning_rate": 8.051591742834049e-07, "loss": 0.1333, "step": 25080 }, { "epoch": 0.5850018944361867, "grad_norm": 1.2216640710830688, "learning_rate": 8.05081452465336e-07, "loss": 0.1343, "step": 25090 }, { "epoch": 0.5852350558130046, "grad_norm": 1.6076884269714355, "learning_rate": 8.050037306472672e-07, "loss": 0.1242, "step": 25100 }, { "epoch": 0.5854682171898226, "grad_norm": 1.2805315256118774, "learning_rate": 8.049260088291985e-07, "loss": 0.1343, "step": 25110 }, { "epoch": 0.5857013785666404, "grad_norm": 1.3577454090118408, "learning_rate": 8.048482870111297e-07, "loss": 0.1176, "step": 25120 }, { "epoch": 0.5859345399434583, "grad_norm": 1.4667491912841797, "learning_rate": 8.04770565193061e-07, "loss": 0.1214, "step": 25130 }, { "epoch": 0.5861677013202763, "grad_norm": 1.3334968090057373, "learning_rate": 8.046928433749923e-07, "loss": 0.1217, "step": 25140 }, { "epoch": 0.5864008626970942, "grad_norm": 2.685756206512451, "learning_rate": 8.046151215569234e-07, "loss": 0.1199, "step": 25150 }, { "epoch": 0.5866340240739122, "grad_norm": 1.9251933097839355, "learning_rate": 8.045373997388547e-07, "loss": 0.1442, "step": 25160 }, { "epoch": 0.5868671854507301, "grad_norm": 1.8932517766952515, "learning_rate": 8.044596779207858e-07, "loss": 0.1187, "step": 25170 }, { "epoch": 0.587100346827548, "grad_norm": 1.587549090385437, "learning_rate": 
8.043819561027171e-07, "loss": 0.1195, "step": 25180 }, { "epoch": 0.5873335082043659, "grad_norm": 1.9740593433380127, "learning_rate": 8.043042342846484e-07, "loss": 0.1122, "step": 25190 }, { "epoch": 0.5875666695811839, "grad_norm": 1.194743275642395, "learning_rate": 8.042265124665796e-07, "loss": 0.1157, "step": 25200 }, { "epoch": 0.5877998309580018, "grad_norm": 2.509416103363037, "learning_rate": 8.041487906485108e-07, "loss": 0.1421, "step": 25210 }, { "epoch": 0.5880329923348198, "grad_norm": 1.18466317653656, "learning_rate": 8.04071068830442e-07, "loss": 0.1168, "step": 25220 }, { "epoch": 0.5882661537116377, "grad_norm": 1.486082673072815, "learning_rate": 8.039933470123733e-07, "loss": 0.1326, "step": 25230 }, { "epoch": 0.5884993150884555, "grad_norm": 1.2464066743850708, "learning_rate": 8.039156251943046e-07, "loss": 0.1252, "step": 25240 }, { "epoch": 0.5887324764652735, "grad_norm": 1.30666184425354, "learning_rate": 8.038379033762357e-07, "loss": 0.1232, "step": 25250 }, { "epoch": 0.5889656378420914, "grad_norm": 2.1410951614379883, "learning_rate": 8.03760181558167e-07, "loss": 0.1259, "step": 25260 }, { "epoch": 0.5891987992189094, "grad_norm": 1.1784127950668335, "learning_rate": 8.036824597400981e-07, "loss": 0.1263, "step": 25270 }, { "epoch": 0.5894319605957273, "grad_norm": 1.6214590072631836, "learning_rate": 8.036047379220294e-07, "loss": 0.1315, "step": 25280 }, { "epoch": 0.5896651219725453, "grad_norm": 1.8392211198806763, "learning_rate": 8.035270161039607e-07, "loss": 0.1226, "step": 25290 }, { "epoch": 0.5898982833493632, "grad_norm": 2.059088706970215, "learning_rate": 8.034492942858919e-07, "loss": 0.1319, "step": 25300 }, { "epoch": 0.5901314447261811, "grad_norm": 1.8027740716934204, "learning_rate": 8.033715724678232e-07, "loss": 0.1322, "step": 25310 }, { "epoch": 0.590364606102999, "grad_norm": 2.57594895362854, "learning_rate": 8.032938506497545e-07, "loss": 0.1267, "step": 25320 }, { "epoch": 0.590597767479817, 
"grad_norm": 2.2850232124328613, "learning_rate": 8.032161288316855e-07, "loss": 0.144, "step": 25330 }, { "epoch": 0.5908309288566349, "grad_norm": 1.2680010795593262, "learning_rate": 8.031384070136168e-07, "loss": 0.1165, "step": 25340 }, { "epoch": 0.5910640902334529, "grad_norm": 1.231418251991272, "learning_rate": 8.03060685195548e-07, "loss": 0.1129, "step": 25350 }, { "epoch": 0.5912972516102708, "grad_norm": 2.8512990474700928, "learning_rate": 8.029829633774793e-07, "loss": 0.1245, "step": 25360 }, { "epoch": 0.5915304129870886, "grad_norm": 2.201061725616455, "learning_rate": 8.029052415594106e-07, "loss": 0.1089, "step": 25370 }, { "epoch": 0.5917635743639066, "grad_norm": 2.35178279876709, "learning_rate": 8.028275197413418e-07, "loss": 0.1272, "step": 25380 }, { "epoch": 0.5919967357407245, "grad_norm": 1.2560808658599854, "learning_rate": 8.02749797923273e-07, "loss": 0.1317, "step": 25390 }, { "epoch": 0.5922298971175425, "grad_norm": 4.440523624420166, "learning_rate": 8.026720761052042e-07, "loss": 0.1151, "step": 25400 }, { "epoch": 0.5924630584943604, "grad_norm": 1.3921915292739868, "learning_rate": 8.025943542871354e-07, "loss": 0.1047, "step": 25410 }, { "epoch": 0.5926962198711784, "grad_norm": 2.142484188079834, "learning_rate": 8.025166324690667e-07, "loss": 0.127, "step": 25420 }, { "epoch": 0.5929293812479963, "grad_norm": 2.0327188968658447, "learning_rate": 8.024389106509979e-07, "loss": 0.1194, "step": 25430 }, { "epoch": 0.5931625426248142, "grad_norm": 2.712616443634033, "learning_rate": 8.023611888329292e-07, "loss": 0.1308, "step": 25440 }, { "epoch": 0.5933957040016321, "grad_norm": 3.7305991649627686, "learning_rate": 8.022834670148603e-07, "loss": 0.1238, "step": 25450 }, { "epoch": 0.59362886537845, "grad_norm": 2.2224557399749756, "learning_rate": 8.022057451967916e-07, "loss": 0.124, "step": 25460 }, { "epoch": 0.593862026755268, "grad_norm": 1.283910870552063, "learning_rate": 8.021280233787229e-07, "loss": 0.1258, "step": 
25470 }, { "epoch": 0.594095188132086, "grad_norm": 2.831679582595825, "learning_rate": 8.020503015606541e-07, "loss": 0.1211, "step": 25480 }, { "epoch": 0.5943283495089039, "grad_norm": 2.0013790130615234, "learning_rate": 8.019725797425853e-07, "loss": 0.1058, "step": 25490 }, { "epoch": 0.5945615108857217, "grad_norm": 1.5507228374481201, "learning_rate": 8.018948579245165e-07, "loss": 0.1295, "step": 25500 }, { "epoch": 0.5947946722625397, "grad_norm": 1.8570657968521118, "learning_rate": 8.018171361064477e-07, "loss": 0.1357, "step": 25510 }, { "epoch": 0.5950278336393576, "grad_norm": 1.1730797290802002, "learning_rate": 8.01739414288379e-07, "loss": 0.1183, "step": 25520 }, { "epoch": 0.5952609950161756, "grad_norm": 1.3180718421936035, "learning_rate": 8.016616924703102e-07, "loss": 0.1272, "step": 25530 }, { "epoch": 0.5954941563929935, "grad_norm": 3.0467841625213623, "learning_rate": 8.015839706522415e-07, "loss": 0.1211, "step": 25540 }, { "epoch": 0.5957273177698115, "grad_norm": 3.9470412731170654, "learning_rate": 8.015062488341727e-07, "loss": 0.1246, "step": 25550 }, { "epoch": 0.5959604791466293, "grad_norm": 3.2242138385772705, "learning_rate": 8.01428527016104e-07, "loss": 0.1262, "step": 25560 }, { "epoch": 0.5961936405234473, "grad_norm": 1.2517069578170776, "learning_rate": 8.013508051980352e-07, "loss": 0.1261, "step": 25570 }, { "epoch": 0.5964268019002652, "grad_norm": 3.419281482696533, "learning_rate": 8.012730833799663e-07, "loss": 0.1333, "step": 25580 }, { "epoch": 0.5966599632770831, "grad_norm": 1.5557795763015747, "learning_rate": 8.011953615618976e-07, "loss": 0.1204, "step": 25590 }, { "epoch": 0.5968931246539011, "grad_norm": 1.1273112297058105, "learning_rate": 8.011176397438288e-07, "loss": 0.1198, "step": 25600 }, { "epoch": 0.597126286030719, "grad_norm": 1.7264554500579834, "learning_rate": 8.010399179257601e-07, "loss": 0.1255, "step": 25610 }, { "epoch": 0.597359447407537, "grad_norm": 1.3597939014434814, 
"learning_rate": 8.009621961076914e-07, "loss": 0.1209, "step": 25620 }, { "epoch": 0.5975926087843548, "grad_norm": 1.849521279335022, "learning_rate": 8.008844742896226e-07, "loss": 0.1139, "step": 25630 }, { "epoch": 0.5978257701611728, "grad_norm": 1.8582417964935303, "learning_rate": 8.008067524715538e-07, "loss": 0.1339, "step": 25640 }, { "epoch": 0.5980589315379907, "grad_norm": 1.7306150197982788, "learning_rate": 8.007290306534849e-07, "loss": 0.125, "step": 25650 }, { "epoch": 0.5982920929148087, "grad_norm": 1.9741427898406982, "learning_rate": 8.006513088354162e-07, "loss": 0.1256, "step": 25660 }, { "epoch": 0.5985252542916266, "grad_norm": 1.3288390636444092, "learning_rate": 8.005735870173475e-07, "loss": 0.1313, "step": 25670 }, { "epoch": 0.5987584156684446, "grad_norm": 2.41627836227417, "learning_rate": 8.004958651992787e-07, "loss": 0.1201, "step": 25680 }, { "epoch": 0.5989915770452624, "grad_norm": 2.2731544971466064, "learning_rate": 8.0041814338121e-07, "loss": 0.1232, "step": 25690 }, { "epoch": 0.5992247384220803, "grad_norm": 2.256793975830078, "learning_rate": 8.003404215631411e-07, "loss": 0.1207, "step": 25700 }, { "epoch": 0.5994578997988983, "grad_norm": 1.4385589361190796, "learning_rate": 8.002626997450724e-07, "loss": 0.1191, "step": 25710 }, { "epoch": 0.5996910611757162, "grad_norm": 1.7896486520767212, "learning_rate": 8.001849779270037e-07, "loss": 0.1262, "step": 25720 }, { "epoch": 0.5999242225525342, "grad_norm": 4.890050888061523, "learning_rate": 8.001072561089348e-07, "loss": 0.1204, "step": 25730 }, { "epoch": 0.6001573839293521, "grad_norm": 1.3155330419540405, "learning_rate": 8.000295342908661e-07, "loss": 0.1202, "step": 25740 }, { "epoch": 0.6003905453061701, "grad_norm": 1.2993651628494263, "learning_rate": 7.999518124727973e-07, "loss": 0.1347, "step": 25750 }, { "epoch": 0.6006237066829879, "grad_norm": 1.584138035774231, "learning_rate": 7.998740906547285e-07, "loss": 0.1362, "step": 25760 }, { "epoch": 
0.6008568680598059, "grad_norm": 3.7769017219543457, "learning_rate": 7.997963688366598e-07, "loss": 0.1246, "step": 25770 }, { "epoch": 0.6010900294366238, "grad_norm": 2.100580930709839, "learning_rate": 7.99718647018591e-07, "loss": 0.1189, "step": 25780 }, { "epoch": 0.6013231908134418, "grad_norm": 1.3373544216156006, "learning_rate": 7.996409252005223e-07, "loss": 0.117, "step": 25790 }, { "epoch": 0.6015563521902597, "grad_norm": 2.709930419921875, "learning_rate": 7.995632033824536e-07, "loss": 0.1104, "step": 25800 }, { "epoch": 0.6017895135670777, "grad_norm": 2.7267048358917236, "learning_rate": 7.994854815643847e-07, "loss": 0.1224, "step": 25810 }, { "epoch": 0.6020226749438955, "grad_norm": 2.3442656993865967, "learning_rate": 7.994077597463159e-07, "loss": 0.1237, "step": 25820 }, { "epoch": 0.6022558363207134, "grad_norm": 1.5114539861679077, "learning_rate": 7.993300379282471e-07, "loss": 0.1293, "step": 25830 }, { "epoch": 0.6024889976975314, "grad_norm": 1.5649765729904175, "learning_rate": 7.992523161101784e-07, "loss": 0.1298, "step": 25840 }, { "epoch": 0.6027221590743493, "grad_norm": 1.8269916772842407, "learning_rate": 7.991745942921097e-07, "loss": 0.1238, "step": 25850 }, { "epoch": 0.6029553204511673, "grad_norm": 2.9469687938690186, "learning_rate": 7.990968724740409e-07, "loss": 0.1149, "step": 25860 }, { "epoch": 0.6031884818279852, "grad_norm": 1.413743257522583, "learning_rate": 7.990191506559722e-07, "loss": 0.1165, "step": 25870 }, { "epoch": 0.6034216432048031, "grad_norm": 1.9009265899658203, "learning_rate": 7.989414288379034e-07, "loss": 0.1153, "step": 25880 }, { "epoch": 0.603654804581621, "grad_norm": 1.5853347778320312, "learning_rate": 7.988637070198345e-07, "loss": 0.119, "step": 25890 }, { "epoch": 0.603887965958439, "grad_norm": 1.7229321002960205, "learning_rate": 7.987859852017658e-07, "loss": 0.1213, "step": 25900 }, { "epoch": 0.6041211273352569, "grad_norm": 1.8429988622665405, "learning_rate": 
7.98708263383697e-07, "loss": 0.1248, "step": 25910 }, { "epoch": 0.6043542887120749, "grad_norm": 1.7174123525619507, "learning_rate": 7.986305415656283e-07, "loss": 0.1287, "step": 25920 }, { "epoch": 0.6045874500888928, "grad_norm": 1.4811006784439087, "learning_rate": 7.985528197475595e-07, "loss": 0.1212, "step": 25930 }, { "epoch": 0.6048206114657108, "grad_norm": 1.4917747974395752, "learning_rate": 7.984750979294908e-07, "loss": 0.1318, "step": 25940 }, { "epoch": 0.6050537728425286, "grad_norm": 1.0249834060668945, "learning_rate": 7.98397376111422e-07, "loss": 0.1261, "step": 25950 }, { "epoch": 0.6052869342193465, "grad_norm": 2.5457870960235596, "learning_rate": 7.983196542933532e-07, "loss": 0.1376, "step": 25960 }, { "epoch": 0.6055200955961645, "grad_norm": 1.6103099584579468, "learning_rate": 7.982419324752844e-07, "loss": 0.1149, "step": 25970 }, { "epoch": 0.6057532569729824, "grad_norm": 1.3794220685958862, "learning_rate": 7.981642106572156e-07, "loss": 0.1271, "step": 25980 }, { "epoch": 0.6059864183498004, "grad_norm": 1.5124164819717407, "learning_rate": 7.980864888391469e-07, "loss": 0.1278, "step": 25990 }, { "epoch": 0.6062195797266183, "grad_norm": 1.3668264150619507, "learning_rate": 7.980087670210782e-07, "loss": 0.1361, "step": 26000 }, { "epoch": 0.6064527411034362, "grad_norm": 2.2840781211853027, "learning_rate": 7.979310452030093e-07, "loss": 0.1246, "step": 26010 }, { "epoch": 0.6066859024802541, "grad_norm": 2.02424955368042, "learning_rate": 7.978533233849406e-07, "loss": 0.1296, "step": 26020 }, { "epoch": 0.6069190638570721, "grad_norm": 2.7550430297851562, "learning_rate": 7.977756015668718e-07, "loss": 0.1141, "step": 26030 }, { "epoch": 0.60715222523389, "grad_norm": 1.4864873886108398, "learning_rate": 7.976978797488031e-07, "loss": 0.1262, "step": 26040 }, { "epoch": 0.607385386610708, "grad_norm": 1.1679298877716064, "learning_rate": 7.976201579307343e-07, "loss": 0.112, "step": 26050 }, { "epoch": 0.6076185479875259, 
"grad_norm": 1.2398097515106201, "learning_rate": 7.975424361126655e-07, "loss": 0.1152, "step": 26060 }, { "epoch": 0.6078517093643439, "grad_norm": 1.1966826915740967, "learning_rate": 7.974647142945967e-07, "loss": 0.1186, "step": 26070 }, { "epoch": 0.6080848707411617, "grad_norm": 1.949553370475769, "learning_rate": 7.973869924765279e-07, "loss": 0.1237, "step": 26080 }, { "epoch": 0.6083180321179796, "grad_norm": 2.9169070720672607, "learning_rate": 7.973092706584592e-07, "loss": 0.1387, "step": 26090 }, { "epoch": 0.6085511934947976, "grad_norm": 1.0991714000701904, "learning_rate": 7.972315488403905e-07, "loss": 0.1121, "step": 26100 }, { "epoch": 0.6087843548716155, "grad_norm": 1.474183201789856, "learning_rate": 7.971538270223217e-07, "loss": 0.145, "step": 26110 }, { "epoch": 0.6090175162484335, "grad_norm": 1.3576130867004395, "learning_rate": 7.97076105204253e-07, "loss": 0.1259, "step": 26120 }, { "epoch": 0.6092506776252514, "grad_norm": 1.622771143913269, "learning_rate": 7.96998383386184e-07, "loss": 0.1283, "step": 26130 }, { "epoch": 0.6094838390020693, "grad_norm": 1.7288559675216675, "learning_rate": 7.969206615681153e-07, "loss": 0.1198, "step": 26140 }, { "epoch": 0.6097170003788872, "grad_norm": 1.4490046501159668, "learning_rate": 7.968429397500466e-07, "loss": 0.1274, "step": 26150 }, { "epoch": 0.6099501617557052, "grad_norm": 4.674640655517578, "learning_rate": 7.967652179319778e-07, "loss": 0.1316, "step": 26160 }, { "epoch": 0.6101833231325231, "grad_norm": 1.2869839668273926, "learning_rate": 7.966874961139091e-07, "loss": 0.1201, "step": 26170 }, { "epoch": 0.610416484509341, "grad_norm": 1.3277090787887573, "learning_rate": 7.966097742958404e-07, "loss": 0.1276, "step": 26180 }, { "epoch": 0.610649645886159, "grad_norm": 1.3916937112808228, "learning_rate": 7.965320524777715e-07, "loss": 0.1081, "step": 26190 }, { "epoch": 0.6108828072629768, "grad_norm": 1.2938567399978638, "learning_rate": 7.964543306597028e-07, "loss": 0.1229, 
"step": 26200 }, { "epoch": 0.6111159686397948, "grad_norm": 1.22265625, "learning_rate": 7.963766088416339e-07, "loss": 0.129, "step": 26210 }, { "epoch": 0.6113491300166127, "grad_norm": 1.3707562685012817, "learning_rate": 7.962988870235652e-07, "loss": 0.1222, "step": 26220 }, { "epoch": 0.6115822913934307, "grad_norm": 3.768594741821289, "learning_rate": 7.962211652054964e-07, "loss": 0.1153, "step": 26230 }, { "epoch": 0.6118154527702486, "grad_norm": 1.2280765771865845, "learning_rate": 7.961434433874277e-07, "loss": 0.1128, "step": 26240 }, { "epoch": 0.6120486141470666, "grad_norm": 2.279892921447754, "learning_rate": 7.960657215693589e-07, "loss": 0.1275, "step": 26250 }, { "epoch": 0.6122817755238845, "grad_norm": 2.5080668926239014, "learning_rate": 7.959879997512901e-07, "loss": 0.1337, "step": 26260 }, { "epoch": 0.6125149369007024, "grad_norm": 1.1000021696090698, "learning_rate": 7.959102779332214e-07, "loss": 0.124, "step": 26270 }, { "epoch": 0.6127480982775203, "grad_norm": 3.885026454925537, "learning_rate": 7.958325561151527e-07, "loss": 0.1439, "step": 26280 }, { "epoch": 0.6129812596543383, "grad_norm": 1.3604766130447388, "learning_rate": 7.957548342970838e-07, "loss": 0.1182, "step": 26290 }, { "epoch": 0.6132144210311562, "grad_norm": 1.384729266166687, "learning_rate": 7.956771124790151e-07, "loss": 0.1278, "step": 26300 }, { "epoch": 0.6134475824079741, "grad_norm": 2.3799517154693604, "learning_rate": 7.955993906609462e-07, "loss": 0.1181, "step": 26310 }, { "epoch": 0.6136807437847921, "grad_norm": 2.2571136951446533, "learning_rate": 7.955216688428775e-07, "loss": 0.1269, "step": 26320 }, { "epoch": 0.6139139051616099, "grad_norm": 2.466602325439453, "learning_rate": 7.954439470248088e-07, "loss": 0.1293, "step": 26330 }, { "epoch": 0.6141470665384279, "grad_norm": 1.3261935710906982, "learning_rate": 7.9536622520674e-07, "loss": 0.1304, "step": 26340 }, { "epoch": 0.6143802279152458, "grad_norm": 2.463655948638916, "learning_rate": 
7.952885033886713e-07, "loss": 0.1289, "step": 26350 }, { "epoch": 0.6146133892920638, "grad_norm": 1.6265987157821655, "learning_rate": 7.952107815706025e-07, "loss": 0.1299, "step": 26360 }, { "epoch": 0.6148465506688817, "grad_norm": 1.7686642408370972, "learning_rate": 7.951330597525336e-07, "loss": 0.1165, "step": 26370 }, { "epoch": 0.6150797120456997, "grad_norm": 1.6362911462783813, "learning_rate": 7.950553379344649e-07, "loss": 0.1358, "step": 26380 }, { "epoch": 0.6153128734225176, "grad_norm": 2.470221996307373, "learning_rate": 7.949776161163961e-07, "loss": 0.1201, "step": 26390 }, { "epoch": 0.6155460347993355, "grad_norm": 1.3080908060073853, "learning_rate": 7.948998942983274e-07, "loss": 0.1189, "step": 26400 }, { "epoch": 0.6157791961761534, "grad_norm": 1.551192283630371, "learning_rate": 7.948221724802586e-07, "loss": 0.133, "step": 26410 }, { "epoch": 0.6160123575529713, "grad_norm": 1.1808000802993774, "learning_rate": 7.947444506621899e-07, "loss": 0.1272, "step": 26420 }, { "epoch": 0.6162455189297893, "grad_norm": 2.2378628253936768, "learning_rate": 7.946667288441212e-07, "loss": 0.1253, "step": 26430 }, { "epoch": 0.6164786803066072, "grad_norm": 3.4565322399139404, "learning_rate": 7.945890070260523e-07, "loss": 0.1376, "step": 26440 }, { "epoch": 0.6167118416834252, "grad_norm": 3.4843974113464355, "learning_rate": 7.945112852079835e-07, "loss": 0.1166, "step": 26450 }, { "epoch": 0.616945003060243, "grad_norm": 1.0198945999145508, "learning_rate": 7.944335633899147e-07, "loss": 0.1144, "step": 26460 }, { "epoch": 0.617178164437061, "grad_norm": 1.2326117753982544, "learning_rate": 7.94355841571846e-07, "loss": 0.1353, "step": 26470 }, { "epoch": 0.6174113258138789, "grad_norm": 2.3446221351623535, "learning_rate": 7.942781197537773e-07, "loss": 0.1306, "step": 26480 }, { "epoch": 0.6176444871906969, "grad_norm": 2.194976568222046, "learning_rate": 7.942003979357085e-07, "loss": 0.1213, "step": 26490 }, { "epoch": 0.6178776485675148, 
"grad_norm": 1.403739333152771, "learning_rate": 7.941226761176397e-07, "loss": 0.1179, "step": 26500 }, { "epoch": 0.6181108099443328, "grad_norm": 1.2369598150253296, "learning_rate": 7.940449542995709e-07, "loss": 0.13, "step": 26510 }, { "epoch": 0.6183439713211506, "grad_norm": 2.6309926509857178, "learning_rate": 7.939672324815022e-07, "loss": 0.1109, "step": 26520 }, { "epoch": 0.6185771326979685, "grad_norm": 1.2351014614105225, "learning_rate": 7.938895106634334e-07, "loss": 0.1184, "step": 26530 }, { "epoch": 0.6188102940747865, "grad_norm": 1.8269994258880615, "learning_rate": 7.938117888453646e-07, "loss": 0.1207, "step": 26540 }, { "epoch": 0.6190434554516044, "grad_norm": 1.3350824117660522, "learning_rate": 7.937340670272959e-07, "loss": 0.1245, "step": 26550 }, { "epoch": 0.6192766168284224, "grad_norm": 1.7794219255447388, "learning_rate": 7.93656345209227e-07, "loss": 0.1276, "step": 26560 }, { "epoch": 0.6195097782052403, "grad_norm": 1.498177409172058, "learning_rate": 7.935786233911583e-07, "loss": 0.1339, "step": 26570 }, { "epoch": 0.6197429395820583, "grad_norm": 1.5044739246368408, "learning_rate": 7.935009015730896e-07, "loss": 0.1138, "step": 26580 }, { "epoch": 0.6199761009588761, "grad_norm": 2.038696527481079, "learning_rate": 7.934231797550208e-07, "loss": 0.1265, "step": 26590 }, { "epoch": 0.6202092623356941, "grad_norm": 1.45698881149292, "learning_rate": 7.933454579369521e-07, "loss": 0.117, "step": 26600 }, { "epoch": 0.620442423712512, "grad_norm": 1.8586512804031372, "learning_rate": 7.932677361188832e-07, "loss": 0.1225, "step": 26610 }, { "epoch": 0.62067558508933, "grad_norm": 1.21579110622406, "learning_rate": 7.931900143008144e-07, "loss": 0.1311, "step": 26620 }, { "epoch": 0.6209087464661479, "grad_norm": 1.6697289943695068, "learning_rate": 7.931122924827457e-07, "loss": 0.1129, "step": 26630 }, { "epoch": 0.6211419078429659, "grad_norm": 1.3041770458221436, "learning_rate": 7.930345706646769e-07, "loss": 0.1271, 
"step": 26640 }, { "epoch": 0.6213750692197837, "grad_norm": 1.6563040018081665, "learning_rate": 7.929568488466082e-07, "loss": 0.1221, "step": 26650 }, { "epoch": 0.6216082305966016, "grad_norm": 1.1859077215194702, "learning_rate": 7.928791270285395e-07, "loss": 0.1238, "step": 26660 }, { "epoch": 0.6218413919734196, "grad_norm": 1.7566797733306885, "learning_rate": 7.928014052104707e-07, "loss": 0.1256, "step": 26670 }, { "epoch": 0.6220745533502375, "grad_norm": 4.603399753570557, "learning_rate": 7.927236833924019e-07, "loss": 0.1232, "step": 26680 }, { "epoch": 0.6223077147270555, "grad_norm": 3.04742431640625, "learning_rate": 7.9265373375614e-07, "loss": 0.1314, "step": 26690 }, { "epoch": 0.6225408761038734, "grad_norm": 1.5796265602111816, "learning_rate": 7.925760119380713e-07, "loss": 0.1259, "step": 26700 }, { "epoch": 0.6227740374806914, "grad_norm": 2.4521069526672363, "learning_rate": 7.924982901200024e-07, "loss": 0.1155, "step": 26710 }, { "epoch": 0.6230071988575092, "grad_norm": 1.6218605041503906, "learning_rate": 7.924205683019337e-07, "loss": 0.1245, "step": 26720 }, { "epoch": 0.6232403602343272, "grad_norm": 2.236147880554199, "learning_rate": 7.923428464838649e-07, "loss": 0.126, "step": 26730 }, { "epoch": 0.6234735216111451, "grad_norm": 1.6599960327148438, "learning_rate": 7.922651246657961e-07, "loss": 0.1239, "step": 26740 }, { "epoch": 0.6237066829879631, "grad_norm": 2.0112907886505127, "learning_rate": 7.921874028477274e-07, "loss": 0.1307, "step": 26750 }, { "epoch": 0.623939844364781, "grad_norm": 1.9279963970184326, "learning_rate": 7.921096810296586e-07, "loss": 0.1232, "step": 26760 }, { "epoch": 0.624173005741599, "grad_norm": 1.968522548675537, "learning_rate": 7.920319592115899e-07, "loss": 0.1201, "step": 26770 }, { "epoch": 0.6244061671184168, "grad_norm": 1.5206321477890015, "learning_rate": 7.91954237393521e-07, "loss": 0.1196, "step": 26780 }, { "epoch": 0.6246393284952347, "grad_norm": 1.4780375957489014, 
"learning_rate": 7.918765155754523e-07, "loss": 0.1111, "step": 26790 }, { "epoch": 0.6248724898720527, "grad_norm": 1.2035555839538574, "learning_rate": 7.917987937573836e-07, "loss": 0.1174, "step": 26800 }, { "epoch": 0.6251056512488706, "grad_norm": 1.7618657350540161, "learning_rate": 7.917210719393148e-07, "loss": 0.1325, "step": 26810 }, { "epoch": 0.6253388126256886, "grad_norm": 1.2949655055999756, "learning_rate": 7.91643350121246e-07, "loss": 0.1243, "step": 26820 }, { "epoch": 0.6255719740025065, "grad_norm": 1.4323463439941406, "learning_rate": 7.915656283031772e-07, "loss": 0.1222, "step": 26830 }, { "epoch": 0.6258051353793244, "grad_norm": 1.2126309871673584, "learning_rate": 7.914879064851084e-07, "loss": 0.1285, "step": 26840 }, { "epoch": 0.6260382967561423, "grad_norm": 1.364504098892212, "learning_rate": 7.914101846670397e-07, "loss": 0.1095, "step": 26850 }, { "epoch": 0.6262714581329603, "grad_norm": 2.946502447128296, "learning_rate": 7.913324628489709e-07, "loss": 0.1184, "step": 26860 }, { "epoch": 0.6265046195097782, "grad_norm": 2.9295144081115723, "learning_rate": 7.912547410309022e-07, "loss": 0.1388, "step": 26870 }, { "epoch": 0.6267377808865962, "grad_norm": 1.7637790441513062, "learning_rate": 7.911770192128334e-07, "loss": 0.1113, "step": 26880 }, { "epoch": 0.6269709422634141, "grad_norm": 1.1275426149368286, "learning_rate": 7.910992973947647e-07, "loss": 0.1244, "step": 26890 }, { "epoch": 0.627204103640232, "grad_norm": 2.1618175506591797, "learning_rate": 7.910215755766959e-07, "loss": 0.1129, "step": 26900 }, { "epoch": 0.6274372650170499, "grad_norm": 1.3500109910964966, "learning_rate": 7.90943853758627e-07, "loss": 0.1163, "step": 26910 }, { "epoch": 0.6276704263938678, "grad_norm": 3.783933162689209, "learning_rate": 7.908661319405583e-07, "loss": 0.1346, "step": 26920 }, { "epoch": 0.6279035877706858, "grad_norm": 3.459235191345215, "learning_rate": 7.907884101224895e-07, "loss": 0.1248, "step": 26930 }, { "epoch": 
0.6281367491475037, "grad_norm": 1.1476343870162964, "learning_rate": 7.907106883044208e-07, "loss": 0.1211, "step": 26940 }, { "epoch": 0.6283699105243217, "grad_norm": 3.909467935562134, "learning_rate": 7.906329664863521e-07, "loss": 0.1281, "step": 26950 }, { "epoch": 0.6286030719011396, "grad_norm": 1.3066517114639282, "learning_rate": 7.905552446682832e-07, "loss": 0.1369, "step": 26960 }, { "epoch": 0.6288362332779575, "grad_norm": 1.3454346656799316, "learning_rate": 7.904775228502145e-07, "loss": 0.12, "step": 26970 }, { "epoch": 0.6290693946547754, "grad_norm": 1.3705099821090698, "learning_rate": 7.903998010321458e-07, "loss": 0.132, "step": 26980 }, { "epoch": 0.6293025560315934, "grad_norm": 1.4486531019210815, "learning_rate": 7.903220792140769e-07, "loss": 0.1413, "step": 26990 }, { "epoch": 0.6295357174084113, "grad_norm": 4.59236478805542, "learning_rate": 7.902443573960082e-07, "loss": 0.121, "step": 27000 }, { "epoch": 0.6297688787852292, "grad_norm": 1.2790974378585815, "learning_rate": 7.901666355779394e-07, "loss": 0.1266, "step": 27010 }, { "epoch": 0.6300020401620472, "grad_norm": 1.9322606325149536, "learning_rate": 7.900889137598706e-07, "loss": 0.1223, "step": 27020 }, { "epoch": 0.6302352015388651, "grad_norm": 1.737695574760437, "learning_rate": 7.900111919418018e-07, "loss": 0.1368, "step": 27030 }, { "epoch": 0.630468362915683, "grad_norm": 1.0170356035232544, "learning_rate": 7.899334701237331e-07, "loss": 0.1242, "step": 27040 }, { "epoch": 0.6307015242925009, "grad_norm": 1.3896081447601318, "learning_rate": 7.898557483056644e-07, "loss": 0.1172, "step": 27050 }, { "epoch": 0.6309346856693189, "grad_norm": 2.177015542984009, "learning_rate": 7.897780264875956e-07, "loss": 0.13, "step": 27060 }, { "epoch": 0.6311678470461368, "grad_norm": 1.5227034091949463, "learning_rate": 7.897003046695268e-07, "loss": 0.1266, "step": 27070 }, { "epoch": 0.6314010084229548, "grad_norm": 1.4027215242385864, "learning_rate": 7.89622582851458e-07, 
"loss": 0.1195, "step": 27080 }, { "epoch": 0.6316341697997727, "grad_norm": 1.648126244544983, "learning_rate": 7.895448610333892e-07, "loss": 0.1242, "step": 27090 }, { "epoch": 0.6318673311765906, "grad_norm": 1.7172726392745972, "learning_rate": 7.894671392153205e-07, "loss": 0.1343, "step": 27100 }, { "epoch": 0.6321004925534085, "grad_norm": 1.2503533363342285, "learning_rate": 7.893894173972517e-07, "loss": 0.1148, "step": 27110 }, { "epoch": 0.6323336539302264, "grad_norm": 1.460805058479309, "learning_rate": 7.89311695579183e-07, "loss": 0.1293, "step": 27120 }, { "epoch": 0.6325668153070444, "grad_norm": 1.8735558986663818, "learning_rate": 7.892339737611143e-07, "loss": 0.1116, "step": 27130 }, { "epoch": 0.6327999766838623, "grad_norm": 3.5195462703704834, "learning_rate": 7.891562519430455e-07, "loss": 0.1276, "step": 27140 }, { "epoch": 0.6330331380606803, "grad_norm": 1.701323390007019, "learning_rate": 7.890785301249766e-07, "loss": 0.117, "step": 27150 }, { "epoch": 0.6332662994374981, "grad_norm": 1.1621772050857544, "learning_rate": 7.890008083069078e-07, "loss": 0.1129, "step": 27160 }, { "epoch": 0.6334994608143161, "grad_norm": 1.3698912858963013, "learning_rate": 7.889230864888391e-07, "loss": 0.1314, "step": 27170 }, { "epoch": 0.633732622191134, "grad_norm": 1.4415675401687622, "learning_rate": 7.888453646707704e-07, "loss": 0.1185, "step": 27180 }, { "epoch": 0.633965783567952, "grad_norm": 1.7878702878952026, "learning_rate": 7.887676428527016e-07, "loss": 0.11, "step": 27190 }, { "epoch": 0.6341989449447699, "grad_norm": 1.4025452136993408, "learning_rate": 7.886899210346329e-07, "loss": 0.1107, "step": 27200 }, { "epoch": 0.6344321063215879, "grad_norm": 1.3490585088729858, "learning_rate": 7.88612199216564e-07, "loss": 0.1257, "step": 27210 }, { "epoch": 0.6346652676984058, "grad_norm": 1.325833797454834, "learning_rate": 7.885344773984953e-07, "loss": 0.1302, "step": 27220 }, { "epoch": 0.6348984290752236, "grad_norm": 
1.807859182357788, "learning_rate": 7.884567555804265e-07, "loss": 0.1389, "step": 27230 }, { "epoch": 0.6351315904520416, "grad_norm": 1.2939764261245728, "learning_rate": 7.883790337623577e-07, "loss": 0.1208, "step": 27240 }, { "epoch": 0.6353647518288595, "grad_norm": 2.180828332901001, "learning_rate": 7.88301311944289e-07, "loss": 0.1222, "step": 27250 }, { "epoch": 0.6355979132056775, "grad_norm": 1.808091640472412, "learning_rate": 7.882235901262202e-07, "loss": 0.1253, "step": 27260 }, { "epoch": 0.6358310745824954, "grad_norm": 1.7756366729736328, "learning_rate": 7.881458683081514e-07, "loss": 0.1172, "step": 27270 }, { "epoch": 0.6360642359593134, "grad_norm": 1.907391905784607, "learning_rate": 7.880681464900827e-07, "loss": 0.1362, "step": 27280 }, { "epoch": 0.6362973973361312, "grad_norm": 1.7972629070281982, "learning_rate": 7.879904246720139e-07, "loss": 0.1225, "step": 27290 }, { "epoch": 0.6365305587129492, "grad_norm": 1.4641153812408447, "learning_rate": 7.879127028539452e-07, "loss": 0.1242, "step": 27300 }, { "epoch": 0.6367637200897671, "grad_norm": 1.39193856716156, "learning_rate": 7.878349810358763e-07, "loss": 0.1311, "step": 27310 }, { "epoch": 0.6369968814665851, "grad_norm": 3.930391788482666, "learning_rate": 7.877572592178076e-07, "loss": 0.1186, "step": 27320 }, { "epoch": 0.637230042843403, "grad_norm": 1.9089972972869873, "learning_rate": 7.876795373997388e-07, "loss": 0.119, "step": 27330 }, { "epoch": 0.637463204220221, "grad_norm": 1.6186140775680542, "learning_rate": 7.8760181558167e-07, "loss": 0.119, "step": 27340 }, { "epoch": 0.6376963655970389, "grad_norm": 1.2470848560333252, "learning_rate": 7.875240937636013e-07, "loss": 0.1084, "step": 27350 }, { "epoch": 0.6379295269738567, "grad_norm": 3.185147762298584, "learning_rate": 7.874463719455325e-07, "loss": 0.1203, "step": 27360 }, { "epoch": 0.6381626883506747, "grad_norm": 2.0678677558898926, "learning_rate": 7.873686501274638e-07, "loss": 0.1361, "step": 27370 }, { 
"epoch": 0.6383958497274926, "grad_norm": 2.321617603302002, "learning_rate": 7.872909283093951e-07, "loss": 0.1163, "step": 27380 }, { "epoch": 0.6386290111043106, "grad_norm": 1.314093828201294, "learning_rate": 7.872132064913261e-07, "loss": 0.119, "step": 27390 }, { "epoch": 0.6388621724811285, "grad_norm": 3.111820697784424, "learning_rate": 7.871354846732574e-07, "loss": 0.1178, "step": 27400 }, { "epoch": 0.6390953338579465, "grad_norm": 1.9212841987609863, "learning_rate": 7.870577628551886e-07, "loss": 0.1269, "step": 27410 }, { "epoch": 0.6393284952347643, "grad_norm": 2.143421173095703, "learning_rate": 7.869800410371199e-07, "loss": 0.129, "step": 27420 }, { "epoch": 0.6395616566115823, "grad_norm": 2.721099615097046, "learning_rate": 7.869023192190512e-07, "loss": 0.1208, "step": 27430 }, { "epoch": 0.6397948179884002, "grad_norm": 1.4123921394348145, "learning_rate": 7.868245974009824e-07, "loss": 0.1238, "step": 27440 }, { "epoch": 0.6400279793652182, "grad_norm": 2.7621796131134033, "learning_rate": 7.867468755829136e-07, "loss": 0.1208, "step": 27450 }, { "epoch": 0.6402611407420361, "grad_norm": 1.6500383615493774, "learning_rate": 7.866691537648449e-07, "loss": 0.1336, "step": 27460 }, { "epoch": 0.640494302118854, "grad_norm": 1.5662381649017334, "learning_rate": 7.86591431946776e-07, "loss": 0.1187, "step": 27470 }, { "epoch": 0.6407274634956719, "grad_norm": 1.2884788513183594, "learning_rate": 7.865137101287073e-07, "loss": 0.1171, "step": 27480 }, { "epoch": 0.6409606248724898, "grad_norm": 2.3561577796936035, "learning_rate": 7.864359883106385e-07, "loss": 0.122, "step": 27490 }, { "epoch": 0.6411937862493078, "grad_norm": 1.2219101190567017, "learning_rate": 7.863582664925698e-07, "loss": 0.1225, "step": 27500 }, { "epoch": 0.6414269476261257, "grad_norm": 2.2540645599365234, "learning_rate": 7.86280544674501e-07, "loss": 0.1134, "step": 27510 }, { "epoch": 0.6416601090029437, "grad_norm": 1.6628304719924927, "learning_rate": 
7.862028228564322e-07, "loss": 0.1194, "step": 27520 }, { "epoch": 0.6418932703797616, "grad_norm": 3.1801085472106934, "learning_rate": 7.861251010383635e-07, "loss": 0.1194, "step": 27530 }, { "epoch": 0.6421264317565796, "grad_norm": 1.5230746269226074, "learning_rate": 7.860473792202947e-07, "loss": 0.1352, "step": 27540 }, { "epoch": 0.6423595931333974, "grad_norm": 1.4361457824707031, "learning_rate": 7.859696574022259e-07, "loss": 0.1106, "step": 27550 }, { "epoch": 0.6425927545102154, "grad_norm": 3.355491876602173, "learning_rate": 7.858919355841572e-07, "loss": 0.1199, "step": 27560 }, { "epoch": 0.6428259158870333, "grad_norm": 1.9442170858383179, "learning_rate": 7.858142137660883e-07, "loss": 0.1312, "step": 27570 }, { "epoch": 0.6430590772638513, "grad_norm": 1.3862613439559937, "learning_rate": 7.857364919480196e-07, "loss": 0.1266, "step": 27580 }, { "epoch": 0.6432922386406692, "grad_norm": 2.305211305618286, "learning_rate": 7.856587701299508e-07, "loss": 0.1254, "step": 27590 }, { "epoch": 0.6435254000174871, "grad_norm": 2.781093120574951, "learning_rate": 7.855810483118821e-07, "loss": 0.1335, "step": 27600 }, { "epoch": 0.643758561394305, "grad_norm": 1.388838529586792, "learning_rate": 7.855033264938134e-07, "loss": 0.1289, "step": 27610 }, { "epoch": 0.6439917227711229, "grad_norm": 2.8856894969940186, "learning_rate": 7.854256046757446e-07, "loss": 0.1236, "step": 27620 }, { "epoch": 0.6442248841479409, "grad_norm": 1.151655673980713, "learning_rate": 7.853478828576757e-07, "loss": 0.1236, "step": 27630 }, { "epoch": 0.6444580455247588, "grad_norm": 1.8542500734329224, "learning_rate": 7.852701610396069e-07, "loss": 0.1267, "step": 27640 }, { "epoch": 0.6446912069015768, "grad_norm": 1.8259215354919434, "learning_rate": 7.851924392215382e-07, "loss": 0.1264, "step": 27650 }, { "epoch": 0.6449243682783947, "grad_norm": 1.3784493207931519, "learning_rate": 7.851147174034695e-07, "loss": 0.1227, "step": 27660 }, { "epoch": 0.6451575296552127, 
"grad_norm": 2.1240503787994385, "learning_rate": 7.850369955854007e-07, "loss": 0.1196, "step": 27670 }, { "epoch": 0.6453906910320305, "grad_norm": 1.2108017206192017, "learning_rate": 7.84959273767332e-07, "loss": 0.128, "step": 27680 }, { "epoch": 0.6456238524088485, "grad_norm": 2.5054004192352295, "learning_rate": 7.848815519492632e-07, "loss": 0.1315, "step": 27690 }, { "epoch": 0.6458570137856664, "grad_norm": 1.2900030612945557, "learning_rate": 7.848038301311944e-07, "loss": 0.112, "step": 27700 }, { "epoch": 0.6460901751624843, "grad_norm": 2.4156980514526367, "learning_rate": 7.847261083131256e-07, "loss": 0.1276, "step": 27710 }, { "epoch": 0.6463233365393023, "grad_norm": 1.4345883131027222, "learning_rate": 7.846483864950568e-07, "loss": 0.1225, "step": 27720 }, { "epoch": 0.6465564979161202, "grad_norm": 3.021266222000122, "learning_rate": 7.845706646769881e-07, "loss": 0.1313, "step": 27730 }, { "epoch": 0.6467896592929381, "grad_norm": 2.381004810333252, "learning_rate": 7.844929428589193e-07, "loss": 0.116, "step": 27740 }, { "epoch": 0.647022820669756, "grad_norm": 1.1840009689331055, "learning_rate": 7.844152210408506e-07, "loss": 0.1267, "step": 27750 }, { "epoch": 0.647255982046574, "grad_norm": 2.834627628326416, "learning_rate": 7.843374992227818e-07, "loss": 0.1156, "step": 27760 }, { "epoch": 0.6474891434233919, "grad_norm": 1.3251315355300903, "learning_rate": 7.84259777404713e-07, "loss": 0.1271, "step": 27770 }, { "epoch": 0.6477223048002099, "grad_norm": 1.4779860973358154, "learning_rate": 7.841820555866443e-07, "loss": 0.1263, "step": 27780 }, { "epoch": 0.6479554661770278, "grad_norm": 2.088639497756958, "learning_rate": 7.841043337685754e-07, "loss": 0.1267, "step": 27790 }, { "epoch": 0.6481886275538457, "grad_norm": 1.4702914953231812, "learning_rate": 7.840266119505067e-07, "loss": 0.1204, "step": 27800 }, { "epoch": 0.6484217889306636, "grad_norm": 2.79872727394104, "learning_rate": 7.83948890132438e-07, "loss": 0.1284, 
"step": 27810 }, { "epoch": 0.6486549503074815, "grad_norm": 1.1318827867507935, "learning_rate": 7.838711683143691e-07, "loss": 0.1236, "step": 27820 }, { "epoch": 0.6488881116842995, "grad_norm": 1.7832298278808594, "learning_rate": 7.837934464963004e-07, "loss": 0.1361, "step": 27830 }, { "epoch": 0.6491212730611174, "grad_norm": 1.5356779098510742, "learning_rate": 7.837157246782316e-07, "loss": 0.1201, "step": 27840 }, { "epoch": 0.6493544344379354, "grad_norm": 1.443367600440979, "learning_rate": 7.836380028601629e-07, "loss": 0.1141, "step": 27850 }, { "epoch": 0.6495875958147533, "grad_norm": 4.437742710113525, "learning_rate": 7.835602810420942e-07, "loss": 0.1248, "step": 27860 }, { "epoch": 0.6498207571915712, "grad_norm": 1.2157504558563232, "learning_rate": 7.834825592240253e-07, "loss": 0.1212, "step": 27870 }, { "epoch": 0.6500539185683891, "grad_norm": 2.4957518577575684, "learning_rate": 7.834048374059565e-07, "loss": 0.1327, "step": 27880 }, { "epoch": 0.6502870799452071, "grad_norm": 1.5830907821655273, "learning_rate": 7.833271155878877e-07, "loss": 0.1175, "step": 27890 }, { "epoch": 0.650520241322025, "grad_norm": 1.8248673677444458, "learning_rate": 7.83249393769819e-07, "loss": 0.1271, "step": 27900 }, { "epoch": 0.650753402698843, "grad_norm": 1.7116371393203735, "learning_rate": 7.831716719517503e-07, "loss": 0.1221, "step": 27910 }, { "epoch": 0.6509865640756609, "grad_norm": 2.129796028137207, "learning_rate": 7.830939501336815e-07, "loss": 0.1255, "step": 27920 }, { "epoch": 0.6512197254524787, "grad_norm": 1.2609596252441406, "learning_rate": 7.830162283156128e-07, "loss": 0.1296, "step": 27930 }, { "epoch": 0.6514528868292967, "grad_norm": 1.9501121044158936, "learning_rate": 7.82938506497544e-07, "loss": 0.1181, "step": 27940 }, { "epoch": 0.6516860482061146, "grad_norm": 1.0727612972259521, "learning_rate": 7.828607846794751e-07, "loss": 0.1154, "step": 27950 }, { "epoch": 0.6519192095829326, "grad_norm": 2.9877512454986572, 
"learning_rate": 7.827830628614064e-07, "loss": 0.1148, "step": 27960 }, { "epoch": 0.6521523709597505, "grad_norm": 2.8598148822784424, "learning_rate": 7.827053410433376e-07, "loss": 0.1437, "step": 27970 }, { "epoch": 0.6523855323365685, "grad_norm": 2.35614013671875, "learning_rate": 7.826276192252689e-07, "loss": 0.1266, "step": 27980 }, { "epoch": 0.6526186937133864, "grad_norm": 1.2643786668777466, "learning_rate": 7.825498974072002e-07, "loss": 0.1271, "step": 27990 }, { "epoch": 0.6528518550902043, "grad_norm": 2.761733055114746, "learning_rate": 7.824721755891314e-07, "loss": 0.1126, "step": 28000 }, { "epoch": 0.6530850164670222, "grad_norm": 1.4956496953964233, "learning_rate": 7.823944537710626e-07, "loss": 0.1209, "step": 28010 }, { "epoch": 0.6533181778438402, "grad_norm": 1.856347680091858, "learning_rate": 7.823167319529938e-07, "loss": 0.1284, "step": 28020 }, { "epoch": 0.6535513392206581, "grad_norm": 1.1864138841629028, "learning_rate": 7.82239010134925e-07, "loss": 0.1276, "step": 28030 }, { "epoch": 0.6537845005974761, "grad_norm": 2.0137643814086914, "learning_rate": 7.821612883168563e-07, "loss": 0.1258, "step": 28040 }, { "epoch": 0.654017661974294, "grad_norm": 3.59182071685791, "learning_rate": 7.820835664987875e-07, "loss": 0.1232, "step": 28050 }, { "epoch": 0.6542508233511118, "grad_norm": 2.954094648361206, "learning_rate": 7.820058446807188e-07, "loss": 0.1053, "step": 28060 }, { "epoch": 0.6544839847279298, "grad_norm": 1.2009836435317993, "learning_rate": 7.819281228626499e-07, "loss": 0.121, "step": 28070 }, { "epoch": 0.6547171461047477, "grad_norm": 1.3348525762557983, "learning_rate": 7.818504010445812e-07, "loss": 0.112, "step": 28080 }, { "epoch": 0.6549503074815657, "grad_norm": 1.2184817790985107, "learning_rate": 7.817726792265125e-07, "loss": 0.1202, "step": 28090 }, { "epoch": 0.6551834688583836, "grad_norm": 1.6246120929718018, "learning_rate": 7.816949574084437e-07, "loss": 0.1288, "step": 28100 }, { "epoch": 
0.6554166302352016, "grad_norm": 1.6174259185791016, "learning_rate": 7.816172355903749e-07, "loss": 0.1124, "step": 28110 }, { "epoch": 0.6556497916120194, "grad_norm": 2.802685260772705, "learning_rate": 7.815395137723061e-07, "loss": 0.1279, "step": 28120 }, { "epoch": 0.6558829529888374, "grad_norm": 1.5320278406143188, "learning_rate": 7.814617919542373e-07, "loss": 0.1301, "step": 28130 }, { "epoch": 0.6561161143656553, "grad_norm": 2.5688843727111816, "learning_rate": 7.813840701361686e-07, "loss": 0.1216, "step": 28140 }, { "epoch": 0.6563492757424733, "grad_norm": 2.4368910789489746, "learning_rate": 7.813063483180998e-07, "loss": 0.1208, "step": 28150 }, { "epoch": 0.6565824371192912, "grad_norm": 1.071601152420044, "learning_rate": 7.812286265000311e-07, "loss": 0.1222, "step": 28160 }, { "epoch": 0.6568155984961092, "grad_norm": 1.9023386240005493, "learning_rate": 7.811509046819623e-07, "loss": 0.119, "step": 28170 }, { "epoch": 0.6570487598729271, "grad_norm": 1.1793626546859741, "learning_rate": 7.810731828638936e-07, "loss": 0.1254, "step": 28180 }, { "epoch": 0.6572819212497449, "grad_norm": 1.6112632751464844, "learning_rate": 7.809954610458247e-07, "loss": 0.1265, "step": 28190 }, { "epoch": 0.6575150826265629, "grad_norm": 1.3730422258377075, "learning_rate": 7.809177392277559e-07, "loss": 0.1232, "step": 28200 }, { "epoch": 0.6577482440033808, "grad_norm": 0.9814436435699463, "learning_rate": 7.808400174096872e-07, "loss": 0.1198, "step": 28210 }, { "epoch": 0.6579814053801988, "grad_norm": 2.0499155521392822, "learning_rate": 7.807622955916184e-07, "loss": 0.1164, "step": 28220 }, { "epoch": 0.6582145667570167, "grad_norm": 1.586832046508789, "learning_rate": 7.806845737735497e-07, "loss": 0.1255, "step": 28230 }, { "epoch": 0.6584477281338347, "grad_norm": 1.975511074066162, "learning_rate": 7.80606851955481e-07, "loss": 0.119, "step": 28240 }, { "epoch": 0.6586808895106525, "grad_norm": 3.011415958404541, "learning_rate": 
7.805291301374121e-07, "loss": 0.1197, "step": 28250 }, { "epoch": 0.6589140508874705, "grad_norm": 1.8412156105041504, "learning_rate": 7.804514083193434e-07, "loss": 0.1103, "step": 28260 }, { "epoch": 0.6591472122642884, "grad_norm": 1.5287184715270996, "learning_rate": 7.803736865012745e-07, "loss": 0.1195, "step": 28270 }, { "epoch": 0.6593803736411064, "grad_norm": 1.430766224861145, "learning_rate": 7.802959646832058e-07, "loss": 0.112, "step": 28280 }, { "epoch": 0.6596135350179243, "grad_norm": 1.7616900205612183, "learning_rate": 7.802182428651371e-07, "loss": 0.1251, "step": 28290 }, { "epoch": 0.6598466963947422, "grad_norm": 1.2230404615402222, "learning_rate": 7.801405210470683e-07, "loss": 0.1264, "step": 28300 }, { "epoch": 0.6600798577715601, "grad_norm": 2.175402879714966, "learning_rate": 7.800627992289995e-07, "loss": 0.121, "step": 28310 }, { "epoch": 0.660313019148378, "grad_norm": 4.534302711486816, "learning_rate": 7.799850774109307e-07, "loss": 0.108, "step": 28320 }, { "epoch": 0.660546180525196, "grad_norm": 2.33310866355896, "learning_rate": 7.79907355592862e-07, "loss": 0.1184, "step": 28330 }, { "epoch": 0.6607793419020139, "grad_norm": 1.2258312702178955, "learning_rate": 7.798296337747933e-07, "loss": 0.1208, "step": 28340 }, { "epoch": 0.6610125032788319, "grad_norm": 3.927267551422119, "learning_rate": 7.797519119567244e-07, "loss": 0.123, "step": 28350 }, { "epoch": 0.6612456646556498, "grad_norm": 1.375295639038086, "learning_rate": 7.796741901386557e-07, "loss": 0.1159, "step": 28360 }, { "epoch": 0.6614788260324678, "grad_norm": 1.9506226778030396, "learning_rate": 7.795964683205868e-07, "loss": 0.1344, "step": 28370 }, { "epoch": 0.6617119874092856, "grad_norm": 1.5361709594726562, "learning_rate": 7.795187465025181e-07, "loss": 0.1201, "step": 28380 }, { "epoch": 0.6619451487861036, "grad_norm": 1.6767123937606812, "learning_rate": 7.794410246844494e-07, "loss": 0.1185, "step": 28390 }, { "epoch": 0.6621783101629215, 
"grad_norm": 2.377727746963501, "learning_rate": 7.793633028663806e-07, "loss": 0.1127, "step": 28400 }, { "epoch": 0.6624114715397394, "grad_norm": 1.4051488637924194, "learning_rate": 7.792855810483119e-07, "loss": 0.1244, "step": 28410 }, { "epoch": 0.6626446329165574, "grad_norm": 1.524707317352295, "learning_rate": 7.792078592302432e-07, "loss": 0.1293, "step": 28420 }, { "epoch": 0.6628777942933753, "grad_norm": 1.8152021169662476, "learning_rate": 7.791301374121742e-07, "loss": 0.1214, "step": 28430 }, { "epoch": 0.6631109556701932, "grad_norm": 1.6365548372268677, "learning_rate": 7.790524155941055e-07, "loss": 0.1219, "step": 28440 }, { "epoch": 0.6633441170470111, "grad_norm": 1.5127242803573608, "learning_rate": 7.789746937760367e-07, "loss": 0.1162, "step": 28450 }, { "epoch": 0.6635772784238291, "grad_norm": 1.9577772617340088, "learning_rate": 7.78896971957968e-07, "loss": 0.1273, "step": 28460 }, { "epoch": 0.663810439800647, "grad_norm": 3.0144262313842773, "learning_rate": 7.788192501398993e-07, "loss": 0.1315, "step": 28470 }, { "epoch": 0.664043601177465, "grad_norm": 1.3049495220184326, "learning_rate": 7.787415283218305e-07, "loss": 0.1284, "step": 28480 }, { "epoch": 0.6642767625542829, "grad_norm": 1.4701106548309326, "learning_rate": 7.786638065037618e-07, "loss": 0.1091, "step": 28490 }, { "epoch": 0.6645099239311009, "grad_norm": 2.686962127685547, "learning_rate": 7.785860846856929e-07, "loss": 0.1212, "step": 28500 }, { "epoch": 0.6647430853079187, "grad_norm": 1.2196331024169922, "learning_rate": 7.785083628676241e-07, "loss": 0.1162, "step": 28510 }, { "epoch": 0.6649762466847366, "grad_norm": 1.2558950185775757, "learning_rate": 7.784306410495554e-07, "loss": 0.1274, "step": 28520 }, { "epoch": 0.6652094080615546, "grad_norm": 1.8382731676101685, "learning_rate": 7.783529192314866e-07, "loss": 0.1283, "step": 28530 }, { "epoch": 0.6654425694383725, "grad_norm": 3.5493738651275635, "learning_rate": 7.782751974134179e-07, "loss": 
0.1242, "step": 28540 }, { "epoch": 0.6656757308151905, "grad_norm": 2.758492946624756, "learning_rate": 7.781974755953491e-07, "loss": 0.1106, "step": 28550 }, { "epoch": 0.6659088921920084, "grad_norm": 3.278578758239746, "learning_rate": 7.781197537772803e-07, "loss": 0.1292, "step": 28560 }, { "epoch": 0.6661420535688263, "grad_norm": 2.4199421405792236, "learning_rate": 7.780420319592116e-07, "loss": 0.1128, "step": 28570 }, { "epoch": 0.6663752149456442, "grad_norm": 1.2864060401916504, "learning_rate": 7.779643101411428e-07, "loss": 0.1217, "step": 28580 }, { "epoch": 0.6666083763224622, "grad_norm": 1.6761261224746704, "learning_rate": 7.77886588323074e-07, "loss": 0.1095, "step": 28590 }, { "epoch": 0.6668415376992801, "grad_norm": 2.1128129959106445, "learning_rate": 7.778088665050052e-07, "loss": 0.1191, "step": 28600 }, { "epoch": 0.6670746990760981, "grad_norm": 1.0973169803619385, "learning_rate": 7.777311446869365e-07, "loss": 0.1161, "step": 28610 }, { "epoch": 0.667307860452916, "grad_norm": 4.1933746337890625, "learning_rate": 7.776534228688677e-07, "loss": 0.1331, "step": 28620 }, { "epoch": 0.6675410218297338, "grad_norm": 3.278597354888916, "learning_rate": 7.775757010507989e-07, "loss": 0.121, "step": 28630 }, { "epoch": 0.6677741832065518, "grad_norm": 1.5709718465805054, "learning_rate": 7.774979792327302e-07, "loss": 0.1123, "step": 28640 }, { "epoch": 0.6680073445833697, "grad_norm": 1.4403035640716553, "learning_rate": 7.774202574146614e-07, "loss": 0.1275, "step": 28650 }, { "epoch": 0.6682405059601877, "grad_norm": 1.786276936531067, "learning_rate": 7.773425355965927e-07, "loss": 0.1286, "step": 28660 }, { "epoch": 0.6684736673370056, "grad_norm": 2.6946113109588623, "learning_rate": 7.772648137785239e-07, "loss": 0.1277, "step": 28670 }, { "epoch": 0.6687068287138236, "grad_norm": 2.51375675201416, "learning_rate": 7.77187091960455e-07, "loss": 0.1252, "step": 28680 }, { "epoch": 0.6689399900906415, "grad_norm": 1.2332957983016968, 
"learning_rate": 7.771093701423863e-07, "loss": 0.1111, "step": 28690 }, { "epoch": 0.6691731514674594, "grad_norm": 1.9513221979141235, "learning_rate": 7.770394205061245e-07, "loss": 0.1201, "step": 28700 }, { "epoch": 0.6694063128442773, "grad_norm": 1.3585246801376343, "learning_rate": 7.769616986880557e-07, "loss": 0.1201, "step": 28710 }, { "epoch": 0.6696394742210953, "grad_norm": 1.2488282918930054, "learning_rate": 7.768839768699868e-07, "loss": 0.1207, "step": 28720 }, { "epoch": 0.6698726355979132, "grad_norm": 1.2259691953659058, "learning_rate": 7.768062550519181e-07, "loss": 0.119, "step": 28730 }, { "epoch": 0.6701057969747312, "grad_norm": 2.558032989501953, "learning_rate": 7.767285332338494e-07, "loss": 0.1181, "step": 28740 }, { "epoch": 0.6703389583515491, "grad_norm": 2.0966339111328125, "learning_rate": 7.766508114157806e-07, "loss": 0.1265, "step": 28750 }, { "epoch": 0.6705721197283669, "grad_norm": 1.6145732402801514, "learning_rate": 7.765730895977119e-07, "loss": 0.1199, "step": 28760 }, { "epoch": 0.6708052811051849, "grad_norm": 2.3075380325317383, "learning_rate": 7.76495367779643e-07, "loss": 0.1132, "step": 28770 }, { "epoch": 0.6710384424820028, "grad_norm": 1.5617541074752808, "learning_rate": 7.764176459615743e-07, "loss": 0.1199, "step": 28780 }, { "epoch": 0.6712716038588208, "grad_norm": 4.437211036682129, "learning_rate": 7.763399241435056e-07, "loss": 0.1209, "step": 28790 }, { "epoch": 0.6715047652356387, "grad_norm": 1.2026854753494263, "learning_rate": 7.762622023254367e-07, "loss": 0.1046, "step": 28800 }, { "epoch": 0.6717379266124567, "grad_norm": 1.20786452293396, "learning_rate": 7.76184480507368e-07, "loss": 0.1304, "step": 28810 }, { "epoch": 0.6719710879892746, "grad_norm": 1.857530117034912, "learning_rate": 7.761067586892992e-07, "loss": 0.1154, "step": 28820 }, { "epoch": 0.6722042493660925, "grad_norm": 1.2333838939666748, "learning_rate": 7.760290368712305e-07, "loss": 0.1159, "step": 28830 }, { "epoch": 
0.6724374107429104, "grad_norm": 1.5823746919631958, "learning_rate": 7.759513150531617e-07, "loss": 0.1141, "step": 28840 }, { "epoch": 0.6726705721197284, "grad_norm": 1.3422614336013794, "learning_rate": 7.758735932350929e-07, "loss": 0.1274, "step": 28850 }, { "epoch": 0.6729037334965463, "grad_norm": 1.4764692783355713, "learning_rate": 7.757958714170242e-07, "loss": 0.1263, "step": 28860 }, { "epoch": 0.6731368948733643, "grad_norm": 1.4499403238296509, "learning_rate": 7.757181495989554e-07, "loss": 0.1214, "step": 28870 }, { "epoch": 0.6733700562501822, "grad_norm": 2.361924171447754, "learning_rate": 7.756404277808866e-07, "loss": 0.1237, "step": 28880 }, { "epoch": 0.673603217627, "grad_norm": 1.781578779220581, "learning_rate": 7.755627059628179e-07, "loss": 0.1229, "step": 28890 }, { "epoch": 0.673836379003818, "grad_norm": 1.6937984228134155, "learning_rate": 7.75484984144749e-07, "loss": 0.1316, "step": 28900 }, { "epoch": 0.6740695403806359, "grad_norm": 1.4221415519714355, "learning_rate": 7.754072623266803e-07, "loss": 0.1148, "step": 28910 }, { "epoch": 0.6743027017574539, "grad_norm": 1.4945176839828491, "learning_rate": 7.753295405086115e-07, "loss": 0.116, "step": 28920 }, { "epoch": 0.6745358631342718, "grad_norm": 1.07468843460083, "learning_rate": 7.752518186905428e-07, "loss": 0.108, "step": 28930 }, { "epoch": 0.6747690245110898, "grad_norm": 2.291243076324463, "learning_rate": 7.751740968724741e-07, "loss": 0.1317, "step": 28940 }, { "epoch": 0.6750021858879076, "grad_norm": 1.3969577550888062, "learning_rate": 7.750963750544053e-07, "loss": 0.1299, "step": 28950 }, { "epoch": 0.6752353472647256, "grad_norm": 1.526064395904541, "learning_rate": 7.750186532363364e-07, "loss": 0.1219, "step": 28960 }, { "epoch": 0.6754685086415435, "grad_norm": 1.4349145889282227, "learning_rate": 7.749409314182676e-07, "loss": 0.1306, "step": 28970 }, { "epoch": 0.6757016700183615, "grad_norm": 2.9156877994537354, "learning_rate": 7.748632096001989e-07, 
"loss": 0.1232, "step": 28980 }, { "epoch": 0.6759348313951794, "grad_norm": 2.999319553375244, "learning_rate": 7.747854877821302e-07, "loss": 0.1252, "step": 28990 }, { "epoch": 0.6761679927719974, "grad_norm": 2.7114756107330322, "learning_rate": 7.747077659640614e-07, "loss": 0.134, "step": 29000 }, { "epoch": 0.6764011541488153, "grad_norm": 2.17578387260437, "learning_rate": 7.746300441459927e-07, "loss": 0.128, "step": 29010 }, { "epoch": 0.6766343155256331, "grad_norm": 2.514817237854004, "learning_rate": 7.745523223279238e-07, "loss": 0.1195, "step": 29020 }, { "epoch": 0.6768674769024511, "grad_norm": 4.426085472106934, "learning_rate": 7.744746005098551e-07, "loss": 0.1225, "step": 29030 }, { "epoch": 0.677100638279269, "grad_norm": 1.78897225856781, "learning_rate": 7.743968786917863e-07, "loss": 0.1246, "step": 29040 }, { "epoch": 0.677333799656087, "grad_norm": 1.411731481552124, "learning_rate": 7.743191568737175e-07, "loss": 0.1194, "step": 29050 }, { "epoch": 0.6775669610329049, "grad_norm": 1.8717204332351685, "learning_rate": 7.742414350556488e-07, "loss": 0.1166, "step": 29060 }, { "epoch": 0.6778001224097229, "grad_norm": 2.3714375495910645, "learning_rate": 7.7416371323758e-07, "loss": 0.1206, "step": 29070 }, { "epoch": 0.6780332837865407, "grad_norm": 1.6774218082427979, "learning_rate": 7.740859914195112e-07, "loss": 0.1285, "step": 29080 }, { "epoch": 0.6782664451633587, "grad_norm": 2.7001357078552246, "learning_rate": 7.740082696014425e-07, "loss": 0.1179, "step": 29090 }, { "epoch": 0.6784996065401766, "grad_norm": 1.226793646812439, "learning_rate": 7.739305477833737e-07, "loss": 0.126, "step": 29100 }, { "epoch": 0.6787327679169946, "grad_norm": 2.776857376098633, "learning_rate": 7.73852825965305e-07, "loss": 0.1212, "step": 29110 }, { "epoch": 0.6789659292938125, "grad_norm": 3.4164087772369385, "learning_rate": 7.737751041472361e-07, "loss": 0.1253, "step": 29120 }, { "epoch": 0.6791990906706304, "grad_norm": 1.3333996534347534, 
"learning_rate": 7.736973823291674e-07, "loss": 0.1211, "step": 29130 }, { "epoch": 0.6794322520474484, "grad_norm": 1.5068283081054688, "learning_rate": 7.736196605110986e-07, "loss": 0.1217, "step": 29140 }, { "epoch": 0.6796654134242662, "grad_norm": 3.2048542499542236, "learning_rate": 7.735419386930298e-07, "loss": 0.1231, "step": 29150 }, { "epoch": 0.6798985748010842, "grad_norm": 1.6864421367645264, "learning_rate": 7.734642168749611e-07, "loss": 0.1322, "step": 29160 }, { "epoch": 0.6801317361779021, "grad_norm": 1.3088325262069702, "learning_rate": 7.733864950568924e-07, "loss": 0.1237, "step": 29170 }, { "epoch": 0.6803648975547201, "grad_norm": 1.4858369827270508, "learning_rate": 7.733087732388236e-07, "loss": 0.121, "step": 29180 }, { "epoch": 0.680598058931538, "grad_norm": 3.164034128189087, "learning_rate": 7.732310514207549e-07, "loss": 0.1187, "step": 29190 }, { "epoch": 0.680831220308356, "grad_norm": 1.6557165384292603, "learning_rate": 7.731533296026859e-07, "loss": 0.1219, "step": 29200 }, { "epoch": 0.6810643816851738, "grad_norm": 2.029672861099243, "learning_rate": 7.730756077846172e-07, "loss": 0.1173, "step": 29210 }, { "epoch": 0.6812975430619918, "grad_norm": 1.4067665338516235, "learning_rate": 7.729978859665485e-07, "loss": 0.1302, "step": 29220 }, { "epoch": 0.6815307044388097, "grad_norm": 1.1114542484283447, "learning_rate": 7.729201641484797e-07, "loss": 0.1182, "step": 29230 }, { "epoch": 0.6817638658156276, "grad_norm": 1.5200711488723755, "learning_rate": 7.72842442330411e-07, "loss": 0.126, "step": 29240 }, { "epoch": 0.6819970271924456, "grad_norm": 1.4550648927688599, "learning_rate": 7.727647205123422e-07, "loss": 0.1348, "step": 29250 }, { "epoch": 0.6822301885692635, "grad_norm": 2.9155962467193604, "learning_rate": 7.726869986942735e-07, "loss": 0.109, "step": 29260 }, { "epoch": 0.6824633499460814, "grad_norm": 1.4230406284332275, "learning_rate": 7.726092768762047e-07, "loss": 0.1245, "step": 29270 }, { "epoch": 
0.6826965113228993, "grad_norm": 1.588865041732788, "learning_rate": 7.725315550581358e-07, "loss": 0.1218, "step": 29280 }, { "epoch": 0.6829296726997173, "grad_norm": 1.4569251537322998, "learning_rate": 7.724538332400671e-07, "loss": 0.1177, "step": 29290 }, { "epoch": 0.6831628340765352, "grad_norm": 1.2845910787582397, "learning_rate": 7.723761114219983e-07, "loss": 0.1308, "step": 29300 }, { "epoch": 0.6833959954533532, "grad_norm": 1.359213948249817, "learning_rate": 7.722983896039296e-07, "loss": 0.1294, "step": 29310 }, { "epoch": 0.6836291568301711, "grad_norm": 1.8533012866973877, "learning_rate": 7.722206677858609e-07, "loss": 0.116, "step": 29320 }, { "epoch": 0.6838623182069891, "grad_norm": 4.327017784118652, "learning_rate": 7.72142945967792e-07, "loss": 0.132, "step": 29330 }, { "epoch": 0.6840954795838069, "grad_norm": 1.4133440256118774, "learning_rate": 7.720652241497233e-07, "loss": 0.117, "step": 29340 }, { "epoch": 0.6843286409606248, "grad_norm": 1.4422454833984375, "learning_rate": 7.719875023316545e-07, "loss": 0.1279, "step": 29350 }, { "epoch": 0.6845618023374428, "grad_norm": 1.5096173286437988, "learning_rate": 7.719097805135857e-07, "loss": 0.1158, "step": 29360 }, { "epoch": 0.6847949637142607, "grad_norm": 1.330291986465454, "learning_rate": 7.71832058695517e-07, "loss": 0.1247, "step": 29370 }, { "epoch": 0.6850281250910787, "grad_norm": 3.2741622924804688, "learning_rate": 7.717543368774482e-07, "loss": 0.1297, "step": 29380 }, { "epoch": 0.6852612864678966, "grad_norm": 1.201525330543518, "learning_rate": 7.716766150593794e-07, "loss": 0.1271, "step": 29390 }, { "epoch": 0.6854944478447145, "grad_norm": 3.1227595806121826, "learning_rate": 7.715988932413106e-07, "loss": 0.1133, "step": 29400 }, { "epoch": 0.6857276092215324, "grad_norm": 2.137707233428955, "learning_rate": 7.715211714232419e-07, "loss": 0.1216, "step": 29410 }, { "epoch": 0.6859607705983504, "grad_norm": 2.1455225944519043, "learning_rate": 7.714434496051732e-07, 
"loss": 0.1212, "step": 29420 }, { "epoch": 0.6861939319751683, "grad_norm": 1.520923137664795, "learning_rate": 7.713657277871044e-07, "loss": 0.1172, "step": 29430 }, { "epoch": 0.6864270933519863, "grad_norm": 1.5981007814407349, "learning_rate": 7.712880059690356e-07, "loss": 0.1233, "step": 29440 }, { "epoch": 0.6866602547288042, "grad_norm": 1.6990776062011719, "learning_rate": 7.712102841509667e-07, "loss": 0.1191, "step": 29450 }, { "epoch": 0.6868934161056222, "grad_norm": 1.5532734394073486, "learning_rate": 7.71132562332898e-07, "loss": 0.1147, "step": 29460 }, { "epoch": 0.68712657748244, "grad_norm": 2.4875741004943848, "learning_rate": 7.710548405148293e-07, "loss": 0.1125, "step": 29470 }, { "epoch": 0.6873597388592579, "grad_norm": 2.1049909591674805, "learning_rate": 7.709771186967605e-07, "loss": 0.1327, "step": 29480 }, { "epoch": 0.6875929002360759, "grad_norm": 2.797440528869629, "learning_rate": 7.708993968786918e-07, "loss": 0.1266, "step": 29490 }, { "epoch": 0.6878260616128938, "grad_norm": 2.072974681854248, "learning_rate": 7.70821675060623e-07, "loss": 0.1153, "step": 29500 }, { "epoch": 0.6880592229897118, "grad_norm": 1.2176017761230469, "learning_rate": 7.707439532425542e-07, "loss": 0.1173, "step": 29510 }, { "epoch": 0.6882923843665297, "grad_norm": 1.6368553638458252, "learning_rate": 7.706662314244854e-07, "loss": 0.1223, "step": 29520 }, { "epoch": 0.6885255457433476, "grad_norm": 1.7079534530639648, "learning_rate": 7.705885096064166e-07, "loss": 0.113, "step": 29530 }, { "epoch": 0.6887587071201655, "grad_norm": 1.9435323476791382, "learning_rate": 7.705107877883479e-07, "loss": 0.1133, "step": 29540 }, { "epoch": 0.6889918684969835, "grad_norm": 2.9111037254333496, "learning_rate": 7.704330659702792e-07, "loss": 0.1251, "step": 29550 }, { "epoch": 0.6892250298738014, "grad_norm": 3.9915707111358643, "learning_rate": 7.703553441522104e-07, "loss": 0.1269, "step": 29560 }, { "epoch": 0.6894581912506194, "grad_norm": 
4.470600605010986, "learning_rate": 7.702776223341416e-07, "loss": 0.1205, "step": 29570 }, { "epoch": 0.6896913526274373, "grad_norm": 1.7487834692001343, "learning_rate": 7.701999005160728e-07, "loss": 0.1251, "step": 29580 }, { "epoch": 0.6899245140042551, "grad_norm": 1.642834186553955, "learning_rate": 7.701221786980041e-07, "loss": 0.1118, "step": 29590 }, { "epoch": 0.6901576753810731, "grad_norm": 1.724115014076233, "learning_rate": 7.700444568799354e-07, "loss": 0.1212, "step": 29600 }, { "epoch": 0.690390836757891, "grad_norm": 1.7580417394638062, "learning_rate": 7.699667350618665e-07, "loss": 0.1315, "step": 29610 }, { "epoch": 0.690623998134709, "grad_norm": 2.1788227558135986, "learning_rate": 7.698890132437978e-07, "loss": 0.1278, "step": 29620 }, { "epoch": 0.6908571595115269, "grad_norm": 2.436168670654297, "learning_rate": 7.698112914257289e-07, "loss": 0.1213, "step": 29630 }, { "epoch": 0.6910903208883449, "grad_norm": 1.8928254842758179, "learning_rate": 7.697335696076602e-07, "loss": 0.123, "step": 29640 }, { "epoch": 0.6913234822651628, "grad_norm": 2.224118709564209, "learning_rate": 7.696558477895915e-07, "loss": 0.1185, "step": 29650 }, { "epoch": 0.6915566436419807, "grad_norm": 1.3791111707687378, "learning_rate": 7.695781259715227e-07, "loss": 0.1173, "step": 29660 }, { "epoch": 0.6917898050187986, "grad_norm": 4.206772804260254, "learning_rate": 7.69500404153454e-07, "loss": 0.1213, "step": 29670 }, { "epoch": 0.6920229663956166, "grad_norm": 3.7535011768341064, "learning_rate": 7.694226823353852e-07, "loss": 0.1249, "step": 29680 }, { "epoch": 0.6922561277724345, "grad_norm": 1.1824012994766235, "learning_rate": 7.693449605173163e-07, "loss": 0.1367, "step": 29690 }, { "epoch": 0.6924892891492525, "grad_norm": 1.1892458200454712, "learning_rate": 7.692672386992476e-07, "loss": 0.1199, "step": 29700 }, { "epoch": 0.6927224505260704, "grad_norm": 1.5772337913513184, "learning_rate": 7.691895168811788e-07, "loss": 0.125, "step": 29710 }, 
{ "epoch": 0.6929556119028882, "grad_norm": 1.8320634365081787, "learning_rate": 7.691117950631101e-07, "loss": 0.1246, "step": 29720 }, { "epoch": 0.6931887732797062, "grad_norm": 1.4409857988357544, "learning_rate": 7.690340732450413e-07, "loss": 0.1202, "step": 29730 }, { "epoch": 0.6934219346565241, "grad_norm": 1.2858535051345825, "learning_rate": 7.689563514269726e-07, "loss": 0.1269, "step": 29740 }, { "epoch": 0.6936550960333421, "grad_norm": 2.278629779815674, "learning_rate": 7.688786296089039e-07, "loss": 0.1237, "step": 29750 }, { "epoch": 0.69388825741016, "grad_norm": 1.566449761390686, "learning_rate": 7.68800907790835e-07, "loss": 0.1275, "step": 29760 }, { "epoch": 0.694121418786978, "grad_norm": 1.283623218536377, "learning_rate": 7.687231859727662e-07, "loss": 0.1159, "step": 29770 }, { "epoch": 0.6943545801637959, "grad_norm": 1.2992407083511353, "learning_rate": 7.686454641546974e-07, "loss": 0.1243, "step": 29780 }, { "epoch": 0.6945877415406138, "grad_norm": 1.5640946626663208, "learning_rate": 7.685677423366287e-07, "loss": 0.1231, "step": 29790 }, { "epoch": 0.6948209029174317, "grad_norm": 1.867907166481018, "learning_rate": 7.6849002051856e-07, "loss": 0.1161, "step": 29800 }, { "epoch": 0.6950540642942497, "grad_norm": 1.4336051940917969, "learning_rate": 7.684122987004912e-07, "loss": 0.1253, "step": 29810 }, { "epoch": 0.6952872256710676, "grad_norm": 1.3856014013290405, "learning_rate": 7.683345768824224e-07, "loss": 0.1139, "step": 29820 }, { "epoch": 0.6955203870478855, "grad_norm": 1.4393771886825562, "learning_rate": 7.682568550643536e-07, "loss": 0.1168, "step": 29830 }, { "epoch": 0.6957535484247035, "grad_norm": 1.5788805484771729, "learning_rate": 7.681791332462849e-07, "loss": 0.1164, "step": 29840 }, { "epoch": 0.6959867098015213, "grad_norm": 2.2982611656188965, "learning_rate": 7.681014114282161e-07, "loss": 0.1241, "step": 29850 }, { "epoch": 0.6962198711783393, "grad_norm": 3.061225175857544, "learning_rate": 
7.680236896101473e-07, "loss": 0.111, "step": 29860 }, { "epoch": 0.6964530325551572, "grad_norm": 1.5656378269195557, "learning_rate": 7.679459677920786e-07, "loss": 0.1251, "step": 29870 }, { "epoch": 0.6966861939319752, "grad_norm": 2.252739191055298, "learning_rate": 7.678682459740097e-07, "loss": 0.1273, "step": 29880 }, { "epoch": 0.6969193553087931, "grad_norm": 3.117032527923584, "learning_rate": 7.67790524155941e-07, "loss": 0.1301, "step": 29890 }, { "epoch": 0.6971525166856111, "grad_norm": 1.7543169260025024, "learning_rate": 7.677128023378723e-07, "loss": 0.1281, "step": 29900 }, { "epoch": 0.6973856780624289, "grad_norm": 1.3310390710830688, "learning_rate": 7.676350805198035e-07, "loss": 0.1244, "step": 29910 }, { "epoch": 0.6976188394392469, "grad_norm": 1.178740382194519, "learning_rate": 7.675573587017348e-07, "loss": 0.119, "step": 29920 }, { "epoch": 0.6978520008160648, "grad_norm": 2.872349739074707, "learning_rate": 7.67479636883666e-07, "loss": 0.1324, "step": 29930 }, { "epoch": 0.6980851621928827, "grad_norm": 2.051274538040161, "learning_rate": 7.674019150655971e-07, "loss": 0.1155, "step": 29940 }, { "epoch": 0.6983183235697007, "grad_norm": 1.438536524772644, "learning_rate": 7.673241932475284e-07, "loss": 0.1257, "step": 29950 }, { "epoch": 0.6985514849465186, "grad_norm": 1.300337553024292, "learning_rate": 7.672464714294596e-07, "loss": 0.1176, "step": 29960 }, { "epoch": 0.6987846463233366, "grad_norm": 1.5952441692352295, "learning_rate": 7.671687496113909e-07, "loss": 0.1199, "step": 29970 }, { "epoch": 0.6990178077001544, "grad_norm": 2.658407211303711, "learning_rate": 7.670910277933222e-07, "loss": 0.1207, "step": 29980 }, { "epoch": 0.6992509690769724, "grad_norm": 2.479196310043335, "learning_rate": 7.670133059752534e-07, "loss": 0.1198, "step": 29990 }, { "epoch": 0.6994841304537903, "grad_norm": 1.4253188371658325, "learning_rate": 7.669355841571846e-07, "loss": 0.1177, "step": 30000 }, { "epoch": 0.6994841304537903, 
"eval_accuracy": 0.9399415711523685, "eval_f1": 0.9569384626958304, "eval_loss": 0.15475942194461823, "eval_runtime": 3928.7247, "eval_samples_per_second": 465.779, "eval_steps_per_second": 58.222, "step": 30000 }, { "epoch": 0.6997172918306083, "grad_norm": 2.128211259841919, "learning_rate": 7.668578623391157e-07, "loss": 0.1272, "step": 30010 }, { "epoch": 0.6999504532074262, "grad_norm": 1.2433969974517822, "learning_rate": 7.66780140521047e-07, "loss": 0.1358, "step": 30020 }, { "epoch": 0.7001836145842442, "grad_norm": 1.9315153360366821, "learning_rate": 7.667024187029783e-07, "loss": 0.1224, "step": 30030 }, { "epoch": 0.700416775961062, "grad_norm": 1.3370245695114136, "learning_rate": 7.666246968849095e-07, "loss": 0.1277, "step": 30040 }, { "epoch": 0.70064993733788, "grad_norm": 2.336494207382202, "learning_rate": 7.665469750668408e-07, "loss": 0.1243, "step": 30050 }, { "epoch": 0.7008830987146979, "grad_norm": 1.6730684041976929, "learning_rate": 7.664692532487719e-07, "loss": 0.1246, "step": 30060 }, { "epoch": 0.7011162600915158, "grad_norm": 2.3491716384887695, "learning_rate": 7.663915314307032e-07, "loss": 0.1154, "step": 30070 }, { "epoch": 0.7013494214683338, "grad_norm": 1.9009552001953125, "learning_rate": 7.663138096126345e-07, "loss": 0.1322, "step": 30080 }, { "epoch": 0.7015825828451517, "grad_norm": 1.9722615480422974, "learning_rate": 7.662360877945656e-07, "loss": 0.1151, "step": 30090 }, { "epoch": 0.7018157442219697, "grad_norm": 1.2896997928619385, "learning_rate": 7.661583659764969e-07, "loss": 0.1194, "step": 30100 }, { "epoch": 0.7020489055987875, "grad_norm": 2.944446563720703, "learning_rate": 7.660806441584281e-07, "loss": 0.1162, "step": 30110 }, { "epoch": 0.7022820669756055, "grad_norm": 2.0220773220062256, "learning_rate": 7.660029223403594e-07, "loss": 0.1262, "step": 30120 }, { "epoch": 0.7025152283524234, "grad_norm": 2.1550486087799072, "learning_rate": 7.659252005222906e-07, "loss": 0.1211, "step": 30130 }, { "epoch": 
0.7027483897292414, "grad_norm": 1.4192088842391968, "learning_rate": 7.658474787042218e-07, "loss": 0.1232, "step": 30140 }, { "epoch": 0.7029815511060593, "grad_norm": 1.7094032764434814, "learning_rate": 7.657697568861531e-07, "loss": 0.1283, "step": 30150 }, { "epoch": 0.7032147124828773, "grad_norm": 4.605645656585693, "learning_rate": 7.656920350680843e-07, "loss": 0.1067, "step": 30160 }, { "epoch": 0.7034478738596951, "grad_norm": 1.853302001953125, "learning_rate": 7.656143132500155e-07, "loss": 0.1215, "step": 30170 }, { "epoch": 0.703681035236513, "grad_norm": 2.722653388977051, "learning_rate": 7.655365914319468e-07, "loss": 0.126, "step": 30180 }, { "epoch": 0.703914196613331, "grad_norm": 1.6194632053375244, "learning_rate": 7.654588696138779e-07, "loss": 0.1171, "step": 30190 }, { "epoch": 0.7041473579901489, "grad_norm": 2.972970724105835, "learning_rate": 7.653811477958092e-07, "loss": 0.1253, "step": 30200 }, { "epoch": 0.7043805193669669, "grad_norm": 2.382680892944336, "learning_rate": 7.653034259777404e-07, "loss": 0.1115, "step": 30210 }, { "epoch": 0.7046136807437848, "grad_norm": 1.5421154499053955, "learning_rate": 7.652257041596717e-07, "loss": 0.1246, "step": 30220 }, { "epoch": 0.7048468421206027, "grad_norm": 1.2340131998062134, "learning_rate": 7.65147982341603e-07, "loss": 0.1297, "step": 30230 }, { "epoch": 0.7050800034974206, "grad_norm": 1.502335786819458, "learning_rate": 7.650702605235342e-07, "loss": 0.1102, "step": 30240 }, { "epoch": 0.7053131648742386, "grad_norm": 1.4974709749221802, "learning_rate": 7.649925387054653e-07, "loss": 0.1159, "step": 30250 }, { "epoch": 0.7055463262510565, "grad_norm": 1.6957372426986694, "learning_rate": 7.649148168873965e-07, "loss": 0.1372, "step": 30260 }, { "epoch": 0.7057794876278745, "grad_norm": 1.4888001680374146, "learning_rate": 7.648370950693278e-07, "loss": 0.1197, "step": 30270 }, { "epoch": 0.7060126490046924, "grad_norm": 3.5354998111724854, "learning_rate": 
7.647593732512591e-07, "loss": 0.1299, "step": 30280 }, { "epoch": 0.7062458103815104, "grad_norm": 2.7392067909240723, "learning_rate": 7.646816514331903e-07, "loss": 0.1246, "step": 30290 }, { "epoch": 0.7064789717583282, "grad_norm": 2.228705644607544, "learning_rate": 7.646039296151216e-07, "loss": 0.1267, "step": 30300 }, { "epoch": 0.7067121331351461, "grad_norm": 4.703065872192383, "learning_rate": 7.645262077970527e-07, "loss": 0.1121, "step": 30310 }, { "epoch": 0.7069452945119641, "grad_norm": 2.6318230628967285, "learning_rate": 7.64448485978984e-07, "loss": 0.1184, "step": 30320 }, { "epoch": 0.707178455888782, "grad_norm": 1.5831444263458252, "learning_rate": 7.643707641609152e-07, "loss": 0.1041, "step": 30330 }, { "epoch": 0.7074116172656, "grad_norm": 1.3854089975357056, "learning_rate": 7.642930423428464e-07, "loss": 0.1181, "step": 30340 }, { "epoch": 0.7076447786424179, "grad_norm": 1.310787558555603, "learning_rate": 7.642153205247777e-07, "loss": 0.1143, "step": 30350 }, { "epoch": 0.7078779400192358, "grad_norm": 2.6229588985443115, "learning_rate": 7.64137598706709e-07, "loss": 0.1257, "step": 30360 }, { "epoch": 0.7081111013960537, "grad_norm": 1.6272969245910645, "learning_rate": 7.640598768886401e-07, "loss": 0.1215, "step": 30370 }, { "epoch": 0.7083442627728717, "grad_norm": 1.170414686203003, "learning_rate": 7.639821550705714e-07, "loss": 0.1203, "step": 30380 }, { "epoch": 0.7085774241496896, "grad_norm": 1.2135339975357056, "learning_rate": 7.639044332525026e-07, "loss": 0.1246, "step": 30390 }, { "epoch": 0.7088105855265076, "grad_norm": 2.774651288986206, "learning_rate": 7.638267114344339e-07, "loss": 0.1203, "step": 30400 }, { "epoch": 0.7090437469033255, "grad_norm": 3.016022205352783, "learning_rate": 7.63748989616365e-07, "loss": 0.1185, "step": 30410 }, { "epoch": 0.7092769082801434, "grad_norm": 2.4733288288116455, "learning_rate": 7.636712677982963e-07, "loss": 0.1166, "step": 30420 }, { "epoch": 0.7095100696569613, 
"grad_norm": 4.198978900909424, "learning_rate": 7.635935459802275e-07, "loss": 0.1238, "step": 30430 }, { "epoch": 0.7097432310337792, "grad_norm": 3.199822425842285, "learning_rate": 7.635158241621587e-07, "loss": 0.1208, "step": 30440 }, { "epoch": 0.7099763924105972, "grad_norm": 2.0859203338623047, "learning_rate": 7.6343810234409e-07, "loss": 0.121, "step": 30450 }, { "epoch": 0.7102095537874151, "grad_norm": 1.50101637840271, "learning_rate": 7.633603805260213e-07, "loss": 0.109, "step": 30460 }, { "epoch": 0.7104427151642331, "grad_norm": 1.3732025623321533, "learning_rate": 7.632826587079525e-07, "loss": 0.1138, "step": 30470 }, { "epoch": 0.710675876541051, "grad_norm": 2.4129951000213623, "learning_rate": 7.632049368898838e-07, "loss": 0.1251, "step": 30480 }, { "epoch": 0.7109090379178689, "grad_norm": 1.5689854621887207, "learning_rate": 7.631272150718148e-07, "loss": 0.1108, "step": 30490 }, { "epoch": 0.7111421992946868, "grad_norm": 3.0812339782714844, "learning_rate": 7.630494932537461e-07, "loss": 0.1277, "step": 30500 }, { "epoch": 0.7113753606715048, "grad_norm": 1.2378058433532715, "learning_rate": 7.629717714356774e-07, "loss": 0.1133, "step": 30510 }, { "epoch": 0.7116085220483227, "grad_norm": 1.7456742525100708, "learning_rate": 7.628940496176086e-07, "loss": 0.1233, "step": 30520 }, { "epoch": 0.7118416834251406, "grad_norm": 1.3332290649414062, "learning_rate": 7.628163277995399e-07, "loss": 0.1195, "step": 30530 }, { "epoch": 0.7120748448019586, "grad_norm": 1.419756293296814, "learning_rate": 7.627386059814711e-07, "loss": 0.1309, "step": 30540 }, { "epoch": 0.7123080061787764, "grad_norm": 1.9083693027496338, "learning_rate": 7.626608841634024e-07, "loss": 0.1155, "step": 30550 }, { "epoch": 0.7125411675555944, "grad_norm": 3.6914174556732178, "learning_rate": 7.625831623453336e-07, "loss": 0.1135, "step": 30560 }, { "epoch": 0.7127743289324123, "grad_norm": 1.2322142124176025, "learning_rate": 7.625054405272647e-07, "loss": 0.1151, 
"step": 30570 }, { "epoch": 0.7130074903092303, "grad_norm": 2.5395138263702393, "learning_rate": 7.62427718709196e-07, "loss": 0.1186, "step": 30580 }, { "epoch": 0.7132406516860482, "grad_norm": 1.779801607131958, "learning_rate": 7.623499968911272e-07, "loss": 0.1148, "step": 30590 }, { "epoch": 0.7134738130628662, "grad_norm": 2.219355344772339, "learning_rate": 7.622722750730585e-07, "loss": 0.1295, "step": 30600 }, { "epoch": 0.7137069744396841, "grad_norm": 2.003683567047119, "learning_rate": 7.621945532549898e-07, "loss": 0.1172, "step": 30610 }, { "epoch": 0.713940135816502, "grad_norm": 1.9225102663040161, "learning_rate": 7.621168314369209e-07, "loss": 0.115, "step": 30620 }, { "epoch": 0.7141732971933199, "grad_norm": 4.1064605712890625, "learning_rate": 7.620391096188522e-07, "loss": 0.1161, "step": 30630 }, { "epoch": 0.7144064585701378, "grad_norm": 1.8537013530731201, "learning_rate": 7.619613878007834e-07, "loss": 0.1286, "step": 30640 }, { "epoch": 0.7146396199469558, "grad_norm": 1.2897616624832153, "learning_rate": 7.618836659827146e-07, "loss": 0.1164, "step": 30650 }, { "epoch": 0.7148727813237737, "grad_norm": 2.7076640129089355, "learning_rate": 7.618059441646459e-07, "loss": 0.1172, "step": 30660 }, { "epoch": 0.7151059427005917, "grad_norm": 1.6159735918045044, "learning_rate": 7.617282223465771e-07, "loss": 0.1124, "step": 30670 }, { "epoch": 0.7153391040774095, "grad_norm": 1.5185283422470093, "learning_rate": 7.616505005285083e-07, "loss": 0.1303, "step": 30680 }, { "epoch": 0.7155722654542275, "grad_norm": 3.0063529014587402, "learning_rate": 7.615727787104395e-07, "loss": 0.1173, "step": 30690 }, { "epoch": 0.7158054268310454, "grad_norm": 2.953054428100586, "learning_rate": 7.615028290741777e-07, "loss": 0.1194, "step": 30700 }, { "epoch": 0.7160385882078634, "grad_norm": 3.3434300422668457, "learning_rate": 7.614251072561088e-07, "loss": 0.1347, "step": 30710 }, { "epoch": 0.7162717495846813, "grad_norm": 1.809855341911316, 
"learning_rate": 7.613473854380401e-07, "loss": 0.1208, "step": 30720 }, { "epoch": 0.7165049109614993, "grad_norm": 1.391929030418396, "learning_rate": 7.612696636199713e-07, "loss": 0.1258, "step": 30730 }, { "epoch": 0.7167380723383172, "grad_norm": 1.72377610206604, "learning_rate": 7.611919418019026e-07, "loss": 0.1068, "step": 30740 }, { "epoch": 0.716971233715135, "grad_norm": 1.2954673767089844, "learning_rate": 7.611142199838339e-07, "loss": 0.1248, "step": 30750 }, { "epoch": 0.717204395091953, "grad_norm": 3.6402318477630615, "learning_rate": 7.610364981657651e-07, "loss": 0.1394, "step": 30760 }, { "epoch": 0.7174375564687709, "grad_norm": 1.175087332725525, "learning_rate": 7.609587763476963e-07, "loss": 0.1207, "step": 30770 }, { "epoch": 0.7176707178455889, "grad_norm": 2.027925968170166, "learning_rate": 7.608810545296274e-07, "loss": 0.1187, "step": 30780 }, { "epoch": 0.7179038792224068, "grad_norm": 1.4276680946350098, "learning_rate": 7.608033327115587e-07, "loss": 0.1182, "step": 30790 }, { "epoch": 0.7181370405992248, "grad_norm": 1.9730452299118042, "learning_rate": 7.6072561089349e-07, "loss": 0.1202, "step": 30800 }, { "epoch": 0.7183702019760426, "grad_norm": 1.247275710105896, "learning_rate": 7.606478890754212e-07, "loss": 0.1201, "step": 30810 }, { "epoch": 0.7186033633528606, "grad_norm": 1.3537489175796509, "learning_rate": 7.605701672573525e-07, "loss": 0.1233, "step": 30820 }, { "epoch": 0.7188365247296785, "grad_norm": 2.1013646125793457, "learning_rate": 7.604924454392837e-07, "loss": 0.1209, "step": 30830 }, { "epoch": 0.7190696861064965, "grad_norm": 1.4536125659942627, "learning_rate": 7.604147236212149e-07, "loss": 0.129, "step": 30840 }, { "epoch": 0.7193028474833144, "grad_norm": 1.6064858436584473, "learning_rate": 7.603370018031462e-07, "loss": 0.1115, "step": 30850 }, { "epoch": 0.7195360088601324, "grad_norm": 1.4355111122131348, "learning_rate": 7.602592799850773e-07, "loss": 0.1168, "step": 30860 }, { "epoch": 
0.7197691702369502, "grad_norm": 1.0880175828933716, "learning_rate": 7.601815581670086e-07, "loss": 0.1178, "step": 30870 }, { "epoch": 0.7200023316137681, "grad_norm": 1.46322762966156, "learning_rate": 7.601038363489399e-07, "loss": 0.1253, "step": 30880 }, { "epoch": 0.7202354929905861, "grad_norm": 5.704732894897461, "learning_rate": 7.60026114530871e-07, "loss": 0.131, "step": 30890 }, { "epoch": 0.720468654367404, "grad_norm": 1.231016993522644, "learning_rate": 7.599483927128023e-07, "loss": 0.1097, "step": 30900 }, { "epoch": 0.720701815744222, "grad_norm": 1.4568228721618652, "learning_rate": 7.598706708947335e-07, "loss": 0.127, "step": 30910 }, { "epoch": 0.7209349771210399, "grad_norm": 1.4414246082305908, "learning_rate": 7.597929490766648e-07, "loss": 0.1193, "step": 30920 }, { "epoch": 0.7211681384978579, "grad_norm": 2.815105438232422, "learning_rate": 7.597152272585961e-07, "loss": 0.117, "step": 30930 }, { "epoch": 0.7214012998746757, "grad_norm": 1.6542953252792358, "learning_rate": 7.596375054405272e-07, "loss": 0.1159, "step": 30940 }, { "epoch": 0.7216344612514937, "grad_norm": 2.397088050842285, "learning_rate": 7.595597836224585e-07, "loss": 0.1415, "step": 30950 }, { "epoch": 0.7218676226283116, "grad_norm": 3.433574914932251, "learning_rate": 7.594820618043896e-07, "loss": 0.121, "step": 30960 }, { "epoch": 0.7221007840051296, "grad_norm": 1.879056453704834, "learning_rate": 7.594043399863209e-07, "loss": 0.1283, "step": 30970 }, { "epoch": 0.7223339453819475, "grad_norm": 1.2865262031555176, "learning_rate": 7.593266181682522e-07, "loss": 0.1131, "step": 30980 }, { "epoch": 0.7225671067587655, "grad_norm": 3.649811029434204, "learning_rate": 7.592488963501834e-07, "loss": 0.1046, "step": 30990 }, { "epoch": 0.7228002681355833, "grad_norm": 1.6015599966049194, "learning_rate": 7.591711745321147e-07, "loss": 0.1073, "step": 31000 }, { "epoch": 0.7230334295124012, "grad_norm": 2.288151741027832, "learning_rate": 7.59093452714046e-07, 
"loss": 0.1247, "step": 31010 }, { "epoch": 0.7232665908892192, "grad_norm": 1.1794754266738892, "learning_rate": 7.59015730895977e-07, "loss": 0.1113, "step": 31020 }, { "epoch": 0.7234997522660371, "grad_norm": 2.4084720611572266, "learning_rate": 7.589380090779083e-07, "loss": 0.1299, "step": 31030 }, { "epoch": 0.7237329136428551, "grad_norm": 1.579944133758545, "learning_rate": 7.588602872598395e-07, "loss": 0.1177, "step": 31040 }, { "epoch": 0.723966075019673, "grad_norm": 1.334720492362976, "learning_rate": 7.587825654417708e-07, "loss": 0.1217, "step": 31050 }, { "epoch": 0.724199236396491, "grad_norm": 1.9611735343933105, "learning_rate": 7.58704843623702e-07, "loss": 0.1245, "step": 31060 }, { "epoch": 0.7244323977733088, "grad_norm": 1.2713810205459595, "learning_rate": 7.586271218056333e-07, "loss": 0.1229, "step": 31070 }, { "epoch": 0.7246655591501268, "grad_norm": 3.5324838161468506, "learning_rate": 7.585493999875645e-07, "loss": 0.1121, "step": 31080 }, { "epoch": 0.7248987205269447, "grad_norm": 1.168473243713379, "learning_rate": 7.584716781694957e-07, "loss": 0.1079, "step": 31090 }, { "epoch": 0.7251318819037627, "grad_norm": 1.2594577074050903, "learning_rate": 7.583939563514269e-07, "loss": 0.1195, "step": 31100 }, { "epoch": 0.7253650432805806, "grad_norm": 4.289937496185303, "learning_rate": 7.583162345333581e-07, "loss": 0.1393, "step": 31110 }, { "epoch": 0.7255982046573985, "grad_norm": 1.3968433141708374, "learning_rate": 7.582385127152894e-07, "loss": 0.114, "step": 31120 }, { "epoch": 0.7258313660342164, "grad_norm": 1.3562617301940918, "learning_rate": 7.581607908972207e-07, "loss": 0.1166, "step": 31130 }, { "epoch": 0.7260645274110343, "grad_norm": 1.709274411201477, "learning_rate": 7.580830690791518e-07, "loss": 0.1145, "step": 31140 }, { "epoch": 0.7262976887878523, "grad_norm": 1.451088547706604, "learning_rate": 7.580053472610831e-07, "loss": 0.1207, "step": 31150 }, { "epoch": 0.7265308501646702, "grad_norm": 
1.590498685836792, "learning_rate": 7.579276254430144e-07, "loss": 0.1139, "step": 31160 }, { "epoch": 0.7267640115414882, "grad_norm": 1.4013714790344238, "learning_rate": 7.578499036249456e-07, "loss": 0.1192, "step": 31170 }, { "epoch": 0.7269971729183061, "grad_norm": 3.3551061153411865, "learning_rate": 7.577721818068768e-07, "loss": 0.1175, "step": 31180 }, { "epoch": 0.727230334295124, "grad_norm": 1.6000864505767822, "learning_rate": 7.57694459988808e-07, "loss": 0.1068, "step": 31190 }, { "epoch": 0.7274634956719419, "grad_norm": 1.5826541185379028, "learning_rate": 7.576167381707392e-07, "loss": 0.1181, "step": 31200 }, { "epoch": 0.7276966570487599, "grad_norm": 1.5623701810836792, "learning_rate": 7.575390163526704e-07, "loss": 0.1207, "step": 31210 }, { "epoch": 0.7279298184255778, "grad_norm": 2.0661330223083496, "learning_rate": 7.574612945346017e-07, "loss": 0.1074, "step": 31220 }, { "epoch": 0.7281629798023957, "grad_norm": 1.6717883348464966, "learning_rate": 7.57383572716533e-07, "loss": 0.1342, "step": 31230 }, { "epoch": 0.7283961411792137, "grad_norm": 1.9710713624954224, "learning_rate": 7.573058508984642e-07, "loss": 0.1159, "step": 31240 }, { "epoch": 0.7286293025560316, "grad_norm": 2.1378016471862793, "learning_rate": 7.572281290803955e-07, "loss": 0.1364, "step": 31250 }, { "epoch": 0.7288624639328495, "grad_norm": 3.199922561645508, "learning_rate": 7.571504072623265e-07, "loss": 0.1178, "step": 31260 }, { "epoch": 0.7290956253096674, "grad_norm": 1.6823121309280396, "learning_rate": 7.570726854442578e-07, "loss": 0.1255, "step": 31270 }, { "epoch": 0.7293287866864854, "grad_norm": 1.4936137199401855, "learning_rate": 7.569949636261891e-07, "loss": 0.1284, "step": 31280 }, { "epoch": 0.7295619480633033, "grad_norm": 1.5958493947982788, "learning_rate": 7.569172418081203e-07, "loss": 0.1103, "step": 31290 }, { "epoch": 0.7297951094401213, "grad_norm": 2.71396541595459, "learning_rate": 7.568395199900516e-07, "loss": 0.1174, "step": 
31300 }, { "epoch": 0.7300282708169392, "grad_norm": 1.3700635433197021, "learning_rate": 7.567617981719829e-07, "loss": 0.1263, "step": 31310 }, { "epoch": 0.730261432193757, "grad_norm": 2.8174872398376465, "learning_rate": 7.566840763539141e-07, "loss": 0.1164, "step": 31320 }, { "epoch": 0.730494593570575, "grad_norm": 2.8045971393585205, "learning_rate": 7.566063545358453e-07, "loss": 0.1137, "step": 31330 }, { "epoch": 0.730727754947393, "grad_norm": 1.5944305658340454, "learning_rate": 7.565286327177764e-07, "loss": 0.1192, "step": 31340 }, { "epoch": 0.7309609163242109, "grad_norm": 2.097888231277466, "learning_rate": 7.564509108997077e-07, "loss": 0.1209, "step": 31350 }, { "epoch": 0.7311940777010288, "grad_norm": 4.367529392242432, "learning_rate": 7.56373189081639e-07, "loss": 0.1342, "step": 31360 }, { "epoch": 0.7314272390778468, "grad_norm": 1.5485496520996094, "learning_rate": 7.562954672635702e-07, "loss": 0.1187, "step": 31370 }, { "epoch": 0.7316604004546647, "grad_norm": 1.7057390213012695, "learning_rate": 7.562177454455015e-07, "loss": 0.1072, "step": 31380 }, { "epoch": 0.7318935618314826, "grad_norm": 2.2658743858337402, "learning_rate": 7.561400236274326e-07, "loss": 0.1147, "step": 31390 }, { "epoch": 0.7321267232083005, "grad_norm": 1.1913468837738037, "learning_rate": 7.560623018093639e-07, "loss": 0.1218, "step": 31400 }, { "epoch": 0.7323598845851185, "grad_norm": 1.7585375308990479, "learning_rate": 7.559845799912952e-07, "loss": 0.119, "step": 31410 }, { "epoch": 0.7325930459619364, "grad_norm": 1.8430142402648926, "learning_rate": 7.559068581732263e-07, "loss": 0.1334, "step": 31420 }, { "epoch": 0.7328262073387544, "grad_norm": 1.5603179931640625, "learning_rate": 7.558291363551576e-07, "loss": 0.1148, "step": 31430 }, { "epoch": 0.7330593687155723, "grad_norm": 3.534588575363159, "learning_rate": 7.557514145370888e-07, "loss": 0.1382, "step": 31440 }, { "epoch": 0.7332925300923901, "grad_norm": 1.4334088563919067, "learning_rate": 
7.5567369271902e-07, "loss": 0.1266, "step": 31450 }, { "epoch": 0.7335256914692081, "grad_norm": 1.556148648262024, "learning_rate": 7.555959709009513e-07, "loss": 0.1326, "step": 31460 }, { "epoch": 0.733758852846026, "grad_norm": 3.5903639793395996, "learning_rate": 7.555182490828825e-07, "loss": 0.1098, "step": 31470 }, { "epoch": 0.733992014222844, "grad_norm": 3.327434539794922, "learning_rate": 7.554405272648138e-07, "loss": 0.1297, "step": 31480 }, { "epoch": 0.7342251755996619, "grad_norm": 1.4823206663131714, "learning_rate": 7.55362805446745e-07, "loss": 0.1096, "step": 31490 }, { "epoch": 0.7344583369764799, "grad_norm": 1.636248230934143, "learning_rate": 7.552850836286762e-07, "loss": 0.1206, "step": 31500 }, { "epoch": 0.7346914983532977, "grad_norm": 4.267000198364258, "learning_rate": 7.552073618106074e-07, "loss": 0.1226, "step": 31510 }, { "epoch": 0.7349246597301157, "grad_norm": 2.9919652938842773, "learning_rate": 7.551296399925386e-07, "loss": 0.1177, "step": 31520 }, { "epoch": 0.7351578211069336, "grad_norm": 1.3290094137191772, "learning_rate": 7.550519181744699e-07, "loss": 0.1081, "step": 31530 }, { "epoch": 0.7353909824837516, "grad_norm": 4.0825724601745605, "learning_rate": 7.549741963564011e-07, "loss": 0.1175, "step": 31540 }, { "epoch": 0.7356241438605695, "grad_norm": 1.3261322975158691, "learning_rate": 7.548964745383324e-07, "loss": 0.1174, "step": 31550 }, { "epoch": 0.7358573052373875, "grad_norm": 1.4914995431900024, "learning_rate": 7.548187527202637e-07, "loss": 0.1207, "step": 31560 }, { "epoch": 0.7360904666142054, "grad_norm": 1.4627403020858765, "learning_rate": 7.547410309021948e-07, "loss": 0.1332, "step": 31570 }, { "epoch": 0.7363236279910232, "grad_norm": 1.9978487491607666, "learning_rate": 7.54663309084126e-07, "loss": 0.1184, "step": 31580 }, { "epoch": 0.7365567893678412, "grad_norm": 1.1231664419174194, "learning_rate": 7.545855872660572e-07, "loss": 0.118, "step": 31590 }, { "epoch": 0.7367899507446591, 
"grad_norm": 2.8782150745391846, "learning_rate": 7.545156376297954e-07, "loss": 0.1153, "step": 31600 }, { "epoch": 0.7370231121214771, "grad_norm": 1.5053108930587769, "learning_rate": 7.544379158117266e-07, "loss": 0.1206, "step": 31610 }, { "epoch": 0.737256273498295, "grad_norm": 1.516543984413147, "learning_rate": 7.543601939936579e-07, "loss": 0.1344, "step": 31620 }, { "epoch": 0.737489434875113, "grad_norm": 1.656180739402771, "learning_rate": 7.54282472175589e-07, "loss": 0.1229, "step": 31630 }, { "epoch": 0.7377225962519308, "grad_norm": 3.639289140701294, "learning_rate": 7.542047503575203e-07, "loss": 0.1297, "step": 31640 }, { "epoch": 0.7379557576287488, "grad_norm": 1.1028677225112915, "learning_rate": 7.541270285394516e-07, "loss": 0.1065, "step": 31650 }, { "epoch": 0.7381889190055667, "grad_norm": 1.3653680086135864, "learning_rate": 7.540493067213827e-07, "loss": 0.1076, "step": 31660 }, { "epoch": 0.7384220803823847, "grad_norm": 2.771355390548706, "learning_rate": 7.53971584903314e-07, "loss": 0.1257, "step": 31670 }, { "epoch": 0.7386552417592026, "grad_norm": 1.3789387941360474, "learning_rate": 7.538938630852453e-07, "loss": 0.1216, "step": 31680 }, { "epoch": 0.7388884031360206, "grad_norm": 1.907094120979309, "learning_rate": 7.538161412671765e-07, "loss": 0.1282, "step": 31690 }, { "epoch": 0.7391215645128385, "grad_norm": 2.358196973800659, "learning_rate": 7.537384194491078e-07, "loss": 0.1236, "step": 31700 }, { "epoch": 0.7393547258896563, "grad_norm": 1.53011155128479, "learning_rate": 7.536606976310389e-07, "loss": 0.1182, "step": 31710 }, { "epoch": 0.7395878872664743, "grad_norm": 1.2134630680084229, "learning_rate": 7.535829758129701e-07, "loss": 0.1129, "step": 31720 }, { "epoch": 0.7398210486432922, "grad_norm": 2.6222140789031982, "learning_rate": 7.535052539949014e-07, "loss": 0.1295, "step": 31730 }, { "epoch": 0.7400542100201102, "grad_norm": 1.289207100868225, "learning_rate": 7.534275321768326e-07, "loss": 0.1313, 
"step": 31740 }, { "epoch": 0.7402873713969281, "grad_norm": 1.337177038192749, "learning_rate": 7.533498103587639e-07, "loss": 0.1201, "step": 31750 }, { "epoch": 0.7405205327737461, "grad_norm": 1.1250343322753906, "learning_rate": 7.532720885406951e-07, "loss": 0.1237, "step": 31760 }, { "epoch": 0.7407536941505639, "grad_norm": 1.337643027305603, "learning_rate": 7.531943667226264e-07, "loss": 0.1198, "step": 31770 }, { "epoch": 0.7409868555273819, "grad_norm": 1.7990477085113525, "learning_rate": 7.531166449045577e-07, "loss": 0.1218, "step": 31780 }, { "epoch": 0.7412200169041998, "grad_norm": 2.3886559009552, "learning_rate": 7.530389230864887e-07, "loss": 0.119, "step": 31790 }, { "epoch": 0.7414531782810178, "grad_norm": 4.459692478179932, "learning_rate": 7.5296120126842e-07, "loss": 0.1212, "step": 31800 }, { "epoch": 0.7416863396578357, "grad_norm": 1.3654255867004395, "learning_rate": 7.528834794503512e-07, "loss": 0.1176, "step": 31810 }, { "epoch": 0.7419195010346537, "grad_norm": 1.406860113143921, "learning_rate": 7.528057576322825e-07, "loss": 0.121, "step": 31820 }, { "epoch": 0.7421526624114715, "grad_norm": 1.4226266145706177, "learning_rate": 7.527280358142138e-07, "loss": 0.1253, "step": 31830 }, { "epoch": 0.7423858237882894, "grad_norm": 3.100442409515381, "learning_rate": 7.52650313996145e-07, "loss": 0.1082, "step": 31840 }, { "epoch": 0.7426189851651074, "grad_norm": 1.7776358127593994, "learning_rate": 7.525725921780762e-07, "loss": 0.1181, "step": 31850 }, { "epoch": 0.7428521465419253, "grad_norm": 2.990352153778076, "learning_rate": 7.524948703600074e-07, "loss": 0.1313, "step": 31860 }, { "epoch": 0.7430853079187433, "grad_norm": 1.5753422975540161, "learning_rate": 7.524171485419386e-07, "loss": 0.1188, "step": 31870 }, { "epoch": 0.7433184692955612, "grad_norm": 2.9874954223632812, "learning_rate": 7.523394267238699e-07, "loss": 0.1221, "step": 31880 }, { "epoch": 0.7435516306723792, "grad_norm": 1.331103801727295, 
"learning_rate": 7.522617049058011e-07, "loss": 0.1263, "step": 31890 }, { "epoch": 0.743784792049197, "grad_norm": 1.549943447113037, "learning_rate": 7.521839830877324e-07, "loss": 0.1157, "step": 31900 }, { "epoch": 0.744017953426015, "grad_norm": 2.5107007026672363, "learning_rate": 7.521062612696635e-07, "loss": 0.1138, "step": 31910 }, { "epoch": 0.7442511148028329, "grad_norm": 4.271908283233643, "learning_rate": 7.520285394515948e-07, "loss": 0.1239, "step": 31920 }, { "epoch": 0.7444842761796509, "grad_norm": 2.344697952270508, "learning_rate": 7.519508176335261e-07, "loss": 0.1191, "step": 31930 }, { "epoch": 0.7447174375564688, "grad_norm": 1.4862971305847168, "learning_rate": 7.518730958154573e-07, "loss": 0.1236, "step": 31940 }, { "epoch": 0.7449505989332867, "grad_norm": 2.4937963485717773, "learning_rate": 7.517953739973885e-07, "loss": 0.1153, "step": 31950 }, { "epoch": 0.7451837603101046, "grad_norm": 3.5278425216674805, "learning_rate": 7.517176521793198e-07, "loss": 0.1286, "step": 31960 }, { "epoch": 0.7454169216869225, "grad_norm": 1.7890582084655762, "learning_rate": 7.516399303612509e-07, "loss": 0.1168, "step": 31970 }, { "epoch": 0.7456500830637405, "grad_norm": 1.3567349910736084, "learning_rate": 7.515622085431822e-07, "loss": 0.1337, "step": 31980 }, { "epoch": 0.7458832444405584, "grad_norm": 2.657165288925171, "learning_rate": 7.514844867251134e-07, "loss": 0.1089, "step": 31990 }, { "epoch": 0.7461164058173764, "grad_norm": 1.462013840675354, "learning_rate": 7.514067649070447e-07, "loss": 0.129, "step": 32000 }, { "epoch": 0.7463495671941943, "grad_norm": 1.3753042221069336, "learning_rate": 7.51329043088976e-07, "loss": 0.1179, "step": 32010 }, { "epoch": 0.7465827285710123, "grad_norm": 2.26572322845459, "learning_rate": 7.512513212709072e-07, "loss": 0.1126, "step": 32020 }, { "epoch": 0.7468158899478301, "grad_norm": 2.152747631072998, "learning_rate": 7.511735994528383e-07, "loss": 0.1272, "step": 32030 }, { "epoch": 
0.747049051324648, "grad_norm": 1.1867254972457886, "learning_rate": 7.510958776347695e-07, "loss": 0.1161, "step": 32040 }, { "epoch": 0.747282212701466, "grad_norm": 3.1454577445983887, "learning_rate": 7.510181558167008e-07, "loss": 0.137, "step": 32050 }, { "epoch": 0.747515374078284, "grad_norm": 2.6143264770507812, "learning_rate": 7.509404339986321e-07, "loss": 0.1293, "step": 32060 }, { "epoch": 0.7477485354551019, "grad_norm": 1.526131510734558, "learning_rate": 7.508627121805633e-07, "loss": 0.1128, "step": 32070 }, { "epoch": 0.7479816968319198, "grad_norm": 1.4478635787963867, "learning_rate": 7.507849903624946e-07, "loss": 0.1231, "step": 32080 }, { "epoch": 0.7482148582087377, "grad_norm": 1.0969911813735962, "learning_rate": 7.507072685444257e-07, "loss": 0.1068, "step": 32090 }, { "epoch": 0.7484480195855556, "grad_norm": 1.3987451791763306, "learning_rate": 7.50629546726357e-07, "loss": 0.1174, "step": 32100 }, { "epoch": 0.7486811809623736, "grad_norm": 1.180733561515808, "learning_rate": 7.505518249082882e-07, "loss": 0.1154, "step": 32110 }, { "epoch": 0.7489143423391915, "grad_norm": 3.8908376693725586, "learning_rate": 7.504741030902194e-07, "loss": 0.1244, "step": 32120 }, { "epoch": 0.7491475037160095, "grad_norm": 1.2200062274932861, "learning_rate": 7.503963812721507e-07, "loss": 0.1186, "step": 32130 }, { "epoch": 0.7493806650928274, "grad_norm": 1.1763430833816528, "learning_rate": 7.503186594540819e-07, "loss": 0.1231, "step": 32140 }, { "epoch": 0.7496138264696453, "grad_norm": 2.592940092086792, "learning_rate": 7.502409376360132e-07, "loss": 0.1172, "step": 32150 }, { "epoch": 0.7498469878464632, "grad_norm": 1.9179837703704834, "learning_rate": 7.501632158179444e-07, "loss": 0.1246, "step": 32160 }, { "epoch": 0.7500801492232811, "grad_norm": 1.582180380821228, "learning_rate": 7.500854939998756e-07, "loss": 0.1226, "step": 32170 }, { "epoch": 0.7503133106000991, "grad_norm": 1.4564412832260132, "learning_rate": 
7.500077721818069e-07, "loss": 0.1202, "step": 32180 }, { "epoch": 0.750546471976917, "grad_norm": 1.6412906646728516, "learning_rate": 7.49930050363738e-07, "loss": 0.1144, "step": 32190 }, { "epoch": 0.750779633353735, "grad_norm": 1.3840478658676147, "learning_rate": 7.498523285456693e-07, "loss": 0.1182, "step": 32200 }, { "epoch": 0.7510127947305529, "grad_norm": 3.201317071914673, "learning_rate": 7.497746067276006e-07, "loss": 0.1256, "step": 32210 }, { "epoch": 0.7512459561073708, "grad_norm": 1.549086093902588, "learning_rate": 7.496968849095317e-07, "loss": 0.1301, "step": 32220 }, { "epoch": 0.7514791174841887, "grad_norm": 1.4415481090545654, "learning_rate": 7.49619163091463e-07, "loss": 0.1255, "step": 32230 }, { "epoch": 0.7517122788610067, "grad_norm": 1.2030715942382812, "learning_rate": 7.495414412733942e-07, "loss": 0.1196, "step": 32240 }, { "epoch": 0.7519454402378246, "grad_norm": 1.3380696773529053, "learning_rate": 7.494637194553255e-07, "loss": 0.1291, "step": 32250 }, { "epoch": 0.7521786016146426, "grad_norm": 1.8775852918624878, "learning_rate": 7.493859976372568e-07, "loss": 0.1118, "step": 32260 }, { "epoch": 0.7524117629914605, "grad_norm": 1.1218876838684082, "learning_rate": 7.493082758191879e-07, "loss": 0.1195, "step": 32270 }, { "epoch": 0.7526449243682783, "grad_norm": 1.1359374523162842, "learning_rate": 7.492305540011191e-07, "loss": 0.1172, "step": 32280 }, { "epoch": 0.7528780857450963, "grad_norm": 1.4911030530929565, "learning_rate": 7.491528321830503e-07, "loss": 0.1253, "step": 32290 }, { "epoch": 0.7531112471219142, "grad_norm": 1.3204271793365479, "learning_rate": 7.490751103649816e-07, "loss": 0.1246, "step": 32300 }, { "epoch": 0.7533444084987322, "grad_norm": 1.3440028429031372, "learning_rate": 7.489973885469129e-07, "loss": 0.1159, "step": 32310 }, { "epoch": 0.7535775698755501, "grad_norm": 2.42948317527771, "learning_rate": 7.489196667288441e-07, "loss": 0.1224, "step": 32320 }, { "epoch": 0.7538107312523681, 
"grad_norm": 1.9660001993179321, "learning_rate": 7.488419449107754e-07, "loss": 0.1226, "step": 32330 }, { "epoch": 0.754043892629186, "grad_norm": 1.3539535999298096, "learning_rate": 7.487642230927065e-07, "loss": 0.1171, "step": 32340 }, { "epoch": 0.7542770540060039, "grad_norm": 2.424574613571167, "learning_rate": 7.486865012746377e-07, "loss": 0.1255, "step": 32350 }, { "epoch": 0.7545102153828218, "grad_norm": 4.505469799041748, "learning_rate": 7.48608779456569e-07, "loss": 0.1267, "step": 32360 }, { "epoch": 0.7547433767596398, "grad_norm": 3.765307903289795, "learning_rate": 7.485310576385002e-07, "loss": 0.1224, "step": 32370 }, { "epoch": 0.7549765381364577, "grad_norm": 2.4289655685424805, "learning_rate": 7.484533358204315e-07, "loss": 0.1222, "step": 32380 }, { "epoch": 0.7552096995132757, "grad_norm": 2.196495771408081, "learning_rate": 7.483756140023628e-07, "loss": 0.1192, "step": 32390 }, { "epoch": 0.7554428608900936, "grad_norm": 2.0056543350219727, "learning_rate": 7.482978921842939e-07, "loss": 0.1077, "step": 32400 }, { "epoch": 0.7556760222669114, "grad_norm": 1.571433186531067, "learning_rate": 7.482201703662252e-07, "loss": 0.1148, "step": 32410 }, { "epoch": 0.7559091836437294, "grad_norm": 1.2638152837753296, "learning_rate": 7.481424485481564e-07, "loss": 0.1084, "step": 32420 }, { "epoch": 0.7561423450205473, "grad_norm": 1.146689772605896, "learning_rate": 7.480647267300877e-07, "loss": 0.1107, "step": 32430 }, { "epoch": 0.7563755063973653, "grad_norm": 1.7797540426254272, "learning_rate": 7.479870049120189e-07, "loss": 0.1252, "step": 32440 }, { "epoch": 0.7566086677741832, "grad_norm": 2.512620210647583, "learning_rate": 7.479092830939501e-07, "loss": 0.1313, "step": 32450 }, { "epoch": 0.7568418291510012, "grad_norm": 2.1354761123657227, "learning_rate": 7.478315612758813e-07, "loss": 0.1138, "step": 32460 }, { "epoch": 0.757074990527819, "grad_norm": 1.5092006921768188, "learning_rate": 7.477538394578125e-07, "loss": 0.1231, 
"step": 32470 }, { "epoch": 0.757308151904637, "grad_norm": 1.1532692909240723, "learning_rate": 7.476761176397438e-07, "loss": 0.1117, "step": 32480 }, { "epoch": 0.7575413132814549, "grad_norm": 3.266808032989502, "learning_rate": 7.475983958216751e-07, "loss": 0.1167, "step": 32490 }, { "epoch": 0.7577744746582729, "grad_norm": 3.172377347946167, "learning_rate": 7.475206740036063e-07, "loss": 0.1136, "step": 32500 }, { "epoch": 0.7580076360350908, "grad_norm": 2.8556172847747803, "learning_rate": 7.474429521855376e-07, "loss": 0.1249, "step": 32510 }, { "epoch": 0.7582407974119088, "grad_norm": 2.394512414932251, "learning_rate": 7.473652303674686e-07, "loss": 0.1216, "step": 32520 }, { "epoch": 0.7584739587887267, "grad_norm": 1.9202673435211182, "learning_rate": 7.472875085493999e-07, "loss": 0.1112, "step": 32530 }, { "epoch": 0.7587071201655445, "grad_norm": 1.5771698951721191, "learning_rate": 7.472097867313312e-07, "loss": 0.1175, "step": 32540 }, { "epoch": 0.7589402815423625, "grad_norm": 1.0422261953353882, "learning_rate": 7.471320649132624e-07, "loss": 0.1271, "step": 32550 }, { "epoch": 0.7591734429191804, "grad_norm": 2.083444833755493, "learning_rate": 7.470543430951937e-07, "loss": 0.1194, "step": 32560 }, { "epoch": 0.7594066042959984, "grad_norm": 3.9856886863708496, "learning_rate": 7.469766212771249e-07, "loss": 0.1202, "step": 32570 }, { "epoch": 0.7596397656728163, "grad_norm": 3.387214422225952, "learning_rate": 7.468988994590562e-07, "loss": 0.1241, "step": 32580 }, { "epoch": 0.7598729270496343, "grad_norm": 1.6567586660385132, "learning_rate": 7.468211776409874e-07, "loss": 0.1282, "step": 32590 }, { "epoch": 0.7601060884264521, "grad_norm": 2.303229570388794, "learning_rate": 7.467434558229185e-07, "loss": 0.1215, "step": 32600 }, { "epoch": 0.7603392498032701, "grad_norm": 3.2946982383728027, "learning_rate": 7.466657340048498e-07, "loss": 0.1254, "step": 32610 }, { "epoch": 0.760572411180088, "grad_norm": 1.0878403186798096, 
"learning_rate": 7.46588012186781e-07, "loss": 0.1246, "step": 32620 }, { "epoch": 0.760805572556906, "grad_norm": 2.4871349334716797, "learning_rate": 7.465102903687123e-07, "loss": 0.1197, "step": 32630 }, { "epoch": 0.7610387339337239, "grad_norm": 2.593127489089966, "learning_rate": 7.464325685506436e-07, "loss": 0.1204, "step": 32640 }, { "epoch": 0.7612718953105418, "grad_norm": 3.04331374168396, "learning_rate": 7.463548467325747e-07, "loss": 0.1257, "step": 32650 }, { "epoch": 0.7615050566873598, "grad_norm": 2.2228052616119385, "learning_rate": 7.46277124914506e-07, "loss": 0.1228, "step": 32660 }, { "epoch": 0.7617382180641776, "grad_norm": 1.2189223766326904, "learning_rate": 7.461994030964372e-07, "loss": 0.1162, "step": 32670 }, { "epoch": 0.7619713794409956, "grad_norm": 1.4930039644241333, "learning_rate": 7.461216812783684e-07, "loss": 0.1215, "step": 32680 }, { "epoch": 0.7622045408178135, "grad_norm": 1.6297338008880615, "learning_rate": 7.460439594602997e-07, "loss": 0.118, "step": 32690 }, { "epoch": 0.7624377021946315, "grad_norm": 1.9237618446350098, "learning_rate": 7.459662376422309e-07, "loss": 0.1164, "step": 32700 }, { "epoch": 0.7626708635714494, "grad_norm": 1.945295810699463, "learning_rate": 7.458885158241621e-07, "loss": 0.1284, "step": 32710 }, { "epoch": 0.7629040249482674, "grad_norm": 1.2883166074752808, "learning_rate": 7.458107940060933e-07, "loss": 0.1243, "step": 32720 }, { "epoch": 0.7631371863250852, "grad_norm": 1.1336981058120728, "learning_rate": 7.457330721880246e-07, "loss": 0.122, "step": 32730 }, { "epoch": 0.7633703477019032, "grad_norm": 1.1880720853805542, "learning_rate": 7.456553503699559e-07, "loss": 0.1118, "step": 32740 }, { "epoch": 0.7636035090787211, "grad_norm": 2.538309097290039, "learning_rate": 7.455776285518871e-07, "loss": 0.1136, "step": 32750 }, { "epoch": 0.763836670455539, "grad_norm": 1.4763824939727783, "learning_rate": 7.454999067338183e-07, "loss": 0.1166, "step": 32760 }, { "epoch": 
0.764069831832357, "grad_norm": 1.8564587831497192, "learning_rate": 7.454221849157494e-07, "loss": 0.1206, "step": 32770 }, { "epoch": 0.7643029932091749, "grad_norm": 2.2677624225616455, "learning_rate": 7.453444630976807e-07, "loss": 0.1329, "step": 32780 }, { "epoch": 0.7645361545859928, "grad_norm": 1.527330756187439, "learning_rate": 7.45266741279612e-07, "loss": 0.1262, "step": 32790 }, { "epoch": 0.7647693159628107, "grad_norm": 1.8901512622833252, "learning_rate": 7.451890194615432e-07, "loss": 0.1197, "step": 32800 }, { "epoch": 0.7650024773396287, "grad_norm": 1.7299295663833618, "learning_rate": 7.451112976434745e-07, "loss": 0.1161, "step": 32810 }, { "epoch": 0.7652356387164466, "grad_norm": 1.2918483018875122, "learning_rate": 7.450335758254058e-07, "loss": 0.1252, "step": 32820 }, { "epoch": 0.7654688000932646, "grad_norm": 2.800081729888916, "learning_rate": 7.449558540073369e-07, "loss": 0.1142, "step": 32830 }, { "epoch": 0.7657019614700825, "grad_norm": 2.1397647857666016, "learning_rate": 7.448781321892681e-07, "loss": 0.1145, "step": 32840 }, { "epoch": 0.7659351228469005, "grad_norm": 1.979758381843567, "learning_rate": 7.448004103711993e-07, "loss": 0.1132, "step": 32850 }, { "epoch": 0.7661682842237183, "grad_norm": 2.4767372608184814, "learning_rate": 7.447226885531306e-07, "loss": 0.1131, "step": 32860 }, { "epoch": 0.7664014456005362, "grad_norm": 1.6046152114868164, "learning_rate": 7.446449667350619e-07, "loss": 0.1328, "step": 32870 }, { "epoch": 0.7666346069773542, "grad_norm": 2.0425076484680176, "learning_rate": 7.445672449169931e-07, "loss": 0.114, "step": 32880 }, { "epoch": 0.7668677683541721, "grad_norm": 1.249509572982788, "learning_rate": 7.444895230989243e-07, "loss": 0.1111, "step": 32890 }, { "epoch": 0.7671009297309901, "grad_norm": 1.772524118423462, "learning_rate": 7.444118012808555e-07, "loss": 0.1262, "step": 32900 }, { "epoch": 0.767334091107808, "grad_norm": 1.370896577835083, "learning_rate": 
7.443340794627868e-07, "loss": 0.1282, "step": 32910 }, { "epoch": 0.7675672524846259, "grad_norm": 2.016706705093384, "learning_rate": 7.44256357644718e-07, "loss": 0.1294, "step": 32920 }, { "epoch": 0.7678004138614438, "grad_norm": 4.156460762023926, "learning_rate": 7.441786358266492e-07, "loss": 0.1194, "step": 32930 }, { "epoch": 0.7680335752382618, "grad_norm": 1.9544976949691772, "learning_rate": 7.441009140085805e-07, "loss": 0.1228, "step": 32940 }, { "epoch": 0.7682667366150797, "grad_norm": 1.5394972562789917, "learning_rate": 7.440231921905116e-07, "loss": 0.1216, "step": 32950 }, { "epoch": 0.7684998979918977, "grad_norm": 1.6590887308120728, "learning_rate": 7.439454703724429e-07, "loss": 0.1207, "step": 32960 }, { "epoch": 0.7687330593687156, "grad_norm": 1.077072262763977, "learning_rate": 7.438677485543742e-07, "loss": 0.1203, "step": 32970 }, { "epoch": 0.7689662207455336, "grad_norm": 2.471179723739624, "learning_rate": 7.437900267363054e-07, "loss": 0.1261, "step": 32980 }, { "epoch": 0.7691993821223514, "grad_norm": 3.00272798538208, "learning_rate": 7.437123049182367e-07, "loss": 0.1327, "step": 32990 }, { "epoch": 0.7694325434991693, "grad_norm": 1.367388129234314, "learning_rate": 7.436345831001678e-07, "loss": 0.1091, "step": 33000 }, { "epoch": 0.7696657048759873, "grad_norm": 1.327991247177124, "learning_rate": 7.43556861282099e-07, "loss": 0.1207, "step": 33010 }, { "epoch": 0.7698988662528052, "grad_norm": 1.9573789834976196, "learning_rate": 7.434791394640303e-07, "loss": 0.111, "step": 33020 }, { "epoch": 0.7701320276296232, "grad_norm": 2.0545918941497803, "learning_rate": 7.434014176459615e-07, "loss": 0.1202, "step": 33030 }, { "epoch": 0.7703651890064411, "grad_norm": 1.4882652759552002, "learning_rate": 7.433236958278928e-07, "loss": 0.1118, "step": 33040 }, { "epoch": 0.770598350383259, "grad_norm": 1.495328426361084, "learning_rate": 7.43245974009824e-07, "loss": 0.12, "step": 33050 }, { "epoch": 0.7708315117600769, 
"grad_norm": 1.2494486570358276, "learning_rate": 7.431682521917553e-07, "loss": 0.1177, "step": 33060 }, { "epoch": 0.7710646731368949, "grad_norm": 2.4017140865325928, "learning_rate": 7.430905303736866e-07, "loss": 0.1141, "step": 33070 }, { "epoch": 0.7712978345137128, "grad_norm": 1.3093091249465942, "learning_rate": 7.430128085556176e-07, "loss": 0.1149, "step": 33080 }, { "epoch": 0.7715309958905308, "grad_norm": 2.4499998092651367, "learning_rate": 7.429350867375489e-07, "loss": 0.1135, "step": 33090 }, { "epoch": 0.7717641572673487, "grad_norm": 2.8267533779144287, "learning_rate": 7.428573649194801e-07, "loss": 0.1156, "step": 33100 }, { "epoch": 0.7719973186441665, "grad_norm": 2.838663101196289, "learning_rate": 7.427796431014114e-07, "loss": 0.1261, "step": 33110 }, { "epoch": 0.7722304800209845, "grad_norm": 1.5904847383499146, "learning_rate": 7.427019212833427e-07, "loss": 0.1114, "step": 33120 }, { "epoch": 0.7724636413978024, "grad_norm": 1.5460801124572754, "learning_rate": 7.426241994652739e-07, "loss": 0.1124, "step": 33130 }, { "epoch": 0.7726968027746204, "grad_norm": 2.775505781173706, "learning_rate": 7.425464776472051e-07, "loss": 0.1118, "step": 33140 }, { "epoch": 0.7729299641514383, "grad_norm": 1.336013913154602, "learning_rate": 7.424687558291363e-07, "loss": 0.1292, "step": 33150 }, { "epoch": 0.7731631255282563, "grad_norm": 1.4481277465820312, "learning_rate": 7.423910340110675e-07, "loss": 0.12, "step": 33160 }, { "epoch": 0.7733962869050742, "grad_norm": 1.6442854404449463, "learning_rate": 7.423133121929988e-07, "loss": 0.1156, "step": 33170 }, { "epoch": 0.7736294482818921, "grad_norm": 2.2239832878112793, "learning_rate": 7.4223559037493e-07, "loss": 0.1101, "step": 33180 }, { "epoch": 0.77386260965871, "grad_norm": 1.8858264684677124, "learning_rate": 7.421578685568613e-07, "loss": 0.1179, "step": 33190 }, { "epoch": 0.774095771035528, "grad_norm": 1.6684008836746216, "learning_rate": 7.420801467387924e-07, "loss": 0.1203, 
"step": 33200 }, { "epoch": 0.7743289324123459, "grad_norm": 2.1366896629333496, "learning_rate": 7.420024249207237e-07, "loss": 0.116, "step": 33210 }, { "epoch": 0.7745620937891639, "grad_norm": 2.306845188140869, "learning_rate": 7.41924703102655e-07, "loss": 0.1175, "step": 33220 }, { "epoch": 0.7747952551659818, "grad_norm": 1.1665877103805542, "learning_rate": 7.418469812845862e-07, "loss": 0.1148, "step": 33230 }, { "epoch": 0.7750284165427996, "grad_norm": 2.5177228450775146, "learning_rate": 7.417692594665174e-07, "loss": 0.1281, "step": 33240 }, { "epoch": 0.7752615779196176, "grad_norm": 1.4547170400619507, "learning_rate": 7.416915376484487e-07, "loss": 0.1137, "step": 33250 }, { "epoch": 0.7754947392964355, "grad_norm": 1.284417748451233, "learning_rate": 7.416138158303798e-07, "loss": 0.1175, "step": 33260 }, { "epoch": 0.7757279006732535, "grad_norm": 3.0313968658447266, "learning_rate": 7.415360940123111e-07, "loss": 0.1119, "step": 33270 }, { "epoch": 0.7759610620500714, "grad_norm": 2.3754475116729736, "learning_rate": 7.414583721942423e-07, "loss": 0.1307, "step": 33280 }, { "epoch": 0.7761942234268894, "grad_norm": 3.705440044403076, "learning_rate": 7.413806503761736e-07, "loss": 0.124, "step": 33290 }, { "epoch": 0.7764273848037073, "grad_norm": 1.5397037267684937, "learning_rate": 7.413029285581049e-07, "loss": 0.1293, "step": 33300 }, { "epoch": 0.7766605461805252, "grad_norm": 1.6756362915039062, "learning_rate": 7.412252067400361e-07, "loss": 0.1296, "step": 33310 }, { "epoch": 0.7768937075573431, "grad_norm": 2.005559206008911, "learning_rate": 7.411474849219672e-07, "loss": 0.1268, "step": 33320 }, { "epoch": 0.777126868934161, "grad_norm": 3.0340380668640137, "learning_rate": 7.410697631038984e-07, "loss": 0.1214, "step": 33330 }, { "epoch": 0.777360030310979, "grad_norm": 1.5762073993682861, "learning_rate": 7.409920412858297e-07, "loss": 0.1043, "step": 33340 }, { "epoch": 0.777593191687797, "grad_norm": 1.6743180751800537, 
"learning_rate": 7.40914319467761e-07, "loss": 0.1278, "step": 33350 }, { "epoch": 0.7778263530646149, "grad_norm": 1.3040826320648193, "learning_rate": 7.408365976496922e-07, "loss": 0.1183, "step": 33360 }, { "epoch": 0.7780595144414327, "grad_norm": 2.0324721336364746, "learning_rate": 7.407588758316235e-07, "loss": 0.1192, "step": 33370 }, { "epoch": 0.7782926758182507, "grad_norm": 1.3579246997833252, "learning_rate": 7.406811540135547e-07, "loss": 0.1277, "step": 33380 }, { "epoch": 0.7785258371950686, "grad_norm": 1.4824819564819336, "learning_rate": 7.406034321954859e-07, "loss": 0.1313, "step": 33390 }, { "epoch": 0.7787589985718866, "grad_norm": 1.6226563453674316, "learning_rate": 7.40525710377417e-07, "loss": 0.1166, "step": 33400 }, { "epoch": 0.7789921599487045, "grad_norm": 1.5968667268753052, "learning_rate": 7.404479885593483e-07, "loss": 0.1089, "step": 33410 }, { "epoch": 0.7792253213255225, "grad_norm": 1.854295253753662, "learning_rate": 7.403702667412796e-07, "loss": 0.1182, "step": 33420 }, { "epoch": 0.7794584827023403, "grad_norm": 1.346335768699646, "learning_rate": 7.402925449232108e-07, "loss": 0.115, "step": 33430 }, { "epoch": 0.7796916440791583, "grad_norm": 2.2505345344543457, "learning_rate": 7.402148231051421e-07, "loss": 0.1195, "step": 33440 }, { "epoch": 0.7799248054559762, "grad_norm": 1.50730562210083, "learning_rate": 7.401371012870733e-07, "loss": 0.1129, "step": 33450 }, { "epoch": 0.7801579668327941, "grad_norm": 1.5618700981140137, "learning_rate": 7.400593794690045e-07, "loss": 0.1108, "step": 33460 }, { "epoch": 0.7803911282096121, "grad_norm": 2.6055908203125, "learning_rate": 7.399816576509358e-07, "loss": 0.1252, "step": 33470 }, { "epoch": 0.78062428958643, "grad_norm": 1.3241078853607178, "learning_rate": 7.399039358328669e-07, "loss": 0.1074, "step": 33480 }, { "epoch": 0.780857450963248, "grad_norm": 2.5221810340881348, "learning_rate": 7.398262140147982e-07, "loss": 0.1279, "step": 33490 }, { "epoch": 
0.7810906123400658, "grad_norm": 1.114926815032959, "learning_rate": 7.397484921967295e-07, "loss": 0.1121, "step": 33500 }, { "epoch": 0.7813237737168838, "grad_norm": 1.2337232828140259, "learning_rate": 7.396707703786606e-07, "loss": 0.1117, "step": 33510 }, { "epoch": 0.7815569350937017, "grad_norm": 1.5382524728775024, "learning_rate": 7.395930485605919e-07, "loss": 0.1226, "step": 33520 }, { "epoch": 0.7817900964705197, "grad_norm": 1.652374505996704, "learning_rate": 7.395153267425231e-07, "loss": 0.1089, "step": 33530 }, { "epoch": 0.7820232578473376, "grad_norm": 2.8345015048980713, "learning_rate": 7.394376049244544e-07, "loss": 0.113, "step": 33540 }, { "epoch": 0.7822564192241556, "grad_norm": 3.2007172107696533, "learning_rate": 7.393598831063857e-07, "loss": 0.1097, "step": 33550 }, { "epoch": 0.7824895806009734, "grad_norm": 1.6063467264175415, "learning_rate": 7.392821612883168e-07, "loss": 0.1235, "step": 33560 }, { "epoch": 0.7827227419777913, "grad_norm": 1.5397167205810547, "learning_rate": 7.39204439470248e-07, "loss": 0.129, "step": 33570 }, { "epoch": 0.7829559033546093, "grad_norm": 1.3032563924789429, "learning_rate": 7.391267176521792e-07, "loss": 0.1136, "step": 33580 }, { "epoch": 0.7831890647314272, "grad_norm": 1.1498931646347046, "learning_rate": 7.390489958341105e-07, "loss": 0.1157, "step": 33590 }, { "epoch": 0.7834222261082452, "grad_norm": 1.36810302734375, "learning_rate": 7.389712740160418e-07, "loss": 0.1116, "step": 33600 }, { "epoch": 0.7836553874850631, "grad_norm": 3.375272274017334, "learning_rate": 7.38893552197973e-07, "loss": 0.1294, "step": 33610 }, { "epoch": 0.7838885488618811, "grad_norm": 1.7655662298202515, "learning_rate": 7.388158303799043e-07, "loss": 0.1189, "step": 33620 }, { "epoch": 0.7841217102386989, "grad_norm": 1.2418988943099976, "learning_rate": 7.387381085618354e-07, "loss": 0.1199, "step": 33630 }, { "epoch": 0.7843548716155169, "grad_norm": 4.145369052886963, "learning_rate": 
7.386603867437666e-07, "loss": 0.1389, "step": 33640 }, { "epoch": 0.7845880329923348, "grad_norm": 1.6340423822402954, "learning_rate": 7.385826649256979e-07, "loss": 0.1346, "step": 33650 }, { "epoch": 0.7848211943691528, "grad_norm": 1.2297871112823486, "learning_rate": 7.385049431076291e-07, "loss": 0.127, "step": 33660 }, { "epoch": 0.7850543557459707, "grad_norm": 1.7859638929367065, "learning_rate": 7.384272212895604e-07, "loss": 0.1219, "step": 33670 }, { "epoch": 0.7852875171227887, "grad_norm": 2.002201795578003, "learning_rate": 7.383494994714917e-07, "loss": 0.1208, "step": 33680 }, { "epoch": 0.7855206784996065, "grad_norm": 1.117571473121643, "learning_rate": 7.382717776534228e-07, "loss": 0.1144, "step": 33690 }, { "epoch": 0.7857538398764244, "grad_norm": 1.381529450416565, "learning_rate": 7.381940558353541e-07, "loss": 0.1152, "step": 33700 }, { "epoch": 0.7859870012532424, "grad_norm": 2.4586832523345947, "learning_rate": 7.381163340172853e-07, "loss": 0.1198, "step": 33710 }, { "epoch": 0.7862201626300603, "grad_norm": 1.299473524093628, "learning_rate": 7.380386121992165e-07, "loss": 0.1197, "step": 33720 }, { "epoch": 0.7864533240068783, "grad_norm": 2.5308783054351807, "learning_rate": 7.379608903811478e-07, "loss": 0.1355, "step": 33730 }, { "epoch": 0.7866864853836962, "grad_norm": 1.9093703031539917, "learning_rate": 7.37883168563079e-07, "loss": 0.1178, "step": 33740 }, { "epoch": 0.7869196467605141, "grad_norm": 1.2493141889572144, "learning_rate": 7.378054467450102e-07, "loss": 0.1135, "step": 33750 }, { "epoch": 0.787152808137332, "grad_norm": 1.239445686340332, "learning_rate": 7.377277249269414e-07, "loss": 0.1125, "step": 33760 }, { "epoch": 0.78738596951415, "grad_norm": 1.4018709659576416, "learning_rate": 7.376500031088727e-07, "loss": 0.1147, "step": 33770 }, { "epoch": 0.7876191308909679, "grad_norm": 1.8416814804077148, "learning_rate": 7.37572281290804e-07, "loss": 0.1148, "step": 33780 }, { "epoch": 0.7878522922677859, 
"grad_norm": 1.612296462059021, "learning_rate": 7.374945594727352e-07, "loss": 0.1225, "step": 33790 }, { "epoch": 0.7880854536446038, "grad_norm": 2.6817874908447266, "learning_rate": 7.374168376546664e-07, "loss": 0.1217, "step": 33800 }, { "epoch": 0.7883186150214218, "grad_norm": 1.8113503456115723, "learning_rate": 7.373391158365975e-07, "loss": 0.1287, "step": 33810 }, { "epoch": 0.7885517763982396, "grad_norm": 1.2312090396881104, "learning_rate": 7.372613940185288e-07, "loss": 0.1214, "step": 33820 }, { "epoch": 0.7887849377750575, "grad_norm": 1.428553581237793, "learning_rate": 7.371836722004601e-07, "loss": 0.119, "step": 33830 }, { "epoch": 0.7890180991518755, "grad_norm": 1.7251113653182983, "learning_rate": 7.371059503823913e-07, "loss": 0.1203, "step": 33840 }, { "epoch": 0.7892512605286934, "grad_norm": 1.2960824966430664, "learning_rate": 7.370282285643226e-07, "loss": 0.1146, "step": 33850 }, { "epoch": 0.7894844219055114, "grad_norm": 1.3240160942077637, "learning_rate": 7.369505067462538e-07, "loss": 0.1153, "step": 33860 }, { "epoch": 0.7897175832823293, "grad_norm": 3.517418384552002, "learning_rate": 7.368727849281851e-07, "loss": 0.1238, "step": 33870 }, { "epoch": 0.7899507446591472, "grad_norm": 4.246449947357178, "learning_rate": 7.367950631101162e-07, "loss": 0.1366, "step": 33880 }, { "epoch": 0.7901839060359651, "grad_norm": 1.1817196607589722, "learning_rate": 7.367173412920474e-07, "loss": 0.1142, "step": 33890 }, { "epoch": 0.7904170674127831, "grad_norm": 1.350974678993225, "learning_rate": 7.366396194739787e-07, "loss": 0.121, "step": 33900 }, { "epoch": 0.790650228789601, "grad_norm": 2.035623788833618, "learning_rate": 7.365618976559099e-07, "loss": 0.1119, "step": 33910 }, { "epoch": 0.790883390166419, "grad_norm": 3.5897982120513916, "learning_rate": 7.364841758378412e-07, "loss": 0.1241, "step": 33920 }, { "epoch": 0.7911165515432369, "grad_norm": 1.395402431488037, "learning_rate": 7.364064540197725e-07, "loss": 0.1174, 
"step": 33930 }, { "epoch": 0.7913497129200547, "grad_norm": 1.2315882444381714, "learning_rate": 7.363287322017036e-07, "loss": 0.1192, "step": 33940 }, { "epoch": 0.7915828742968727, "grad_norm": 1.2421379089355469, "learning_rate": 7.362510103836349e-07, "loss": 0.1103, "step": 33950 }, { "epoch": 0.7918160356736906, "grad_norm": 2.24509859085083, "learning_rate": 7.36173288565566e-07, "loss": 0.1372, "step": 33960 }, { "epoch": 0.7920491970505086, "grad_norm": 1.2312076091766357, "learning_rate": 7.360955667474973e-07, "loss": 0.1172, "step": 33970 }, { "epoch": 0.7922823584273265, "grad_norm": 1.7947940826416016, "learning_rate": 7.360178449294286e-07, "loss": 0.1194, "step": 33980 }, { "epoch": 0.7925155198041445, "grad_norm": 1.4369004964828491, "learning_rate": 7.359401231113598e-07, "loss": 0.1013, "step": 33990 }, { "epoch": 0.7927486811809624, "grad_norm": 1.368769884109497, "learning_rate": 7.35862401293291e-07, "loss": 0.1059, "step": 34000 }, { "epoch": 0.7929818425577803, "grad_norm": 5.227473258972168, "learning_rate": 7.357846794752222e-07, "loss": 0.1093, "step": 34010 }, { "epoch": 0.7932150039345982, "grad_norm": 2.1445345878601074, "learning_rate": 7.357069576571535e-07, "loss": 0.1161, "step": 34020 }, { "epoch": 0.7934481653114162, "grad_norm": 2.3351263999938965, "learning_rate": 7.356292358390848e-07, "loss": 0.1195, "step": 34030 }, { "epoch": 0.7936813266882341, "grad_norm": 1.8113582134246826, "learning_rate": 7.355515140210159e-07, "loss": 0.132, "step": 34040 }, { "epoch": 0.793914488065052, "grad_norm": 3.1764912605285645, "learning_rate": 7.354737922029472e-07, "loss": 0.113, "step": 34050 }, { "epoch": 0.79414764944187, "grad_norm": 1.4774295091629028, "learning_rate": 7.353960703848783e-07, "loss": 0.1174, "step": 34060 }, { "epoch": 0.7943808108186878, "grad_norm": 1.486302375793457, "learning_rate": 7.353183485668096e-07, "loss": 0.1226, "step": 34070 }, { "epoch": 0.7946139721955058, "grad_norm": 1.9373469352722168, 
"learning_rate": 7.352406267487409e-07, "loss": 0.1117, "step": 34080 }, { "epoch": 0.7948471335723237, "grad_norm": 2.4777426719665527, "learning_rate": 7.351629049306721e-07, "loss": 0.118, "step": 34090 }, { "epoch": 0.7950802949491417, "grad_norm": 1.5986815690994263, "learning_rate": 7.350851831126034e-07, "loss": 0.1151, "step": 34100 }, { "epoch": 0.7953134563259596, "grad_norm": 1.5136271715164185, "learning_rate": 7.350074612945347e-07, "loss": 0.1069, "step": 34110 }, { "epoch": 0.7955466177027776, "grad_norm": 2.1609935760498047, "learning_rate": 7.349297394764657e-07, "loss": 0.1101, "step": 34120 }, { "epoch": 0.7957797790795955, "grad_norm": 1.7320387363433838, "learning_rate": 7.34852017658397e-07, "loss": 0.1139, "step": 34130 }, { "epoch": 0.7960129404564134, "grad_norm": 2.5210518836975098, "learning_rate": 7.347742958403282e-07, "loss": 0.1162, "step": 34140 }, { "epoch": 0.7962461018332313, "grad_norm": 0.9467005133628845, "learning_rate": 7.346965740222595e-07, "loss": 0.1231, "step": 34150 }, { "epoch": 0.7964792632100492, "grad_norm": 1.269331693649292, "learning_rate": 7.346188522041908e-07, "loss": 0.1157, "step": 34160 }, { "epoch": 0.7967124245868672, "grad_norm": 2.1695034503936768, "learning_rate": 7.34541130386122e-07, "loss": 0.1116, "step": 34170 }, { "epoch": 0.7969455859636851, "grad_norm": 2.181746482849121, "learning_rate": 7.344634085680532e-07, "loss": 0.1151, "step": 34180 }, { "epoch": 0.7971787473405031, "grad_norm": 1.6249053478240967, "learning_rate": 7.343856867499844e-07, "loss": 0.126, "step": 34190 }, { "epoch": 0.7974119087173209, "grad_norm": 2.9624879360198975, "learning_rate": 7.343079649319156e-07, "loss": 0.1138, "step": 34200 }, { "epoch": 0.7976450700941389, "grad_norm": 1.4392296075820923, "learning_rate": 7.342302431138469e-07, "loss": 0.126, "step": 34210 }, { "epoch": 0.7978782314709568, "grad_norm": 2.0023720264434814, "learning_rate": 7.341525212957781e-07, "loss": 0.1172, "step": 34220 }, { "epoch": 
0.7981113928477748, "grad_norm": 1.6712455749511719, "learning_rate": 7.340747994777094e-07, "loss": 0.1174, "step": 34230 }, { "epoch": 0.7983445542245927, "grad_norm": 2.14623761177063, "learning_rate": 7.339970776596405e-07, "loss": 0.107, "step": 34240 }, { "epoch": 0.7985777156014107, "grad_norm": 1.127564787864685, "learning_rate": 7.339193558415718e-07, "loss": 0.1116, "step": 34250 }, { "epoch": 0.7988108769782285, "grad_norm": 1.3237468004226685, "learning_rate": 7.338416340235031e-07, "loss": 0.1109, "step": 34260 }, { "epoch": 0.7990440383550464, "grad_norm": 2.576181411743164, "learning_rate": 7.337639122054343e-07, "loss": 0.1235, "step": 34270 }, { "epoch": 0.7992771997318644, "grad_norm": 1.4079689979553223, "learning_rate": 7.336861903873655e-07, "loss": 0.1109, "step": 34280 }, { "epoch": 0.7995103611086823, "grad_norm": 2.871495008468628, "learning_rate": 7.336084685692967e-07, "loss": 0.1149, "step": 34290 }, { "epoch": 0.7997435224855003, "grad_norm": 1.5917577743530273, "learning_rate": 7.33530746751228e-07, "loss": 0.1133, "step": 34300 }, { "epoch": 0.7999766838623182, "grad_norm": 3.236853837966919, "learning_rate": 7.334530249331592e-07, "loss": 0.1119, "step": 34310 }, { "epoch": 0.8002098452391362, "grad_norm": 1.488643765449524, "learning_rate": 7.333753031150904e-07, "loss": 0.1131, "step": 34320 }, { "epoch": 0.800443006615954, "grad_norm": 1.68662428855896, "learning_rate": 7.332975812970217e-07, "loss": 0.1191, "step": 34330 }, { "epoch": 0.800676167992772, "grad_norm": 1.595549464225769, "learning_rate": 7.332198594789529e-07, "loss": 0.1198, "step": 34340 }, { "epoch": 0.8009093293695899, "grad_norm": 1.294959545135498, "learning_rate": 7.331421376608842e-07, "loss": 0.1165, "step": 34350 }, { "epoch": 0.8011424907464079, "grad_norm": 1.4480549097061157, "learning_rate": 7.330644158428154e-07, "loss": 0.13, "step": 34360 }, { "epoch": 0.8013756521232258, "grad_norm": 1.4293919801712036, "learning_rate": 7.329866940247465e-07, 
"loss": 0.1164, "step": 34370 }, { "epoch": 0.8016088135000438, "grad_norm": 1.3945661783218384, "learning_rate": 7.329089722066778e-07, "loss": 0.1115, "step": 34380 }, { "epoch": 0.8018419748768616, "grad_norm": 1.0865037441253662, "learning_rate": 7.32831250388609e-07, "loss": 0.1252, "step": 34390 }, { "epoch": 0.8020751362536795, "grad_norm": 1.460648775100708, "learning_rate": 7.327535285705403e-07, "loss": 0.1145, "step": 34400 }, { "epoch": 0.8023082976304975, "grad_norm": 2.3258371353149414, "learning_rate": 7.326758067524716e-07, "loss": 0.1206, "step": 34410 }, { "epoch": 0.8025414590073154, "grad_norm": 2.62241792678833, "learning_rate": 7.325980849344028e-07, "loss": 0.1196, "step": 34420 }, { "epoch": 0.8027746203841334, "grad_norm": 1.643381953239441, "learning_rate": 7.32520363116334e-07, "loss": 0.1186, "step": 34430 }, { "epoch": 0.8030077817609513, "grad_norm": 3.0828564167022705, "learning_rate": 7.324426412982651e-07, "loss": 0.1191, "step": 34440 }, { "epoch": 0.8032409431377693, "grad_norm": 1.0264697074890137, "learning_rate": 7.323649194801964e-07, "loss": 0.127, "step": 34450 }, { "epoch": 0.8034741045145871, "grad_norm": 1.4016727209091187, "learning_rate": 7.322871976621277e-07, "loss": 0.1307, "step": 34460 }, { "epoch": 0.8037072658914051, "grad_norm": 1.3814547061920166, "learning_rate": 7.322094758440589e-07, "loss": 0.1166, "step": 34470 }, { "epoch": 0.803940427268223, "grad_norm": 2.0465612411499023, "learning_rate": 7.321317540259902e-07, "loss": 0.1166, "step": 34480 }, { "epoch": 0.804173588645041, "grad_norm": 1.6261999607086182, "learning_rate": 7.320540322079213e-07, "loss": 0.1212, "step": 34490 }, { "epoch": 0.8044067500218589, "grad_norm": 1.8845531940460205, "learning_rate": 7.319763103898526e-07, "loss": 0.1136, "step": 34500 }, { "epoch": 0.8046399113986769, "grad_norm": 1.629696011543274, "learning_rate": 7.318985885717839e-07, "loss": 0.1098, "step": 34510 }, { "epoch": 0.8048730727754947, "grad_norm": 
1.9409756660461426, "learning_rate": 7.31820866753715e-07, "loss": 0.1182, "step": 34520 }, { "epoch": 0.8051062341523126, "grad_norm": 2.103947639465332, "learning_rate": 7.317431449356463e-07, "loss": 0.12, "step": 34530 }, { "epoch": 0.8053393955291306, "grad_norm": 2.0668063163757324, "learning_rate": 7.316654231175775e-07, "loss": 0.1311, "step": 34540 }, { "epoch": 0.8055725569059485, "grad_norm": 1.6343837976455688, "learning_rate": 7.315877012995087e-07, "loss": 0.1127, "step": 34550 }, { "epoch": 0.8058057182827665, "grad_norm": 1.6438021659851074, "learning_rate": 7.3150997948144e-07, "loss": 0.1223, "step": 34560 }, { "epoch": 0.8060388796595844, "grad_norm": 2.6431822776794434, "learning_rate": 7.314322576633712e-07, "loss": 0.1175, "step": 34570 }, { "epoch": 0.8062720410364023, "grad_norm": 2.6549360752105713, "learning_rate": 7.313545358453025e-07, "loss": 0.1282, "step": 34580 }, { "epoch": 0.8065052024132202, "grad_norm": 3.1099917888641357, "learning_rate": 7.312768140272338e-07, "loss": 0.1234, "step": 34590 }, { "epoch": 0.8067383637900382, "grad_norm": 1.757489800453186, "learning_rate": 7.311990922091649e-07, "loss": 0.1171, "step": 34600 }, { "epoch": 0.8069715251668561, "grad_norm": 5.530789852142334, "learning_rate": 7.311213703910961e-07, "loss": 0.1214, "step": 34610 }, { "epoch": 0.807204686543674, "grad_norm": 1.8023056983947754, "learning_rate": 7.310436485730273e-07, "loss": 0.1296, "step": 34620 }, { "epoch": 0.807437847920492, "grad_norm": 2.6783580780029297, "learning_rate": 7.309659267549586e-07, "loss": 0.1259, "step": 34630 }, { "epoch": 0.80767100929731, "grad_norm": 1.217608094215393, "learning_rate": 7.308882049368899e-07, "loss": 0.1156, "step": 34640 }, { "epoch": 0.8079041706741278, "grad_norm": 2.3304834365844727, "learning_rate": 7.308104831188211e-07, "loss": 0.118, "step": 34650 }, { "epoch": 0.8081373320509457, "grad_norm": 1.238295555114746, "learning_rate": 7.307327613007524e-07, "loss": 0.1288, "step": 34660 }, { 
"epoch": 0.8083704934277637, "grad_norm": 1.8925138711929321, "learning_rate": 7.306550394826836e-07, "loss": 0.1257, "step": 34670 }, { "epoch": 0.8086036548045816, "grad_norm": 1.4300999641418457, "learning_rate": 7.305773176646147e-07, "loss": 0.1305, "step": 34680 }, { "epoch": 0.8088368161813996, "grad_norm": 1.8947921991348267, "learning_rate": 7.30499595846546e-07, "loss": 0.117, "step": 34690 }, { "epoch": 0.8090699775582175, "grad_norm": 1.4126145839691162, "learning_rate": 7.304218740284772e-07, "loss": 0.1074, "step": 34700 }, { "epoch": 0.8093031389350354, "grad_norm": 3.846606731414795, "learning_rate": 7.303441522104085e-07, "loss": 0.1238, "step": 34710 }, { "epoch": 0.8095363003118533, "grad_norm": 3.0503337383270264, "learning_rate": 7.302664303923397e-07, "loss": 0.1278, "step": 34720 }, { "epoch": 0.8097694616886713, "grad_norm": 3.6682324409484863, "learning_rate": 7.30188708574271e-07, "loss": 0.1112, "step": 34730 }, { "epoch": 0.8100026230654892, "grad_norm": 1.3609610795974731, "learning_rate": 7.301109867562022e-07, "loss": 0.1206, "step": 34740 }, { "epoch": 0.8102357844423071, "grad_norm": 1.616625428199768, "learning_rate": 7.300332649381334e-07, "loss": 0.1209, "step": 34750 }, { "epoch": 0.8104689458191251, "grad_norm": 1.6958012580871582, "learning_rate": 7.299555431200646e-07, "loss": 0.1134, "step": 34760 }, { "epoch": 0.810702107195943, "grad_norm": 2.5356996059417725, "learning_rate": 7.298778213019958e-07, "loss": 0.1153, "step": 34770 }, { "epoch": 0.8109352685727609, "grad_norm": 1.2881017923355103, "learning_rate": 7.298000994839271e-07, "loss": 0.1226, "step": 34780 }, { "epoch": 0.8111684299495788, "grad_norm": 2.0939269065856934, "learning_rate": 7.297223776658584e-07, "loss": 0.1272, "step": 34790 }, { "epoch": 0.8114015913263968, "grad_norm": 5.081720352172852, "learning_rate": 7.296446558477895e-07, "loss": 0.1036, "step": 34800 }, { "epoch": 0.8116347527032147, "grad_norm": 1.3183026313781738, "learning_rate": 
7.295669340297208e-07, "loss": 0.123, "step": 34810 }, { "epoch": 0.8118679140800327, "grad_norm": 1.2971092462539673, "learning_rate": 7.29489212211652e-07, "loss": 0.1208, "step": 34820 }, { "epoch": 0.8121010754568506, "grad_norm": 1.7865067720413208, "learning_rate": 7.294114903935833e-07, "loss": 0.1206, "step": 34830 }, { "epoch": 0.8123342368336685, "grad_norm": 1.7559338808059692, "learning_rate": 7.293337685755146e-07, "loss": 0.1193, "step": 34840 }, { "epoch": 0.8125673982104864, "grad_norm": 1.5242440700531006, "learning_rate": 7.292560467574457e-07, "loss": 0.1175, "step": 34850 }, { "epoch": 0.8128005595873043, "grad_norm": 2.0040433406829834, "learning_rate": 7.291783249393769e-07, "loss": 0.1187, "step": 34860 }, { "epoch": 0.8130337209641223, "grad_norm": 1.406544804573059, "learning_rate": 7.291006031213081e-07, "loss": 0.1203, "step": 34870 }, { "epoch": 0.8132668823409402, "grad_norm": 1.1629621982574463, "learning_rate": 7.290228813032394e-07, "loss": 0.1093, "step": 34880 }, { "epoch": 0.8135000437177582, "grad_norm": 1.4529544115066528, "learning_rate": 7.289451594851707e-07, "loss": 0.1134, "step": 34890 }, { "epoch": 0.813733205094576, "grad_norm": 1.9808905124664307, "learning_rate": 7.288674376671019e-07, "loss": 0.1218, "step": 34900 }, { "epoch": 0.813966366471394, "grad_norm": 3.156216621398926, "learning_rate": 7.287897158490332e-07, "loss": 0.1206, "step": 34910 }, { "epoch": 0.8141995278482119, "grad_norm": 1.5568785667419434, "learning_rate": 7.287119940309643e-07, "loss": 0.1116, "step": 34920 }, { "epoch": 0.8144326892250299, "grad_norm": 0.9071192741394043, "learning_rate": 7.286342722128955e-07, "loss": 0.1117, "step": 34930 }, { "epoch": 0.8146658506018478, "grad_norm": 1.1765639781951904, "learning_rate": 7.285565503948268e-07, "loss": 0.1197, "step": 34940 }, { "epoch": 0.8148990119786658, "grad_norm": 4.767841339111328, "learning_rate": 7.28478828576758e-07, "loss": 0.1215, "step": 34950 }, { "epoch": 0.8151321733554837, 
"grad_norm": 2.3321263790130615, "learning_rate": 7.284011067586893e-07, "loss": 0.1142, "step": 34960 }, { "epoch": 0.8153653347323015, "grad_norm": 1.2810114622116089, "learning_rate": 7.283233849406206e-07, "loss": 0.1169, "step": 34970 }, { "epoch": 0.8155984961091195, "grad_norm": 1.435350775718689, "learning_rate": 7.282456631225517e-07, "loss": 0.1089, "step": 34980 }, { "epoch": 0.8158316574859374, "grad_norm": 1.8772560358047485, "learning_rate": 7.28167941304483e-07, "loss": 0.1247, "step": 34990 }, { "epoch": 0.8160648188627554, "grad_norm": 4.1931891441345215, "learning_rate": 7.280902194864142e-07, "loss": 0.1094, "step": 35000 }, { "epoch": 0.8162979802395733, "grad_norm": 2.3836069107055664, "learning_rate": 7.280124976683454e-07, "loss": 0.1235, "step": 35010 }, { "epoch": 0.8165311416163913, "grad_norm": 1.82329523563385, "learning_rate": 7.279347758502767e-07, "loss": 0.1156, "step": 35020 }, { "epoch": 0.8167643029932091, "grad_norm": 2.5155446529388428, "learning_rate": 7.278570540322079e-07, "loss": 0.1229, "step": 35030 }, { "epoch": 0.8169974643700271, "grad_norm": 1.6022393703460693, "learning_rate": 7.277793322141391e-07, "loss": 0.1127, "step": 35040 }, { "epoch": 0.817230625746845, "grad_norm": 1.3106284141540527, "learning_rate": 7.277016103960703e-07, "loss": 0.111, "step": 35050 }, { "epoch": 0.817463787123663, "grad_norm": 1.1757720708847046, "learning_rate": 7.276238885780016e-07, "loss": 0.1243, "step": 35060 }, { "epoch": 0.8176969485004809, "grad_norm": 1.2951419353485107, "learning_rate": 7.275461667599329e-07, "loss": 0.1104, "step": 35070 }, { "epoch": 0.8179301098772989, "grad_norm": 2.10805606842041, "learning_rate": 7.274684449418641e-07, "loss": 0.1072, "step": 35080 }, { "epoch": 0.8181632712541168, "grad_norm": 1.4893734455108643, "learning_rate": 7.273907231237953e-07, "loss": 0.1191, "step": 35090 }, { "epoch": 0.8183964326309346, "grad_norm": 2.164212226867676, "learning_rate": 7.273130013057264e-07, "loss": 0.1151, 
"step": 35100 }, { "epoch": 0.8186295940077526, "grad_norm": 1.3713974952697754, "learning_rate": 7.272352794876577e-07, "loss": 0.1175, "step": 35110 }, { "epoch": 0.8188627553845705, "grad_norm": 2.085603713989258, "learning_rate": 7.27157557669589e-07, "loss": 0.1157, "step": 35120 }, { "epoch": 0.8190959167613885, "grad_norm": 2.4202017784118652, "learning_rate": 7.270798358515202e-07, "loss": 0.118, "step": 35130 }, { "epoch": 0.8193290781382064, "grad_norm": 1.4015032052993774, "learning_rate": 7.270021140334515e-07, "loss": 0.1277, "step": 35140 }, { "epoch": 0.8195622395150244, "grad_norm": 1.2610926628112793, "learning_rate": 7.269243922153827e-07, "loss": 0.1273, "step": 35150 }, { "epoch": 0.8197954008918422, "grad_norm": 1.5304335355758667, "learning_rate": 7.26846670397314e-07, "loss": 0.1066, "step": 35160 }, { "epoch": 0.8200285622686602, "grad_norm": 2.219446897506714, "learning_rate": 7.267689485792451e-07, "loss": 0.1194, "step": 35170 }, { "epoch": 0.8202617236454781, "grad_norm": 1.343861699104309, "learning_rate": 7.266912267611763e-07, "loss": 0.12, "step": 35180 }, { "epoch": 0.8204948850222961, "grad_norm": 1.150543451309204, "learning_rate": 7.266135049431076e-07, "loss": 0.1191, "step": 35190 }, { "epoch": 0.820728046399114, "grad_norm": 1.3499624729156494, "learning_rate": 7.265357831250388e-07, "loss": 0.1196, "step": 35200 }, { "epoch": 0.820961207775932, "grad_norm": 1.4697332382202148, "learning_rate": 7.264580613069701e-07, "loss": 0.1218, "step": 35210 }, { "epoch": 0.8211943691527498, "grad_norm": 3.2885165214538574, "learning_rate": 7.263803394889014e-07, "loss": 0.1021, "step": 35220 }, { "epoch": 0.8214275305295677, "grad_norm": 1.443076491355896, "learning_rate": 7.263026176708325e-07, "loss": 0.1182, "step": 35230 }, { "epoch": 0.8216606919063857, "grad_norm": 1.2298815250396729, "learning_rate": 7.262248958527638e-07, "loss": 0.1115, "step": 35240 }, { "epoch": 0.8218938532832036, "grad_norm": 2.6291825771331787, 
"learning_rate": 7.261471740346949e-07, "loss": 0.1146, "step": 35250 }, { "epoch": 0.8221270146600216, "grad_norm": 1.2899861335754395, "learning_rate": 7.260694522166262e-07, "loss": 0.109, "step": 35260 }, { "epoch": 0.8223601760368395, "grad_norm": 1.380550742149353, "learning_rate": 7.259917303985575e-07, "loss": 0.1188, "step": 35270 }, { "epoch": 0.8225933374136575, "grad_norm": 1.9988373517990112, "learning_rate": 7.259140085804887e-07, "loss": 0.1165, "step": 35280 }, { "epoch": 0.8228264987904753, "grad_norm": 1.3682345151901245, "learning_rate": 7.258362867624199e-07, "loss": 0.1171, "step": 35290 }, { "epoch": 0.8230596601672933, "grad_norm": 1.8905221223831177, "learning_rate": 7.257585649443511e-07, "loss": 0.1067, "step": 35300 }, { "epoch": 0.8232928215441112, "grad_norm": 1.6249433755874634, "learning_rate": 7.256808431262824e-07, "loss": 0.1213, "step": 35310 }, { "epoch": 0.8235259829209292, "grad_norm": 1.4827557802200317, "learning_rate": 7.256031213082137e-07, "loss": 0.1269, "step": 35320 }, { "epoch": 0.8237591442977471, "grad_norm": 1.2530627250671387, "learning_rate": 7.255253994901448e-07, "loss": 0.1162, "step": 35330 }, { "epoch": 0.823992305674565, "grad_norm": 1.5778260231018066, "learning_rate": 7.254476776720761e-07, "loss": 0.1101, "step": 35340 }, { "epoch": 0.8242254670513829, "grad_norm": 2.5434393882751465, "learning_rate": 7.253699558540072e-07, "loss": 0.1088, "step": 35350 }, { "epoch": 0.8244586284282008, "grad_norm": 1.9297016859054565, "learning_rate": 7.252922340359385e-07, "loss": 0.1271, "step": 35360 }, { "epoch": 0.8246917898050188, "grad_norm": 1.8795689344406128, "learning_rate": 7.252145122178698e-07, "loss": 0.1204, "step": 35370 }, { "epoch": 0.8249249511818367, "grad_norm": 1.752332091331482, "learning_rate": 7.25136790399801e-07, "loss": 0.1096, "step": 35380 }, { "epoch": 0.8251581125586547, "grad_norm": 1.7186802625656128, "learning_rate": 7.250590685817323e-07, "loss": 0.1185, "step": 35390 }, { "epoch": 
0.8253912739354726, "grad_norm": 1.439133644104004, "learning_rate": 7.249813467636636e-07, "loss": 0.1149, "step": 35400 }, { "epoch": 0.8256244353122906, "grad_norm": 3.284538984298706, "learning_rate": 7.249036249455946e-07, "loss": 0.1252, "step": 35410 }, { "epoch": 0.8258575966891084, "grad_norm": 1.337615728378296, "learning_rate": 7.248259031275259e-07, "loss": 0.1185, "step": 35420 }, { "epoch": 0.8260907580659264, "grad_norm": 1.4388396739959717, "learning_rate": 7.247481813094571e-07, "loss": 0.1223, "step": 35430 }, { "epoch": 0.8263239194427443, "grad_norm": 1.009366750717163, "learning_rate": 7.246704594913884e-07, "loss": 0.1053, "step": 35440 }, { "epoch": 0.8265570808195623, "grad_norm": 2.6993093490600586, "learning_rate": 7.245927376733197e-07, "loss": 0.1237, "step": 35450 }, { "epoch": 0.8267902421963802, "grad_norm": 1.4257618188858032, "learning_rate": 7.245150158552509e-07, "loss": 0.1143, "step": 35460 }, { "epoch": 0.8270234035731981, "grad_norm": 1.7186003923416138, "learning_rate": 7.244372940371821e-07, "loss": 0.1246, "step": 35470 }, { "epoch": 0.827256564950016, "grad_norm": 3.1391232013702393, "learning_rate": 7.243595722191133e-07, "loss": 0.1202, "step": 35480 }, { "epoch": 0.8274897263268339, "grad_norm": 1.3230760097503662, "learning_rate": 7.242818504010445e-07, "loss": 0.1273, "step": 35490 }, { "epoch": 0.8277228877036519, "grad_norm": 1.423807978630066, "learning_rate": 7.242041285829758e-07, "loss": 0.1088, "step": 35500 }, { "epoch": 0.8279560490804698, "grad_norm": 2.2778401374816895, "learning_rate": 7.24126406764907e-07, "loss": 0.1206, "step": 35510 }, { "epoch": 0.8281892104572878, "grad_norm": 2.2546019554138184, "learning_rate": 7.240486849468383e-07, "loss": 0.1026, "step": 35520 }, { "epoch": 0.8284223718341057, "grad_norm": 2.094376802444458, "learning_rate": 7.239709631287694e-07, "loss": 0.1237, "step": 35530 }, { "epoch": 0.8286555332109236, "grad_norm": 3.4867238998413086, "learning_rate": 
7.238932413107007e-07, "loss": 0.121, "step": 35540 }, { "epoch": 0.8288886945877415, "grad_norm": 1.4581403732299805, "learning_rate": 7.23815519492632e-07, "loss": 0.1234, "step": 35550 }, { "epoch": 0.8291218559645595, "grad_norm": 2.493964433670044, "learning_rate": 7.237377976745632e-07, "loss": 0.114, "step": 35560 }, { "epoch": 0.8293550173413774, "grad_norm": 1.602905511856079, "learning_rate": 7.236600758564944e-07, "loss": 0.1159, "step": 35570 }, { "epoch": 0.8295881787181953, "grad_norm": 1.2921911478042603, "learning_rate": 7.235823540384256e-07, "loss": 0.131, "step": 35580 }, { "epoch": 0.8298213400950133, "grad_norm": 1.3791205883026123, "learning_rate": 7.235046322203569e-07, "loss": 0.1151, "step": 35590 }, { "epoch": 0.8300545014718312, "grad_norm": 1.7250699996948242, "learning_rate": 7.23434682584095e-07, "loss": 0.1051, "step": 35600 }, { "epoch": 0.8302876628486491, "grad_norm": 1.462701678276062, "learning_rate": 7.233569607660263e-07, "loss": 0.117, "step": 35610 }, { "epoch": 0.830520824225467, "grad_norm": 3.4949800968170166, "learning_rate": 7.232792389479574e-07, "loss": 0.1178, "step": 35620 }, { "epoch": 0.830753985602285, "grad_norm": 1.2041059732437134, "learning_rate": 7.232015171298886e-07, "loss": 0.1095, "step": 35630 }, { "epoch": 0.8309871469791029, "grad_norm": 1.742896556854248, "learning_rate": 7.231237953118199e-07, "loss": 0.1256, "step": 35640 }, { "epoch": 0.8312203083559209, "grad_norm": 3.237400770187378, "learning_rate": 7.230460734937511e-07, "loss": 0.1266, "step": 35650 }, { "epoch": 0.8314534697327388, "grad_norm": 1.6399729251861572, "learning_rate": 7.229683516756824e-07, "loss": 0.1203, "step": 35660 }, { "epoch": 0.8316866311095567, "grad_norm": 2.3214964866638184, "learning_rate": 7.228906298576136e-07, "loss": 0.1232, "step": 35670 }, { "epoch": 0.8319197924863746, "grad_norm": 1.7595585584640503, "learning_rate": 7.228129080395449e-07, "loss": 0.1127, "step": 35680 }, { "epoch": 0.8321529538631925, 
"grad_norm": 2.095313549041748, "learning_rate": 7.227351862214761e-07, "loss": 0.1168, "step": 35690 }, { "epoch": 0.8323861152400105, "grad_norm": 1.5503710508346558, "learning_rate": 7.226574644034072e-07, "loss": 0.1165, "step": 35700 }, { "epoch": 0.8326192766168284, "grad_norm": 2.8889849185943604, "learning_rate": 7.225797425853385e-07, "loss": 0.111, "step": 35710 }, { "epoch": 0.8328524379936464, "grad_norm": 1.7130476236343384, "learning_rate": 7.225020207672697e-07, "loss": 0.1183, "step": 35720 }, { "epoch": 0.8330855993704643, "grad_norm": 1.1661291122436523, "learning_rate": 7.22424298949201e-07, "loss": 0.1182, "step": 35730 }, { "epoch": 0.8333187607472822, "grad_norm": 1.7795261144638062, "learning_rate": 7.223465771311323e-07, "loss": 0.126, "step": 35740 }, { "epoch": 0.8335519221241001, "grad_norm": 2.994316816329956, "learning_rate": 7.222688553130634e-07, "loss": 0.1291, "step": 35750 }, { "epoch": 0.8337850835009181, "grad_norm": 1.4798946380615234, "learning_rate": 7.221911334949947e-07, "loss": 0.1111, "step": 35760 }, { "epoch": 0.834018244877736, "grad_norm": 1.4437553882598877, "learning_rate": 7.22113411676926e-07, "loss": 0.112, "step": 35770 }, { "epoch": 0.834251406254554, "grad_norm": 3.4782676696777344, "learning_rate": 7.220356898588571e-07, "loss": 0.1159, "step": 35780 }, { "epoch": 0.8344845676313719, "grad_norm": 3.0602991580963135, "learning_rate": 7.219579680407884e-07, "loss": 0.1313, "step": 35790 }, { "epoch": 0.8347177290081897, "grad_norm": 2.0640876293182373, "learning_rate": 7.218802462227196e-07, "loss": 0.1107, "step": 35800 }, { "epoch": 0.8349508903850077, "grad_norm": 2.7824108600616455, "learning_rate": 7.218025244046508e-07, "loss": 0.1188, "step": 35810 }, { "epoch": 0.8351840517618256, "grad_norm": 1.2692621946334839, "learning_rate": 7.21724802586582e-07, "loss": 0.1148, "step": 35820 }, { "epoch": 0.8354172131386436, "grad_norm": 1.596778392791748, "learning_rate": 7.216470807685133e-07, "loss": 0.1227, 
"step": 35830 }, { "epoch": 0.8356503745154615, "grad_norm": 1.8185529708862305, "learning_rate": 7.215693589504446e-07, "loss": 0.1289, "step": 35840 }, { "epoch": 0.8358835358922795, "grad_norm": 1.1592363119125366, "learning_rate": 7.214916371323758e-07, "loss": 0.1171, "step": 35850 }, { "epoch": 0.8361166972690973, "grad_norm": 2.3655309677124023, "learning_rate": 7.21413915314307e-07, "loss": 0.123, "step": 35860 }, { "epoch": 0.8363498586459153, "grad_norm": 1.4780924320220947, "learning_rate": 7.213361934962381e-07, "loss": 0.129, "step": 35870 }, { "epoch": 0.8365830200227332, "grad_norm": 1.198556900024414, "learning_rate": 7.212584716781694e-07, "loss": 0.1188, "step": 35880 }, { "epoch": 0.8368161813995512, "grad_norm": 1.637356162071228, "learning_rate": 7.211807498601007e-07, "loss": 0.1356, "step": 35890 }, { "epoch": 0.8370493427763691, "grad_norm": 1.78496253490448, "learning_rate": 7.211030280420319e-07, "loss": 0.1134, "step": 35900 }, { "epoch": 0.8372825041531871, "grad_norm": 1.8742784261703491, "learning_rate": 7.210253062239632e-07, "loss": 0.1108, "step": 35910 }, { "epoch": 0.837515665530005, "grad_norm": 1.2899932861328125, "learning_rate": 7.209475844058945e-07, "loss": 0.1215, "step": 35920 }, { "epoch": 0.8377488269068228, "grad_norm": 1.973217248916626, "learning_rate": 7.208698625878257e-07, "loss": 0.1043, "step": 35930 }, { "epoch": 0.8379819882836408, "grad_norm": 1.5027379989624023, "learning_rate": 7.207921407697568e-07, "loss": 0.1311, "step": 35940 }, { "epoch": 0.8382151496604587, "grad_norm": 1.6291719675064087, "learning_rate": 7.20714418951688e-07, "loss": 0.119, "step": 35950 }, { "epoch": 0.8384483110372767, "grad_norm": 2.8424699306488037, "learning_rate": 7.206366971336193e-07, "loss": 0.1085, "step": 35960 }, { "epoch": 0.8386814724140946, "grad_norm": 2.456127643585205, "learning_rate": 7.205589753155506e-07, "loss": 0.1171, "step": 35970 }, { "epoch": 0.8389146337909126, "grad_norm": 1.3751494884490967, 
"learning_rate": 7.204812534974818e-07, "loss": 0.1148, "step": 35980 }, { "epoch": 0.8391477951677304, "grad_norm": 1.6682965755462646, "learning_rate": 7.204035316794131e-07, "loss": 0.1188, "step": 35990 }, { "epoch": 0.8393809565445484, "grad_norm": 1.4876121282577515, "learning_rate": 7.203258098613442e-07, "loss": 0.1212, "step": 36000 }, { "epoch": 0.8396141179213663, "grad_norm": 1.6679482460021973, "learning_rate": 7.202480880432755e-07, "loss": 0.1243, "step": 36010 }, { "epoch": 0.8398472792981843, "grad_norm": 1.5321621894836426, "learning_rate": 7.201703662252067e-07, "loss": 0.1339, "step": 36020 }, { "epoch": 0.8400804406750022, "grad_norm": 3.170741081237793, "learning_rate": 7.200926444071379e-07, "loss": 0.1264, "step": 36030 }, { "epoch": 0.8403136020518202, "grad_norm": 2.836621046066284, "learning_rate": 7.200149225890692e-07, "loss": 0.1045, "step": 36040 }, { "epoch": 0.8405467634286381, "grad_norm": 1.5227465629577637, "learning_rate": 7.199372007710004e-07, "loss": 0.1123, "step": 36050 }, { "epoch": 0.8407799248054559, "grad_norm": 1.6372759342193604, "learning_rate": 7.198594789529316e-07, "loss": 0.1146, "step": 36060 }, { "epoch": 0.8410130861822739, "grad_norm": 1.112500548362732, "learning_rate": 7.197817571348629e-07, "loss": 0.1124, "step": 36070 }, { "epoch": 0.8412462475590918, "grad_norm": 1.4523853063583374, "learning_rate": 7.197040353167941e-07, "loss": 0.1145, "step": 36080 }, { "epoch": 0.8414794089359098, "grad_norm": 2.945146083831787, "learning_rate": 7.196263134987254e-07, "loss": 0.1213, "step": 36090 }, { "epoch": 0.8417125703127277, "grad_norm": 1.5585969686508179, "learning_rate": 7.195485916806565e-07, "loss": 0.1141, "step": 36100 }, { "epoch": 0.8419457316895457, "grad_norm": 1.7029601335525513, "learning_rate": 7.194708698625878e-07, "loss": 0.1328, "step": 36110 }, { "epoch": 0.8421788930663635, "grad_norm": 3.6324880123138428, "learning_rate": 7.19393148044519e-07, "loss": 0.1012, "step": 36120 }, { "epoch": 
0.8424120544431815, "grad_norm": 1.1449943780899048, "learning_rate": 7.193154262264502e-07, "loss": 0.1148, "step": 36130 }, { "epoch": 0.8426452158199994, "grad_norm": 1.7314465045928955, "learning_rate": 7.192377044083815e-07, "loss": 0.1187, "step": 36140 }, { "epoch": 0.8428783771968174, "grad_norm": 1.3884176015853882, "learning_rate": 7.191599825903127e-07, "loss": 0.1098, "step": 36150 }, { "epoch": 0.8431115385736353, "grad_norm": 1.453047275543213, "learning_rate": 7.19082260772244e-07, "loss": 0.1147, "step": 36160 }, { "epoch": 0.8433446999504532, "grad_norm": 1.4351319074630737, "learning_rate": 7.190045389541753e-07, "loss": 0.1062, "step": 36170 }, { "epoch": 0.8435778613272711, "grad_norm": 2.263460636138916, "learning_rate": 7.189268171361063e-07, "loss": 0.1296, "step": 36180 }, { "epoch": 0.843811022704089, "grad_norm": 1.7825825214385986, "learning_rate": 7.188490953180376e-07, "loss": 0.1193, "step": 36190 }, { "epoch": 0.844044184080907, "grad_norm": 1.45077645778656, "learning_rate": 7.187713734999688e-07, "loss": 0.1218, "step": 36200 }, { "epoch": 0.8442773454577249, "grad_norm": 1.3876887559890747, "learning_rate": 7.186936516819001e-07, "loss": 0.1141, "step": 36210 }, { "epoch": 0.8445105068345429, "grad_norm": 1.5313314199447632, "learning_rate": 7.186159298638314e-07, "loss": 0.1285, "step": 36220 }, { "epoch": 0.8447436682113608, "grad_norm": 1.3563472032546997, "learning_rate": 7.185382080457626e-07, "loss": 0.1284, "step": 36230 }, { "epoch": 0.8449768295881788, "grad_norm": 1.8659062385559082, "learning_rate": 7.184604862276938e-07, "loss": 0.1281, "step": 36240 }, { "epoch": 0.8452099909649966, "grad_norm": 3.005838394165039, "learning_rate": 7.18382764409625e-07, "loss": 0.1251, "step": 36250 }, { "epoch": 0.8454431523418146, "grad_norm": 1.393195629119873, "learning_rate": 7.183050425915562e-07, "loss": 0.1167, "step": 36260 }, { "epoch": 0.8456763137186325, "grad_norm": 3.848621368408203, "learning_rate": 7.182273207734875e-07, 
"loss": 0.1239, "step": 36270 }, { "epoch": 0.8459094750954504, "grad_norm": 1.404181718826294, "learning_rate": 7.181495989554187e-07, "loss": 0.1136, "step": 36280 }, { "epoch": 0.8461426364722684, "grad_norm": 1.2951421737670898, "learning_rate": 7.1807187713735e-07, "loss": 0.1163, "step": 36290 }, { "epoch": 0.8463757978490863, "grad_norm": 2.271350145339966, "learning_rate": 7.179941553192812e-07, "loss": 0.1202, "step": 36300 }, { "epoch": 0.8466089592259042, "grad_norm": 0.9758025407791138, "learning_rate": 7.179164335012124e-07, "loss": 0.1213, "step": 36310 }, { "epoch": 0.8468421206027221, "grad_norm": 1.6322691440582275, "learning_rate": 7.178387116831437e-07, "loss": 0.1174, "step": 36320 }, { "epoch": 0.8470752819795401, "grad_norm": 2.4754765033721924, "learning_rate": 7.177609898650749e-07, "loss": 0.1187, "step": 36330 }, { "epoch": 0.847308443356358, "grad_norm": 2.106555461883545, "learning_rate": 7.176832680470061e-07, "loss": 0.1182, "step": 36340 }, { "epoch": 0.847541604733176, "grad_norm": 1.7348238229751587, "learning_rate": 7.176055462289374e-07, "loss": 0.1203, "step": 36350 }, { "epoch": 0.8477747661099939, "grad_norm": 2.370684862136841, "learning_rate": 7.175278244108685e-07, "loss": 0.1073, "step": 36360 }, { "epoch": 0.8480079274868119, "grad_norm": 1.4328702688217163, "learning_rate": 7.174501025927998e-07, "loss": 0.1241, "step": 36370 }, { "epoch": 0.8482410888636297, "grad_norm": 2.182513952255249, "learning_rate": 7.17372380774731e-07, "loss": 0.1114, "step": 36380 }, { "epoch": 0.8484742502404476, "grad_norm": 1.478695034980774, "learning_rate": 7.172946589566623e-07, "loss": 0.1148, "step": 36390 }, { "epoch": 0.8487074116172656, "grad_norm": 1.5462872982025146, "learning_rate": 7.172169371385936e-07, "loss": 0.1072, "step": 36400 }, { "epoch": 0.8489405729940835, "grad_norm": 2.88889479637146, "learning_rate": 7.171392153205248e-07, "loss": 0.1287, "step": 36410 }, { "epoch": 0.8491737343709015, "grad_norm": 
2.0754201412200928, "learning_rate": 7.17061493502456e-07, "loss": 0.1215, "step": 36420 }, { "epoch": 0.8494068957477194, "grad_norm": 4.801910400390625, "learning_rate": 7.169837716843871e-07, "loss": 0.1295, "step": 36430 }, { "epoch": 0.8496400571245373, "grad_norm": 1.3510829210281372, "learning_rate": 7.169060498663184e-07, "loss": 0.1332, "step": 36440 }, { "epoch": 0.8498732185013552, "grad_norm": 3.421359062194824, "learning_rate": 7.168283280482497e-07, "loss": 0.1116, "step": 36450 }, { "epoch": 0.8501063798781732, "grad_norm": 1.650417685508728, "learning_rate": 7.167506062301809e-07, "loss": 0.1177, "step": 36460 }, { "epoch": 0.8503395412549911, "grad_norm": 1.3404638767242432, "learning_rate": 7.166728844121122e-07, "loss": 0.1299, "step": 36470 }, { "epoch": 0.8505727026318091, "grad_norm": 2.0052599906921387, "learning_rate": 7.165951625940434e-07, "loss": 0.1131, "step": 36480 }, { "epoch": 0.850805864008627, "grad_norm": 2.2116525173187256, "learning_rate": 7.165174407759746e-07, "loss": 0.1021, "step": 36490 }, { "epoch": 0.8510390253854448, "grad_norm": 2.330076217651367, "learning_rate": 7.164397189579058e-07, "loss": 0.1189, "step": 36500 }, { "epoch": 0.8512721867622628, "grad_norm": 3.9466230869293213, "learning_rate": 7.16361997139837e-07, "loss": 0.1296, "step": 36510 }, { "epoch": 0.8515053481390807, "grad_norm": 3.2259767055511475, "learning_rate": 7.162842753217683e-07, "loss": 0.1183, "step": 36520 }, { "epoch": 0.8517385095158987, "grad_norm": 3.4335849285125732, "learning_rate": 7.162065535036995e-07, "loss": 0.1276, "step": 36530 }, { "epoch": 0.8519716708927166, "grad_norm": 1.2652291059494019, "learning_rate": 7.161288316856308e-07, "loss": 0.1064, "step": 36540 }, { "epoch": 0.8522048322695346, "grad_norm": 1.714044451713562, "learning_rate": 7.16051109867562e-07, "loss": 0.1097, "step": 36550 }, { "epoch": 0.8524379936463525, "grad_norm": 1.7334665060043335, "learning_rate": 7.159733880494932e-07, "loss": 0.1212, "step": 36560 
}, { "epoch": 0.8526711550231704, "grad_norm": 4.668938636779785, "learning_rate": 7.158956662314245e-07, "loss": 0.1176, "step": 36570 }, { "epoch": 0.8529043163999883, "grad_norm": 1.4319543838500977, "learning_rate": 7.158179444133556e-07, "loss": 0.1043, "step": 36580 }, { "epoch": 0.8531374777768063, "grad_norm": 1.829672932624817, "learning_rate": 7.157402225952869e-07, "loss": 0.11, "step": 36590 }, { "epoch": 0.8533706391536242, "grad_norm": 1.9126214981079102, "learning_rate": 7.156625007772182e-07, "loss": 0.1341, "step": 36600 }, { "epoch": 0.8536038005304422, "grad_norm": 2.425161838531494, "learning_rate": 7.155847789591493e-07, "loss": 0.1125, "step": 36610 }, { "epoch": 0.8538369619072601, "grad_norm": 1.4539320468902588, "learning_rate": 7.155070571410806e-07, "loss": 0.1162, "step": 36620 }, { "epoch": 0.8540701232840779, "grad_norm": 2.0157039165496826, "learning_rate": 7.154293353230118e-07, "loss": 0.1234, "step": 36630 }, { "epoch": 0.8543032846608959, "grad_norm": 1.3319768905639648, "learning_rate": 7.153516135049431e-07, "loss": 0.1266, "step": 36640 }, { "epoch": 0.8545364460377138, "grad_norm": 1.413083791732788, "learning_rate": 7.152738916868744e-07, "loss": 0.1161, "step": 36650 }, { "epoch": 0.8547696074145318, "grad_norm": 1.5469775199890137, "learning_rate": 7.151961698688055e-07, "loss": 0.1321, "step": 36660 }, { "epoch": 0.8550027687913497, "grad_norm": 1.824620246887207, "learning_rate": 7.151184480507367e-07, "loss": 0.1179, "step": 36670 }, { "epoch": 0.8552359301681677, "grad_norm": 2.536097764968872, "learning_rate": 7.15040726232668e-07, "loss": 0.1158, "step": 36680 }, { "epoch": 0.8554690915449856, "grad_norm": 2.5310354232788086, "learning_rate": 7.149630044145992e-07, "loss": 0.1265, "step": 36690 }, { "epoch": 0.8557022529218035, "grad_norm": 1.6127253770828247, "learning_rate": 7.148852825965305e-07, "loss": 0.1291, "step": 36700 }, { "epoch": 0.8559354142986214, "grad_norm": 1.141382098197937, "learning_rate": 
7.148075607784617e-07, "loss": 0.116, "step": 36710 }, { "epoch": 0.8561685756754394, "grad_norm": 1.976431965827942, "learning_rate": 7.14729838960393e-07, "loss": 0.1289, "step": 36720 }, { "epoch": 0.8564017370522573, "grad_norm": 1.6419521570205688, "learning_rate": 7.146521171423243e-07, "loss": 0.1111, "step": 36730 }, { "epoch": 0.8566348984290753, "grad_norm": 1.963463306427002, "learning_rate": 7.145743953242553e-07, "loss": 0.122, "step": 36740 }, { "epoch": 0.8568680598058932, "grad_norm": 1.406437635421753, "learning_rate": 7.144966735061866e-07, "loss": 0.1095, "step": 36750 }, { "epoch": 0.857101221182711, "grad_norm": 2.021455764770508, "learning_rate": 7.144189516881178e-07, "loss": 0.1119, "step": 36760 }, { "epoch": 0.857334382559529, "grad_norm": 2.7139341831207275, "learning_rate": 7.143412298700491e-07, "loss": 0.1153, "step": 36770 }, { "epoch": 0.8575675439363469, "grad_norm": 2.041494846343994, "learning_rate": 7.142635080519804e-07, "loss": 0.1119, "step": 36780 }, { "epoch": 0.8578007053131649, "grad_norm": 1.40938138961792, "learning_rate": 7.141857862339116e-07, "loss": 0.1189, "step": 36790 }, { "epoch": 0.8580338666899828, "grad_norm": 1.4189975261688232, "learning_rate": 7.141080644158428e-07, "loss": 0.1177, "step": 36800 }, { "epoch": 0.8582670280668008, "grad_norm": 1.393607258796692, "learning_rate": 7.14030342597774e-07, "loss": 0.113, "step": 36810 }, { "epoch": 0.8585001894436186, "grad_norm": 1.4974188804626465, "learning_rate": 7.139526207797052e-07, "loss": 0.1217, "step": 36820 }, { "epoch": 0.8587333508204366, "grad_norm": 1.8439797163009644, "learning_rate": 7.138748989616365e-07, "loss": 0.1218, "step": 36830 }, { "epoch": 0.8589665121972545, "grad_norm": 1.8088417053222656, "learning_rate": 7.137971771435677e-07, "loss": 0.1245, "step": 36840 }, { "epoch": 0.8591996735740725, "grad_norm": 1.0959385633468628, "learning_rate": 7.13719455325499e-07, "loss": 0.1218, "step": 36850 }, { "epoch": 0.8594328349508904, 
"grad_norm": 1.7188485860824585, "learning_rate": 7.136417335074301e-07, "loss": 0.1055, "step": 36860 }, { "epoch": 0.8596659963277083, "grad_norm": 1.3552943468093872, "learning_rate": 7.135640116893614e-07, "loss": 0.119, "step": 36870 }, { "epoch": 0.8598991577045263, "grad_norm": 1.4336576461791992, "learning_rate": 7.134862898712927e-07, "loss": 0.1178, "step": 36880 }, { "epoch": 0.8601323190813441, "grad_norm": 3.0991134643554688, "learning_rate": 7.134085680532239e-07, "loss": 0.115, "step": 36890 }, { "epoch": 0.8603654804581621, "grad_norm": 1.1295329332351685, "learning_rate": 7.133308462351551e-07, "loss": 0.1234, "step": 36900 }, { "epoch": 0.86059864183498, "grad_norm": 2.1727442741394043, "learning_rate": 7.132531244170863e-07, "loss": 0.1218, "step": 36910 }, { "epoch": 0.860831803211798, "grad_norm": 1.3047373294830322, "learning_rate": 7.131754025990175e-07, "loss": 0.1201, "step": 36920 }, { "epoch": 0.8610649645886159, "grad_norm": 2.2786190509796143, "learning_rate": 7.130976807809488e-07, "loss": 0.1143, "step": 36930 }, { "epoch": 0.8612981259654339, "grad_norm": 1.4082914590835571, "learning_rate": 7.1301995896288e-07, "loss": 0.1076, "step": 36940 }, { "epoch": 0.8615312873422517, "grad_norm": 1.605339527130127, "learning_rate": 7.129422371448113e-07, "loss": 0.1156, "step": 36950 }, { "epoch": 0.8617644487190697, "grad_norm": 1.3914306163787842, "learning_rate": 7.128645153267425e-07, "loss": 0.127, "step": 36960 }, { "epoch": 0.8619976100958876, "grad_norm": 1.7351378202438354, "learning_rate": 7.127867935086738e-07, "loss": 0.1139, "step": 36970 }, { "epoch": 0.8622307714727055, "grad_norm": 1.2412322759628296, "learning_rate": 7.127090716906049e-07, "loss": 0.1146, "step": 36980 }, { "epoch": 0.8624639328495235, "grad_norm": 1.2703652381896973, "learning_rate": 7.126313498725361e-07, "loss": 0.1128, "step": 36990 }, { "epoch": 0.8626970942263414, "grad_norm": 2.5588021278381348, "learning_rate": 7.125536280544674e-07, "loss": 0.1366, 
"step": 37000 }, { "epoch": 0.8629302556031594, "grad_norm": 1.9820142984390259, "learning_rate": 7.124759062363986e-07, "loss": 0.1212, "step": 37010 }, { "epoch": 0.8631634169799772, "grad_norm": 2.006047010421753, "learning_rate": 7.123981844183299e-07, "loss": 0.1117, "step": 37020 }, { "epoch": 0.8633965783567952, "grad_norm": 2.5404064655303955, "learning_rate": 7.123204626002612e-07, "loss": 0.1264, "step": 37030 }, { "epoch": 0.8636297397336131, "grad_norm": 3.180729866027832, "learning_rate": 7.122427407821923e-07, "loss": 0.1182, "step": 37040 }, { "epoch": 0.8638629011104311, "grad_norm": 3.4220945835113525, "learning_rate": 7.121650189641236e-07, "loss": 0.119, "step": 37050 }, { "epoch": 0.864096062487249, "grad_norm": 1.2530611753463745, "learning_rate": 7.120872971460547e-07, "loss": 0.1295, "step": 37060 }, { "epoch": 0.864329223864067, "grad_norm": 3.0446932315826416, "learning_rate": 7.12009575327986e-07, "loss": 0.1008, "step": 37070 }, { "epoch": 0.8645623852408848, "grad_norm": 1.4836584329605103, "learning_rate": 7.119318535099173e-07, "loss": 0.1246, "step": 37080 }, { "epoch": 0.8647955466177027, "grad_norm": 1.5176305770874023, "learning_rate": 7.118541316918485e-07, "loss": 0.1155, "step": 37090 }, { "epoch": 0.8650287079945207, "grad_norm": 2.1800620555877686, "learning_rate": 7.117764098737797e-07, "loss": 0.1246, "step": 37100 }, { "epoch": 0.8652618693713386, "grad_norm": 1.0305883884429932, "learning_rate": 7.11698688055711e-07, "loss": 0.1176, "step": 37110 }, { "epoch": 0.8654950307481566, "grad_norm": 2.1110405921936035, "learning_rate": 7.116209662376422e-07, "loss": 0.1152, "step": 37120 }, { "epoch": 0.8657281921249745, "grad_norm": 2.111827850341797, "learning_rate": 7.115432444195735e-07, "loss": 0.1188, "step": 37130 }, { "epoch": 0.8659613535017924, "grad_norm": 3.423252582550049, "learning_rate": 7.114655226015046e-07, "loss": 0.1178, "step": 37140 }, { "epoch": 0.8661945148786103, "grad_norm": 1.251268744468689, 
"learning_rate": 7.113878007834359e-07, "loss": 0.1185, "step": 37150 }, { "epoch": 0.8664276762554283, "grad_norm": 2.100677013397217, "learning_rate": 7.11310078965367e-07, "loss": 0.1109, "step": 37160 }, { "epoch": 0.8666608376322462, "grad_norm": 1.6391624212265015, "learning_rate": 7.112323571472983e-07, "loss": 0.1093, "step": 37170 }, { "epoch": 0.8668939990090642, "grad_norm": 1.5075339078903198, "learning_rate": 7.111546353292296e-07, "loss": 0.1087, "step": 37180 }, { "epoch": 0.8671271603858821, "grad_norm": 4.467412948608398, "learning_rate": 7.110769135111608e-07, "loss": 0.1076, "step": 37190 }, { "epoch": 0.8673603217627001, "grad_norm": 2.07796311378479, "learning_rate": 7.109991916930921e-07, "loss": 0.114, "step": 37200 }, { "epoch": 0.8675934831395179, "grad_norm": 1.8772375583648682, "learning_rate": 7.109214698750234e-07, "loss": 0.1127, "step": 37210 }, { "epoch": 0.8678266445163358, "grad_norm": 1.1204174757003784, "learning_rate": 7.108437480569544e-07, "loss": 0.1119, "step": 37220 }, { "epoch": 0.8680598058931538, "grad_norm": 2.335515260696411, "learning_rate": 7.107660262388857e-07, "loss": 0.1154, "step": 37230 }, { "epoch": 0.8682929672699717, "grad_norm": 1.192286491394043, "learning_rate": 7.106883044208169e-07, "loss": 0.1229, "step": 37240 }, { "epoch": 0.8685261286467897, "grad_norm": 1.6077884435653687, "learning_rate": 7.106105826027482e-07, "loss": 0.111, "step": 37250 }, { "epoch": 0.8687592900236076, "grad_norm": 1.8203996419906616, "learning_rate": 7.105328607846795e-07, "loss": 0.1142, "step": 37260 }, { "epoch": 0.8689924514004255, "grad_norm": 1.952723503112793, "learning_rate": 7.104551389666107e-07, "loss": 0.1113, "step": 37270 }, { "epoch": 0.8692256127772434, "grad_norm": 1.6575145721435547, "learning_rate": 7.10377417148542e-07, "loss": 0.1141, "step": 37280 }, { "epoch": 0.8694587741540614, "grad_norm": 1.095949411392212, "learning_rate": 7.102996953304731e-07, "loss": 0.1073, "step": 37290 }, { "epoch": 
0.8696919355308793, "grad_norm": 2.281024694442749, "learning_rate": 7.102219735124043e-07, "loss": 0.1183, "step": 37300 }, { "epoch": 0.8699250969076973, "grad_norm": 1.6542482376098633, "learning_rate": 7.101442516943356e-07, "loss": 0.1137, "step": 37310 }, { "epoch": 0.8701582582845152, "grad_norm": 1.6646028757095337, "learning_rate": 7.100665298762668e-07, "loss": 0.1108, "step": 37320 }, { "epoch": 0.8703914196613332, "grad_norm": 2.283524513244629, "learning_rate": 7.099888080581981e-07, "loss": 0.1289, "step": 37330 }, { "epoch": 0.870624581038151, "grad_norm": 3.1887974739074707, "learning_rate": 7.099110862401293e-07, "loss": 0.1249, "step": 37340 }, { "epoch": 0.8708577424149689, "grad_norm": 1.2440279722213745, "learning_rate": 7.098333644220605e-07, "loss": 0.1135, "step": 37350 }, { "epoch": 0.8710909037917869, "grad_norm": 2.7194128036499023, "learning_rate": 7.097556426039918e-07, "loss": 0.1173, "step": 37360 }, { "epoch": 0.8713240651686048, "grad_norm": 2.2103805541992188, "learning_rate": 7.09677920785923e-07, "loss": 0.1205, "step": 37370 }, { "epoch": 0.8715572265454228, "grad_norm": 1.4609630107879639, "learning_rate": 7.096001989678542e-07, "loss": 0.1233, "step": 37380 }, { "epoch": 0.8717903879222407, "grad_norm": 3.4923629760742188, "learning_rate": 7.095224771497854e-07, "loss": 0.1155, "step": 37390 }, { "epoch": 0.8720235492990586, "grad_norm": 1.3066695928573608, "learning_rate": 7.094447553317167e-07, "loss": 0.1216, "step": 37400 }, { "epoch": 0.8722567106758765, "grad_norm": 1.5640212297439575, "learning_rate": 7.093670335136479e-07, "loss": 0.1183, "step": 37410 }, { "epoch": 0.8724898720526945, "grad_norm": 3.2942957878112793, "learning_rate": 7.092893116955791e-07, "loss": 0.1234, "step": 37420 }, { "epoch": 0.8727230334295124, "grad_norm": 2.556302070617676, "learning_rate": 7.092115898775104e-07, "loss": 0.1315, "step": 37430 }, { "epoch": 0.8729561948063304, "grad_norm": 1.1559804677963257, "learning_rate": 
7.091338680594416e-07, "loss": 0.121, "step": 37440 }, { "epoch": 0.8731893561831483, "grad_norm": 3.446047067642212, "learning_rate": 7.090561462413729e-07, "loss": 0.1111, "step": 37450 }, { "epoch": 0.8734225175599661, "grad_norm": 1.3277664184570312, "learning_rate": 7.089784244233041e-07, "loss": 0.1216, "step": 37460 }, { "epoch": 0.8736556789367841, "grad_norm": 1.2473676204681396, "learning_rate": 7.089007026052352e-07, "loss": 0.1059, "step": 37470 }, { "epoch": 0.873888840313602, "grad_norm": 3.6068766117095947, "learning_rate": 7.088229807871665e-07, "loss": 0.1215, "step": 37480 }, { "epoch": 0.87412200169042, "grad_norm": 1.7393157482147217, "learning_rate": 7.087452589690977e-07, "loss": 0.1158, "step": 37490 }, { "epoch": 0.8743551630672379, "grad_norm": 1.5452086925506592, "learning_rate": 7.08667537151029e-07, "loss": 0.1242, "step": 37500 }, { "epoch": 0.8745883244440559, "grad_norm": 3.3051626682281494, "learning_rate": 7.085898153329603e-07, "loss": 0.1126, "step": 37510 }, { "epoch": 0.8748214858208738, "grad_norm": 1.804885745048523, "learning_rate": 7.085120935148915e-07, "loss": 0.1158, "step": 37520 }, { "epoch": 0.8750546471976917, "grad_norm": 1.9905084371566772, "learning_rate": 7.084343716968227e-07, "loss": 0.1145, "step": 37530 }, { "epoch": 0.8752878085745096, "grad_norm": 1.919148564338684, "learning_rate": 7.08356649878754e-07, "loss": 0.1127, "step": 37540 }, { "epoch": 0.8755209699513276, "grad_norm": 2.419563055038452, "learning_rate": 7.082789280606851e-07, "loss": 0.1189, "step": 37550 }, { "epoch": 0.8757541313281455, "grad_norm": 1.584446907043457, "learning_rate": 7.082012062426164e-07, "loss": 0.124, "step": 37560 }, { "epoch": 0.8759872927049634, "grad_norm": 1.1461759805679321, "learning_rate": 7.081234844245476e-07, "loss": 0.1276, "step": 37570 }, { "epoch": 0.8762204540817814, "grad_norm": 3.613440752029419, "learning_rate": 7.080457626064789e-07, "loss": 0.1024, "step": 37580 }, { "epoch": 0.8764536154585992, 
"grad_norm": 1.9283149242401123, "learning_rate": 7.0796804078841e-07, "loss": 0.1242, "step": 37590 }, { "epoch": 0.8766867768354172, "grad_norm": null, "learning_rate": 7.078980911521482e-07, "loss": 0.1148, "step": 37600 }, { "epoch": 0.8769199382122351, "grad_norm": 3.0422792434692383, "learning_rate": 7.078203693340794e-07, "loss": 0.117, "step": 37610 }, { "epoch": 0.8771530995890531, "grad_norm": 2.6013214588165283, "learning_rate": 7.077426475160107e-07, "loss": 0.1282, "step": 37620 }, { "epoch": 0.877386260965871, "grad_norm": 2.352201461791992, "learning_rate": 7.076649256979419e-07, "loss": 0.1253, "step": 37630 }, { "epoch": 0.877619422342689, "grad_norm": 1.5410161018371582, "learning_rate": 7.075872038798731e-07, "loss": 0.1209, "step": 37640 }, { "epoch": 0.8778525837195069, "grad_norm": 1.3921153545379639, "learning_rate": 7.075094820618044e-07, "loss": 0.1252, "step": 37650 }, { "epoch": 0.8780857450963248, "grad_norm": 1.2015436887741089, "learning_rate": 7.074317602437356e-07, "loss": 0.125, "step": 37660 }, { "epoch": 0.8783189064731427, "grad_norm": 1.2868646383285522, "learning_rate": 7.073540384256668e-07, "loss": 0.108, "step": 37670 }, { "epoch": 0.8785520678499606, "grad_norm": 1.1909059286117554, "learning_rate": 7.072763166075981e-07, "loss": 0.1128, "step": 37680 }, { "epoch": 0.8787852292267786, "grad_norm": 1.6455142498016357, "learning_rate": 7.071985947895292e-07, "loss": 0.1096, "step": 37690 }, { "epoch": 0.8790183906035965, "grad_norm": 1.2198914289474487, "learning_rate": 7.071208729714605e-07, "loss": 0.1123, "step": 37700 }, { "epoch": 0.8792515519804145, "grad_norm": 1.581505537033081, "learning_rate": 7.070431511533917e-07, "loss": 0.111, "step": 37710 }, { "epoch": 0.8794847133572323, "grad_norm": 2.626009702682495, "learning_rate": 7.06965429335323e-07, "loss": 0.1333, "step": 37720 }, { "epoch": 0.8797178747340503, "grad_norm": 1.848440408706665, "learning_rate": 7.068877075172543e-07, "loss": 0.1249, "step": 37730 
}, { "epoch": 0.8799510361108682, "grad_norm": 1.3495765924453735, "learning_rate": 7.068099856991855e-07, "loss": 0.1206, "step": 37740 }, { "epoch": 0.8801841974876862, "grad_norm": 1.5246495008468628, "learning_rate": 7.067322638811167e-07, "loss": 0.1163, "step": 37750 }, { "epoch": 0.8804173588645041, "grad_norm": 3.978060483932495, "learning_rate": 7.066545420630478e-07, "loss": 0.1175, "step": 37760 }, { "epoch": 0.8806505202413221, "grad_norm": 1.4069900512695312, "learning_rate": 7.065768202449791e-07, "loss": 0.1159, "step": 37770 }, { "epoch": 0.8808836816181399, "grad_norm": 1.8039641380310059, "learning_rate": 7.064990984269104e-07, "loss": 0.1197, "step": 37780 }, { "epoch": 0.8811168429949578, "grad_norm": 1.0859750509262085, "learning_rate": 7.064213766088416e-07, "loss": 0.1237, "step": 37790 }, { "epoch": 0.8813500043717758, "grad_norm": 1.44515860080719, "learning_rate": 7.063436547907729e-07, "loss": 0.113, "step": 37800 }, { "epoch": 0.8815831657485937, "grad_norm": 1.6348532438278198, "learning_rate": 7.06265932972704e-07, "loss": 0.1317, "step": 37810 }, { "epoch": 0.8818163271254117, "grad_norm": 2.042015552520752, "learning_rate": 7.061882111546353e-07, "loss": 0.11, "step": 37820 }, { "epoch": 0.8820494885022296, "grad_norm": 2.1150503158569336, "learning_rate": 7.061104893365666e-07, "loss": 0.111, "step": 37830 }, { "epoch": 0.8822826498790476, "grad_norm": 2.1964111328125, "learning_rate": 7.060327675184977e-07, "loss": 0.1085, "step": 37840 }, { "epoch": 0.8825158112558654, "grad_norm": 1.5661616325378418, "learning_rate": 7.05955045700429e-07, "loss": 0.1186, "step": 37850 }, { "epoch": 0.8827489726326834, "grad_norm": 1.6059426069259644, "learning_rate": 7.058773238823603e-07, "loss": 0.1129, "step": 37860 }, { "epoch": 0.8829821340095013, "grad_norm": 1.7364482879638672, "learning_rate": 7.057996020642914e-07, "loss": 0.1141, "step": 37870 }, { "epoch": 0.8832152953863193, "grad_norm": 1.6191638708114624, "learning_rate": 
7.057218802462227e-07, "loss": 0.1164, "step": 37880 }, { "epoch": 0.8834484567631372, "grad_norm": 1.657626986503601, "learning_rate": 7.056441584281539e-07, "loss": 0.1113, "step": 37890 }, { "epoch": 0.8836816181399552, "grad_norm": 3.5146844387054443, "learning_rate": 7.055664366100852e-07, "loss": 0.1074, "step": 37900 }, { "epoch": 0.883914779516773, "grad_norm": 1.447134256362915, "learning_rate": 7.054887147920165e-07, "loss": 0.1119, "step": 37910 }, { "epoch": 0.884147940893591, "grad_norm": 2.180864095687866, "learning_rate": 7.054109929739476e-07, "loss": 0.118, "step": 37920 }, { "epoch": 0.8843811022704089, "grad_norm": 4.387259483337402, "learning_rate": 7.053332711558788e-07, "loss": 0.1138, "step": 37930 }, { "epoch": 0.8846142636472268, "grad_norm": 1.618281602859497, "learning_rate": 7.0525554933781e-07, "loss": 0.118, "step": 37940 }, { "epoch": 0.8848474250240448, "grad_norm": 1.7475018501281738, "learning_rate": 7.051778275197413e-07, "loss": 0.1166, "step": 37950 }, { "epoch": 0.8850805864008627, "grad_norm": 1.098446011543274, "learning_rate": 7.051001057016726e-07, "loss": 0.1041, "step": 37960 }, { "epoch": 0.8853137477776807, "grad_norm": 1.248266577720642, "learning_rate": 7.050223838836038e-07, "loss": 0.125, "step": 37970 }, { "epoch": 0.8855469091544985, "grad_norm": 1.2240104675292969, "learning_rate": 7.049446620655351e-07, "loss": 0.1202, "step": 37980 }, { "epoch": 0.8857800705313165, "grad_norm": 2.829810619354248, "learning_rate": 7.048669402474663e-07, "loss": 0.1128, "step": 37990 }, { "epoch": 0.8860132319081344, "grad_norm": 1.874094843864441, "learning_rate": 7.047892184293974e-07, "loss": 0.121, "step": 38000 }, { "epoch": 0.8862463932849524, "grad_norm": 1.3852428197860718, "learning_rate": 7.047114966113287e-07, "loss": 0.1184, "step": 38010 }, { "epoch": 0.8864795546617703, "grad_norm": 2.024085521697998, "learning_rate": 7.046337747932599e-07, "loss": 0.1068, "step": 38020 }, { "epoch": 0.8867127160385883, "grad_norm": 
1.0570365190505981, "learning_rate": 7.045560529751912e-07, "loss": 0.1244, "step": 38030 }, { "epoch": 0.8869458774154061, "grad_norm": 2.8184776306152344, "learning_rate": 7.044783311571224e-07, "loss": 0.1194, "step": 38040 }, { "epoch": 0.887179038792224, "grad_norm": 3.7182319164276123, "learning_rate": 7.044006093390537e-07, "loss": 0.1114, "step": 38050 }, { "epoch": 0.887412200169042, "grad_norm": 2.092766761779785, "learning_rate": 7.043228875209849e-07, "loss": 0.1085, "step": 38060 }, { "epoch": 0.8876453615458599, "grad_norm": 3.7464025020599365, "learning_rate": 7.042451657029161e-07, "loss": 0.1168, "step": 38070 }, { "epoch": 0.8878785229226779, "grad_norm": 1.444809079170227, "learning_rate": 7.041674438848473e-07, "loss": 0.1095, "step": 38080 }, { "epoch": 0.8881116842994958, "grad_norm": 3.2071213722229004, "learning_rate": 7.040897220667785e-07, "loss": 0.1188, "step": 38090 }, { "epoch": 0.8883448456763137, "grad_norm": 1.7708338499069214, "learning_rate": 7.040120002487098e-07, "loss": 0.1157, "step": 38100 }, { "epoch": 0.8885780070531316, "grad_norm": 1.1877387762069702, "learning_rate": 7.039342784306411e-07, "loss": 0.1222, "step": 38110 }, { "epoch": 0.8888111684299496, "grad_norm": 1.3482081890106201, "learning_rate": 7.038565566125722e-07, "loss": 0.1258, "step": 38120 }, { "epoch": 0.8890443298067675, "grad_norm": 1.5353195667266846, "learning_rate": 7.037788347945035e-07, "loss": 0.1173, "step": 38130 }, { "epoch": 0.8892774911835855, "grad_norm": 4.235682964324951, "learning_rate": 7.037011129764347e-07, "loss": 0.1111, "step": 38140 }, { "epoch": 0.8895106525604034, "grad_norm": 1.2734698057174683, "learning_rate": 7.03623391158366e-07, "loss": 0.1101, "step": 38150 }, { "epoch": 0.8897438139372214, "grad_norm": 1.5877876281738281, "learning_rate": 7.035456693402972e-07, "loss": 0.1171, "step": 38160 }, { "epoch": 0.8899769753140392, "grad_norm": 1.741897463798523, "learning_rate": 7.034679475222284e-07, "loss": 0.1234, "step": 
38170 }, { "epoch": 0.8902101366908571, "grad_norm": 3.293447971343994, "learning_rate": 7.033902257041596e-07, "loss": 0.1141, "step": 38180 }, { "epoch": 0.8904432980676751, "grad_norm": 1.8927139043807983, "learning_rate": 7.033125038860908e-07, "loss": 0.1224, "step": 38190 }, { "epoch": 0.890676459444493, "grad_norm": 1.6919082403182983, "learning_rate": 7.032347820680221e-07, "loss": 0.1292, "step": 38200 }, { "epoch": 0.890909620821311, "grad_norm": 1.4646605253219604, "learning_rate": 7.031570602499534e-07, "loss": 0.1132, "step": 38210 }, { "epoch": 0.8911427821981289, "grad_norm": 1.4049042463302612, "learning_rate": 7.030793384318846e-07, "loss": 0.1139, "step": 38220 }, { "epoch": 0.8913759435749468, "grad_norm": 1.1768524646759033, "learning_rate": 7.030016166138159e-07, "loss": 0.1132, "step": 38230 }, { "epoch": 0.8916091049517647, "grad_norm": 2.003056287765503, "learning_rate": 7.029238947957469e-07, "loss": 0.1231, "step": 38240 }, { "epoch": 0.8918422663285827, "grad_norm": 1.966365098953247, "learning_rate": 7.028461729776782e-07, "loss": 0.118, "step": 38250 }, { "epoch": 0.8920754277054006, "grad_norm": 1.5098166465759277, "learning_rate": 7.027684511596095e-07, "loss": 0.1163, "step": 38260 }, { "epoch": 0.8923085890822186, "grad_norm": 1.343883991241455, "learning_rate": 7.026907293415407e-07, "loss": 0.1216, "step": 38270 }, { "epoch": 0.8925417504590365, "grad_norm": 1.4305192232131958, "learning_rate": 7.02613007523472e-07, "loss": 0.121, "step": 38280 }, { "epoch": 0.8927749118358544, "grad_norm": 2.4723989963531494, "learning_rate": 7.025352857054033e-07, "loss": 0.1096, "step": 38290 }, { "epoch": 0.8930080732126723, "grad_norm": 1.2972848415374756, "learning_rate": 7.024575638873344e-07, "loss": 0.1177, "step": 38300 }, { "epoch": 0.8932412345894902, "grad_norm": 2.357872724533081, "learning_rate": 7.023798420692657e-07, "loss": 0.1144, "step": 38310 }, { "epoch": 0.8934743959663082, "grad_norm": 1.6204255819320679, "learning_rate": 
7.023021202511968e-07, "loss": 0.1108, "step": 38320 }, { "epoch": 0.8937075573431261, "grad_norm": 3.301833391189575, "learning_rate": 7.022243984331281e-07, "loss": 0.1376, "step": 38330 }, { "epoch": 0.8939407187199441, "grad_norm": 1.8966542482376099, "learning_rate": 7.021466766150594e-07, "loss": 0.1186, "step": 38340 }, { "epoch": 0.894173880096762, "grad_norm": 2.966651678085327, "learning_rate": 7.020689547969906e-07, "loss": 0.1139, "step": 38350 }, { "epoch": 0.8944070414735799, "grad_norm": 1.6939374208450317, "learning_rate": 7.019912329789218e-07, "loss": 0.1193, "step": 38360 }, { "epoch": 0.8946402028503978, "grad_norm": 1.2746779918670654, "learning_rate": 7.01913511160853e-07, "loss": 0.1039, "step": 38370 }, { "epoch": 0.8948733642272158, "grad_norm": 1.2845232486724854, "learning_rate": 7.018357893427843e-07, "loss": 0.1355, "step": 38380 }, { "epoch": 0.8951065256040337, "grad_norm": 1.5152326822280884, "learning_rate": 7.017580675247156e-07, "loss": 0.1204, "step": 38390 }, { "epoch": 0.8953396869808516, "grad_norm": 2.1050970554351807, "learning_rate": 7.016803457066467e-07, "loss": 0.129, "step": 38400 }, { "epoch": 0.8955728483576696, "grad_norm": 1.4035024642944336, "learning_rate": 7.01602623888578e-07, "loss": 0.1232, "step": 38410 }, { "epoch": 0.8958060097344874, "grad_norm": 1.2623019218444824, "learning_rate": 7.015249020705091e-07, "loss": 0.1098, "step": 38420 }, { "epoch": 0.8960391711113054, "grad_norm": 1.8892163038253784, "learning_rate": 7.014471802524404e-07, "loss": 0.1053, "step": 38430 }, { "epoch": 0.8962723324881233, "grad_norm": 1.1658780574798584, "learning_rate": 7.013694584343717e-07, "loss": 0.1256, "step": 38440 }, { "epoch": 0.8965054938649413, "grad_norm": 1.7665860652923584, "learning_rate": 7.012917366163029e-07, "loss": 0.1207, "step": 38450 }, { "epoch": 0.8967386552417592, "grad_norm": 1.0944561958312988, "learning_rate": 7.012140147982342e-07, "loss": 0.115, "step": 38460 }, { "epoch": 0.8969718166185772, 
"grad_norm": 1.0535740852355957, "learning_rate": 7.011362929801654e-07, "loss": 0.1145, "step": 38470 }, { "epoch": 0.8972049779953951, "grad_norm": 1.2894004583358765, "learning_rate": 7.010585711620965e-07, "loss": 0.1217, "step": 38480 }, { "epoch": 0.897438139372213, "grad_norm": 1.399780035018921, "learning_rate": 7.009808493440278e-07, "loss": 0.1378, "step": 38490 }, { "epoch": 0.8976713007490309, "grad_norm": 1.2338422536849976, "learning_rate": 7.00903127525959e-07, "loss": 0.1181, "step": 38500 }, { "epoch": 0.8979044621258488, "grad_norm": 1.4155422449111938, "learning_rate": 7.008254057078903e-07, "loss": 0.1174, "step": 38510 }, { "epoch": 0.8981376235026668, "grad_norm": 2.6366844177246094, "learning_rate": 7.007476838898215e-07, "loss": 0.1082, "step": 38520 }, { "epoch": 0.8983707848794847, "grad_norm": 1.3759453296661377, "learning_rate": 7.006699620717528e-07, "loss": 0.1151, "step": 38530 }, { "epoch": 0.8986039462563027, "grad_norm": 2.223649024963379, "learning_rate": 7.005922402536841e-07, "loss": 0.1292, "step": 38540 }, { "epoch": 0.8988371076331205, "grad_norm": 2.20562744140625, "learning_rate": 7.005145184356152e-07, "loss": 0.1317, "step": 38550 }, { "epoch": 0.8990702690099385, "grad_norm": 1.5760188102722168, "learning_rate": 7.004367966175464e-07, "loss": 0.1283, "step": 38560 }, { "epoch": 0.8993034303867564, "grad_norm": 1.1577081680297852, "learning_rate": 7.003590747994776e-07, "loss": 0.1053, "step": 38570 }, { "epoch": 0.8995365917635744, "grad_norm": 1.6759823560714722, "learning_rate": 7.002813529814089e-07, "loss": 0.1157, "step": 38580 }, { "epoch": 0.8997697531403923, "grad_norm": 1.326480507850647, "learning_rate": 7.002036311633402e-07, "loss": 0.1197, "step": 38590 }, { "epoch": 0.9000029145172103, "grad_norm": 1.4191431999206543, "learning_rate": 7.001259093452714e-07, "loss": 0.1224, "step": 38600 }, { "epoch": 0.9002360758940282, "grad_norm": 1.2576355934143066, "learning_rate": 7.000481875272026e-07, "loss": 0.1101, 
"step": 38610 }, { "epoch": 0.900469237270846, "grad_norm": 2.0770153999328613, "learning_rate": 6.999704657091338e-07, "loss": 0.1208, "step": 38620 }, { "epoch": 0.900702398647664, "grad_norm": 1.239411473274231, "learning_rate": 6.998927438910651e-07, "loss": 0.1069, "step": 38630 }, { "epoch": 0.9009355600244819, "grad_norm": 1.3443585634231567, "learning_rate": 6.998150220729963e-07, "loss": 0.1072, "step": 38640 }, { "epoch": 0.9011687214012999, "grad_norm": 1.815372347831726, "learning_rate": 6.997373002549275e-07, "loss": 0.1145, "step": 38650 }, { "epoch": 0.9014018827781178, "grad_norm": 1.6769992113113403, "learning_rate": 6.996595784368588e-07, "loss": 0.12, "step": 38660 }, { "epoch": 0.9016350441549358, "grad_norm": 2.7445108890533447, "learning_rate": 6.995818566187899e-07, "loss": 0.1085, "step": 38670 }, { "epoch": 0.9018682055317536, "grad_norm": 2.05379581451416, "learning_rate": 6.995041348007212e-07, "loss": 0.1073, "step": 38680 }, { "epoch": 0.9021013669085716, "grad_norm": 1.5554885864257812, "learning_rate": 6.994264129826525e-07, "loss": 0.1045, "step": 38690 }, { "epoch": 0.9023345282853895, "grad_norm": 2.910382032394409, "learning_rate": 6.993486911645837e-07, "loss": 0.118, "step": 38700 }, { "epoch": 0.9025676896622075, "grad_norm": 1.8133081197738647, "learning_rate": 6.99270969346515e-07, "loss": 0.1154, "step": 38710 }, { "epoch": 0.9028008510390254, "grad_norm": 1.4638948440551758, "learning_rate": 6.991932475284461e-07, "loss": 0.1234, "step": 38720 }, { "epoch": 0.9030340124158434, "grad_norm": 3.341332197189331, "learning_rate": 6.991155257103773e-07, "loss": 0.1084, "step": 38730 }, { "epoch": 0.9032671737926612, "grad_norm": 2.2149460315704346, "learning_rate": 6.990378038923086e-07, "loss": 0.1147, "step": 38740 }, { "epoch": 0.9035003351694791, "grad_norm": 1.1375399827957153, "learning_rate": 6.989600820742398e-07, "loss": 0.111, "step": 38750 }, { "epoch": 0.9037334965462971, "grad_norm": 1.5561516284942627, 
"learning_rate": 6.988823602561711e-07, "loss": 0.1185, "step": 38760 }, { "epoch": 0.903966657923115, "grad_norm": 1.3535046577453613, "learning_rate": 6.988046384381024e-07, "loss": 0.1118, "step": 38770 }, { "epoch": 0.904199819299933, "grad_norm": 2.6663782596588135, "learning_rate": 6.987269166200336e-07, "loss": 0.1115, "step": 38780 }, { "epoch": 0.9044329806767509, "grad_norm": 4.328267574310303, "learning_rate": 6.986491948019648e-07, "loss": 0.1174, "step": 38790 }, { "epoch": 0.9046661420535689, "grad_norm": 1.1902005672454834, "learning_rate": 6.985714729838959e-07, "loss": 0.1091, "step": 38800 }, { "epoch": 0.9048993034303867, "grad_norm": 1.8743469715118408, "learning_rate": 6.984937511658272e-07, "loss": 0.126, "step": 38810 }, { "epoch": 0.9051324648072047, "grad_norm": 1.5374979972839355, "learning_rate": 6.984160293477585e-07, "loss": 0.116, "step": 38820 }, { "epoch": 0.9053656261840226, "grad_norm": 1.4931840896606445, "learning_rate": 6.983383075296897e-07, "loss": 0.1268, "step": 38830 }, { "epoch": 0.9055987875608406, "grad_norm": 1.802624225616455, "learning_rate": 6.98260585711621e-07, "loss": 0.1132, "step": 38840 }, { "epoch": 0.9058319489376585, "grad_norm": 1.9927912950515747, "learning_rate": 6.981828638935522e-07, "loss": 0.1205, "step": 38850 }, { "epoch": 0.9060651103144765, "grad_norm": 2.3076839447021484, "learning_rate": 6.981051420754834e-07, "loss": 0.1081, "step": 38860 }, { "epoch": 0.9062982716912943, "grad_norm": 1.933112382888794, "learning_rate": 6.980274202574147e-07, "loss": 0.1144, "step": 38870 }, { "epoch": 0.9065314330681122, "grad_norm": 1.2487859725952148, "learning_rate": 6.979496984393458e-07, "loss": 0.1239, "step": 38880 }, { "epoch": 0.9067645944449302, "grad_norm": 2.185317277908325, "learning_rate": 6.978719766212771e-07, "loss": 0.1269, "step": 38890 }, { "epoch": 0.9069977558217481, "grad_norm": 1.1558573246002197, "learning_rate": 6.977942548032083e-07, "loss": 0.1103, "step": 38900 }, { "epoch": 
0.9072309171985661, "grad_norm": 3.2731194496154785, "learning_rate": 6.977165329851396e-07, "loss": 0.1258, "step": 38910 }, { "epoch": 0.907464078575384, "grad_norm": 1.431923508644104, "learning_rate": 6.976388111670708e-07, "loss": 0.118, "step": 38920 }, { "epoch": 0.907697239952202, "grad_norm": 2.022737979888916, "learning_rate": 6.97561089349002e-07, "loss": 0.1152, "step": 38930 }, { "epoch": 0.9079304013290198, "grad_norm": 2.605619192123413, "learning_rate": 6.974833675309333e-07, "loss": 0.1208, "step": 38940 }, { "epoch": 0.9081635627058378, "grad_norm": 1.1989493370056152, "learning_rate": 6.974056457128645e-07, "loss": 0.1064, "step": 38950 }, { "epoch": 0.9083967240826557, "grad_norm": 1.6098014116287231, "learning_rate": 6.973279238947957e-07, "loss": 0.098, "step": 38960 }, { "epoch": 0.9086298854594737, "grad_norm": 1.7319577932357788, "learning_rate": 6.97250202076727e-07, "loss": 0.1072, "step": 38970 }, { "epoch": 0.9088630468362916, "grad_norm": 3.1951308250427246, "learning_rate": 6.971724802586581e-07, "loss": 0.1201, "step": 38980 }, { "epoch": 0.9090962082131095, "grad_norm": 3.460256338119507, "learning_rate": 6.970947584405894e-07, "loss": 0.1126, "step": 38990 }, { "epoch": 0.9093293695899274, "grad_norm": 1.6915029287338257, "learning_rate": 6.970170366225206e-07, "loss": 0.116, "step": 39000 }, { "epoch": 0.9095625309667453, "grad_norm": 1.7154955863952637, "learning_rate": 6.969393148044519e-07, "loss": 0.1179, "step": 39010 }, { "epoch": 0.9097956923435633, "grad_norm": 1.2584154605865479, "learning_rate": 6.968615929863832e-07, "loss": 0.1033, "step": 39020 }, { "epoch": 0.9100288537203812, "grad_norm": 1.324416995048523, "learning_rate": 6.967838711683144e-07, "loss": 0.1119, "step": 39030 }, { "epoch": 0.9102620150971992, "grad_norm": 1.322080373764038, "learning_rate": 6.967061493502455e-07, "loss": 0.1094, "step": 39040 }, { "epoch": 0.9104951764740171, "grad_norm": 1.7856014966964722, "learning_rate": 6.966284275321767e-07, 
"loss": 0.1203, "step": 39050 }, { "epoch": 0.910728337850835, "grad_norm": 1.409688115119934, "learning_rate": 6.96550705714108e-07, "loss": 0.1284, "step": 39060 }, { "epoch": 0.9109614992276529, "grad_norm": 2.2346417903900146, "learning_rate": 6.964729838960393e-07, "loss": 0.1136, "step": 39070 }, { "epoch": 0.9111946606044709, "grad_norm": 1.3720629215240479, "learning_rate": 6.963952620779705e-07, "loss": 0.1201, "step": 39080 }, { "epoch": 0.9114278219812888, "grad_norm": 1.178660273551941, "learning_rate": 6.963175402599018e-07, "loss": 0.1099, "step": 39090 }, { "epoch": 0.9116609833581067, "grad_norm": 1.9649585485458374, "learning_rate": 6.962398184418329e-07, "loss": 0.1259, "step": 39100 }, { "epoch": 0.9118941447349247, "grad_norm": 3.079644203186035, "learning_rate": 6.961620966237642e-07, "loss": 0.118, "step": 39110 }, { "epoch": 0.9121273061117426, "grad_norm": 1.560988187789917, "learning_rate": 6.960843748056954e-07, "loss": 0.1144, "step": 39120 }, { "epoch": 0.9123604674885605, "grad_norm": 1.2139637470245361, "learning_rate": 6.960066529876266e-07, "loss": 0.1174, "step": 39130 }, { "epoch": 0.9125936288653784, "grad_norm": 1.8172645568847656, "learning_rate": 6.959289311695579e-07, "loss": 0.1106, "step": 39140 }, { "epoch": 0.9128267902421964, "grad_norm": 1.7119739055633545, "learning_rate": 6.958512093514892e-07, "loss": 0.1113, "step": 39150 }, { "epoch": 0.9130599516190143, "grad_norm": 1.2801451683044434, "learning_rate": 6.957734875334203e-07, "loss": 0.1116, "step": 39160 }, { "epoch": 0.9132931129958323, "grad_norm": 1.1552625894546509, "learning_rate": 6.956957657153516e-07, "loss": 0.1028, "step": 39170 }, { "epoch": 0.9135262743726502, "grad_norm": 2.1562695503234863, "learning_rate": 6.956180438972828e-07, "loss": 0.1152, "step": 39180 }, { "epoch": 0.913759435749468, "grad_norm": 2.5852138996124268, "learning_rate": 6.955403220792141e-07, "loss": 0.1344, "step": 39190 }, { "epoch": 0.913992597126286, "grad_norm": 
2.4775121212005615, "learning_rate": 6.954626002611453e-07, "loss": 0.1215, "step": 39200 }, { "epoch": 0.914225758503104, "grad_norm": 1.6799473762512207, "learning_rate": 6.953848784430765e-07, "loss": 0.1196, "step": 39210 }, { "epoch": 0.9144589198799219, "grad_norm": 1.402654767036438, "learning_rate": 6.953071566250077e-07, "loss": 0.1268, "step": 39220 }, { "epoch": 0.9146920812567398, "grad_norm": 1.4439867734909058, "learning_rate": 6.952294348069389e-07, "loss": 0.1139, "step": 39230 }, { "epoch": 0.9149252426335578, "grad_norm": 2.8241028785705566, "learning_rate": 6.951517129888702e-07, "loss": 0.1186, "step": 39240 }, { "epoch": 0.9151584040103757, "grad_norm": 1.341476559638977, "learning_rate": 6.950739911708015e-07, "loss": 0.1181, "step": 39250 }, { "epoch": 0.9153915653871936, "grad_norm": 1.4604337215423584, "learning_rate": 6.949962693527327e-07, "loss": 0.1193, "step": 39260 }, { "epoch": 0.9156247267640115, "grad_norm": 1.07036554813385, "learning_rate": 6.94918547534664e-07, "loss": 0.1256, "step": 39270 }, { "epoch": 0.9158578881408295, "grad_norm": 1.7344911098480225, "learning_rate": 6.94840825716595e-07, "loss": 0.1318, "step": 39280 }, { "epoch": 0.9160910495176474, "grad_norm": 1.7332415580749512, "learning_rate": 6.947631038985263e-07, "loss": 0.1047, "step": 39290 }, { "epoch": 0.9163242108944654, "grad_norm": 1.3245272636413574, "learning_rate": 6.946853820804576e-07, "loss": 0.1202, "step": 39300 }, { "epoch": 0.9165573722712833, "grad_norm": 2.509654998779297, "learning_rate": 6.946076602623888e-07, "loss": 0.1057, "step": 39310 }, { "epoch": 0.9167905336481011, "grad_norm": 1.3310962915420532, "learning_rate": 6.945299384443201e-07, "loss": 0.121, "step": 39320 }, { "epoch": 0.9170236950249191, "grad_norm": 1.240966796875, "learning_rate": 6.944522166262513e-07, "loss": 0.124, "step": 39330 }, { "epoch": 0.917256856401737, "grad_norm": 1.0788782835006714, "learning_rate": 6.943744948081826e-07, "loss": 0.1161, "step": 39340 }, { 
"epoch": 0.917490017778555, "grad_norm": 3.4852590560913086, "learning_rate": 6.942967729901138e-07, "loss": 0.1155, "step": 39350 }, { "epoch": 0.9177231791553729, "grad_norm": 1.8163145780563354, "learning_rate": 6.942190511720449e-07, "loss": 0.1229, "step": 39360 }, { "epoch": 0.9179563405321909, "grad_norm": 1.6048811674118042, "learning_rate": 6.941413293539762e-07, "loss": 0.1277, "step": 39370 }, { "epoch": 0.9181895019090087, "grad_norm": 3.196209192276001, "learning_rate": 6.940636075359074e-07, "loss": 0.1074, "step": 39380 }, { "epoch": 0.9184226632858267, "grad_norm": 1.2984931468963623, "learning_rate": 6.939858857178387e-07, "loss": 0.121, "step": 39390 }, { "epoch": 0.9186558246626446, "grad_norm": 2.1133933067321777, "learning_rate": 6.9390816389977e-07, "loss": 0.1201, "step": 39400 }, { "epoch": 0.9188889860394626, "grad_norm": 1.275447130203247, "learning_rate": 6.938304420817011e-07, "loss": 0.1188, "step": 39410 }, { "epoch": 0.9191221474162805, "grad_norm": 1.6964634656906128, "learning_rate": 6.937527202636324e-07, "loss": 0.1144, "step": 39420 }, { "epoch": 0.9193553087930985, "grad_norm": 1.4190950393676758, "learning_rate": 6.936749984455636e-07, "loss": 0.1148, "step": 39430 }, { "epoch": 0.9195884701699164, "grad_norm": 2.2941174507141113, "learning_rate": 6.935972766274948e-07, "loss": 0.1239, "step": 39440 }, { "epoch": 0.9198216315467342, "grad_norm": 1.945504903793335, "learning_rate": 6.935195548094261e-07, "loss": 0.1121, "step": 39450 }, { "epoch": 0.9200547929235522, "grad_norm": 1.7797194719314575, "learning_rate": 6.934418329913573e-07, "loss": 0.112, "step": 39460 }, { "epoch": 0.9202879543003701, "grad_norm": 1.284010887145996, "learning_rate": 6.933641111732885e-07, "loss": 0.1254, "step": 39470 }, { "epoch": 0.9205211156771881, "grad_norm": 1.4987906217575073, "learning_rate": 6.932863893552197e-07, "loss": 0.0995, "step": 39480 }, { "epoch": 0.920754277054006, "grad_norm": 1.9471354484558105, "learning_rate": 
6.93208667537151e-07, "loss": 0.1378, "step": 39490 }, { "epoch": 0.920987438430824, "grad_norm": 3.005551338195801, "learning_rate": 6.931309457190823e-07, "loss": 0.1109, "step": 39500 }, { "epoch": 0.9212205998076418, "grad_norm": 1.1894420385360718, "learning_rate": 6.930532239010135e-07, "loss": 0.1093, "step": 39510 }, { "epoch": 0.9214537611844598, "grad_norm": 1.5739129781723022, "learning_rate": 6.929755020829447e-07, "loss": 0.1223, "step": 39520 }, { "epoch": 0.9216869225612777, "grad_norm": 1.338074803352356, "learning_rate": 6.928977802648758e-07, "loss": 0.1099, "step": 39530 }, { "epoch": 0.9219200839380957, "grad_norm": 2.2203333377838135, "learning_rate": 6.928200584468071e-07, "loss": 0.1156, "step": 39540 }, { "epoch": 0.9221532453149136, "grad_norm": 1.157416582107544, "learning_rate": 6.927423366287384e-07, "loss": 0.1213, "step": 39550 }, { "epoch": 0.9223864066917316, "grad_norm": 1.513593316078186, "learning_rate": 6.926646148106696e-07, "loss": 0.1117, "step": 39560 }, { "epoch": 0.9226195680685494, "grad_norm": 3.77400279045105, "learning_rate": 6.925868929926009e-07, "loss": 0.1122, "step": 39570 }, { "epoch": 0.9228527294453673, "grad_norm": 1.854946494102478, "learning_rate": 6.925091711745322e-07, "loss": 0.1278, "step": 39580 }, { "epoch": 0.9230858908221853, "grad_norm": 1.9816689491271973, "learning_rate": 6.924314493564633e-07, "loss": 0.1169, "step": 39590 }, { "epoch": 0.9233190521990032, "grad_norm": 1.72904372215271, "learning_rate": 6.923537275383945e-07, "loss": 0.1113, "step": 39600 }, { "epoch": 0.9235522135758212, "grad_norm": 2.736794948577881, "learning_rate": 6.922837779021327e-07, "loss": 0.1033, "step": 39610 }, { "epoch": 0.9237853749526391, "grad_norm": 2.5969643592834473, "learning_rate": 6.922060560840639e-07, "loss": 0.1098, "step": 39620 }, { "epoch": 0.9240185363294571, "grad_norm": 1.3864169120788574, "learning_rate": 6.921283342659951e-07, "loss": 0.1125, "step": 39630 }, { "epoch": 0.9242516977062749, 
"grad_norm": 2.7517521381378174, "learning_rate": 6.920506124479264e-07, "loss": 0.1104, "step": 39640 }, { "epoch": 0.9244848590830929, "grad_norm": 2.987269163131714, "learning_rate": 6.919728906298575e-07, "loss": 0.1216, "step": 39650 }, { "epoch": 0.9247180204599108, "grad_norm": 1.8881045579910278, "learning_rate": 6.918951688117888e-07, "loss": 0.1308, "step": 39660 }, { "epoch": 0.9249511818367288, "grad_norm": 2.014983654022217, "learning_rate": 6.918174469937201e-07, "loss": 0.1116, "step": 39670 }, { "epoch": 0.9251843432135467, "grad_norm": 1.1069363355636597, "learning_rate": 6.917397251756512e-07, "loss": 0.1185, "step": 39680 }, { "epoch": 0.9254175045903646, "grad_norm": 1.5793156623840332, "learning_rate": 6.916620033575825e-07, "loss": 0.1109, "step": 39690 }, { "epoch": 0.9256506659671825, "grad_norm": 1.2553865909576416, "learning_rate": 6.915842815395137e-07, "loss": 0.1153, "step": 39700 }, { "epoch": 0.9258838273440004, "grad_norm": 2.0308432579040527, "learning_rate": 6.91506559721445e-07, "loss": 0.124, "step": 39710 }, { "epoch": 0.9261169887208184, "grad_norm": 4.794121742248535, "learning_rate": 6.914288379033763e-07, "loss": 0.1148, "step": 39720 }, { "epoch": 0.9263501500976363, "grad_norm": 1.383956789970398, "learning_rate": 6.913511160853074e-07, "loss": 0.1002, "step": 39730 }, { "epoch": 0.9265833114744543, "grad_norm": 1.8121511936187744, "learning_rate": 6.912733942672387e-07, "loss": 0.1187, "step": 39740 }, { "epoch": 0.9268164728512722, "grad_norm": 1.9470438957214355, "learning_rate": 6.911956724491698e-07, "loss": 0.1174, "step": 39750 }, { "epoch": 0.9270496342280902, "grad_norm": 1.7574323415756226, "learning_rate": 6.911179506311011e-07, "loss": 0.1106, "step": 39760 }, { "epoch": 0.927282795604908, "grad_norm": 2.84851336479187, "learning_rate": 6.910402288130324e-07, "loss": 0.1123, "step": 39770 }, { "epoch": 0.927515956981726, "grad_norm": 2.5602118968963623, "learning_rate": 6.909625069949636e-07, "loss": 0.1273, 
"step": 39780 }, { "epoch": 0.9277491183585439, "grad_norm": 1.2772130966186523, "learning_rate": 6.908847851768949e-07, "loss": 0.1293, "step": 39790 }, { "epoch": 0.9279822797353618, "grad_norm": 1.4170210361480713, "learning_rate": 6.908070633588261e-07, "loss": 0.1086, "step": 39800 }, { "epoch": 0.9282154411121798, "grad_norm": 3.469127655029297, "learning_rate": 6.907293415407572e-07, "loss": 0.1128, "step": 39810 }, { "epoch": 0.9284486024889977, "grad_norm": 1.5920612812042236, "learning_rate": 6.906516197226885e-07, "loss": 0.1182, "step": 39820 }, { "epoch": 0.9286817638658156, "grad_norm": 1.9303433895111084, "learning_rate": 6.905738979046197e-07, "loss": 0.1222, "step": 39830 }, { "epoch": 0.9289149252426335, "grad_norm": 2.590549945831299, "learning_rate": 6.90496176086551e-07, "loss": 0.1006, "step": 39840 }, { "epoch": 0.9291480866194515, "grad_norm": 1.3032184839248657, "learning_rate": 6.904184542684822e-07, "loss": 0.0997, "step": 39850 }, { "epoch": 0.9293812479962694, "grad_norm": 1.0696184635162354, "learning_rate": 6.903407324504135e-07, "loss": 0.0986, "step": 39860 }, { "epoch": 0.9296144093730874, "grad_norm": 3.174858808517456, "learning_rate": 6.902630106323447e-07, "loss": 0.1245, "step": 39870 }, { "epoch": 0.9298475707499053, "grad_norm": 2.241065502166748, "learning_rate": 6.901852888142759e-07, "loss": 0.1265, "step": 39880 }, { "epoch": 0.9300807321267232, "grad_norm": 2.1461822986602783, "learning_rate": 6.901075669962071e-07, "loss": 0.1178, "step": 39890 }, { "epoch": 0.9303138935035411, "grad_norm": 1.182816982269287, "learning_rate": 6.900298451781383e-07, "loss": 0.1222, "step": 39900 }, { "epoch": 0.930547054880359, "grad_norm": 1.429795503616333, "learning_rate": 6.899521233600696e-07, "loss": 0.1218, "step": 39910 }, { "epoch": 0.930780216257177, "grad_norm": 1.711546540260315, "learning_rate": 6.898744015420009e-07, "loss": 0.123, "step": 39920 }, { "epoch": 0.9310133776339949, "grad_norm": 3.5475361347198486, 
"learning_rate": 6.89796679723932e-07, "loss": 0.1105, "step": 39930 }, { "epoch": 0.9312465390108129, "grad_norm": 3.0152077674865723, "learning_rate": 6.897189579058633e-07, "loss": 0.1099, "step": 39940 }, { "epoch": 0.9314797003876308, "grad_norm": 1.8523815870285034, "learning_rate": 6.896412360877946e-07, "loss": 0.1166, "step": 39950 }, { "epoch": 0.9317128617644487, "grad_norm": 1.1992908716201782, "learning_rate": 6.895635142697258e-07, "loss": 0.1093, "step": 39960 }, { "epoch": 0.9319460231412666, "grad_norm": 1.1772156953811646, "learning_rate": 6.89485792451657e-07, "loss": 0.1152, "step": 39970 }, { "epoch": 0.9321791845180846, "grad_norm": 4.2854156494140625, "learning_rate": 6.894080706335882e-07, "loss": 0.1037, "step": 39980 }, { "epoch": 0.9324123458949025, "grad_norm": 1.7519090175628662, "learning_rate": 6.893303488155194e-07, "loss": 0.1283, "step": 39990 }, { "epoch": 0.9326455072717205, "grad_norm": 1.9794650077819824, "learning_rate": 6.892526269974507e-07, "loss": 0.1191, "step": 40000 }, { "epoch": 0.9326455072717205, "eval_accuracy": 0.9410596540391427, "eval_f1": 0.9577300517322465, "eval_loss": 0.1523699313402176, "eval_runtime": 4481.0174, "eval_samples_per_second": 408.371, "eval_steps_per_second": 51.046, "step": 40000 }, { "epoch": 0.9328786686485384, "grad_norm": 1.4325065612792969, "learning_rate": 6.891749051793819e-07, "loss": 0.1194, "step": 40010 }, { "epoch": 0.9331118300253562, "grad_norm": 1.974455714225769, "learning_rate": 6.890971833613132e-07, "loss": 0.108, "step": 40020 }, { "epoch": 0.9333449914021742, "grad_norm": 2.33461332321167, "learning_rate": 6.890194615432444e-07, "loss": 0.1169, "step": 40030 }, { "epoch": 0.9335781527789921, "grad_norm": 1.0861070156097412, "learning_rate": 6.889417397251757e-07, "loss": 0.1136, "step": 40040 }, { "epoch": 0.9338113141558101, "grad_norm": 1.39127516746521, "learning_rate": 6.888640179071067e-07, "loss": 0.1108, "step": 40050 }, { "epoch": 0.934044475532628, "grad_norm": 
2.1045925617218018, "learning_rate": 6.88786296089038e-07, "loss": 0.1094, "step": 40060 }, { "epoch": 0.934277636909446, "grad_norm": 1.3949681520462036, "learning_rate": 6.887085742709693e-07, "loss": 0.112, "step": 40070 }, { "epoch": 0.9345107982862639, "grad_norm": 1.1712353229522705, "learning_rate": 6.886308524529005e-07, "loss": 0.1144, "step": 40080 }, { "epoch": 0.9347439596630818, "grad_norm": 1.6479144096374512, "learning_rate": 6.885531306348318e-07, "loss": 0.1237, "step": 40090 }, { "epoch": 0.9349771210398997, "grad_norm": 1.4633342027664185, "learning_rate": 6.884754088167631e-07, "loss": 0.1214, "step": 40100 }, { "epoch": 0.9352102824167177, "grad_norm": 1.2352524995803833, "learning_rate": 6.883976869986943e-07, "loss": 0.1105, "step": 40110 }, { "epoch": 0.9354434437935356, "grad_norm": 2.1361072063446045, "learning_rate": 6.883199651806255e-07, "loss": 0.1214, "step": 40120 }, { "epoch": 0.9356766051703536, "grad_norm": 1.0809037685394287, "learning_rate": 6.882422433625566e-07, "loss": 0.1145, "step": 40130 }, { "epoch": 0.9359097665471715, "grad_norm": 1.8325746059417725, "learning_rate": 6.881645215444879e-07, "loss": 0.1078, "step": 40140 }, { "epoch": 0.9361429279239893, "grad_norm": 1.6764893531799316, "learning_rate": 6.880867997264192e-07, "loss": 0.1097, "step": 40150 }, { "epoch": 0.9363760893008073, "grad_norm": 1.9020750522613525, "learning_rate": 6.880090779083504e-07, "loss": 0.1237, "step": 40160 }, { "epoch": 0.9366092506776252, "grad_norm": 1.4131368398666382, "learning_rate": 6.879313560902817e-07, "loss": 0.1204, "step": 40170 }, { "epoch": 0.9368424120544432, "grad_norm": 2.5343048572540283, "learning_rate": 6.878536342722128e-07, "loss": 0.1082, "step": 40180 }, { "epoch": 0.9370755734312611, "grad_norm": 1.7008570432662964, "learning_rate": 6.877759124541441e-07, "loss": 0.1207, "step": 40190 }, { "epoch": 0.9373087348080791, "grad_norm": 1.3675639629364014, "learning_rate": 6.876981906360754e-07, "loss": 0.1187, "step": 
40200 }, { "epoch": 0.9375418961848969, "grad_norm": 1.5322265625, "learning_rate": 6.876204688180065e-07, "loss": 0.1209, "step": 40210 }, { "epoch": 0.9377750575617149, "grad_norm": 3.4204726219177246, "learning_rate": 6.875427469999378e-07, "loss": 0.1168, "step": 40220 }, { "epoch": 0.9380082189385328, "grad_norm": 2.856760025024414, "learning_rate": 6.87465025181869e-07, "loss": 0.1119, "step": 40230 }, { "epoch": 0.9382413803153508, "grad_norm": 1.702548861503601, "learning_rate": 6.873873033638002e-07, "loss": 0.1114, "step": 40240 }, { "epoch": 0.9384745416921687, "grad_norm": 2.9587042331695557, "learning_rate": 6.873095815457315e-07, "loss": 0.116, "step": 40250 }, { "epoch": 0.9387077030689867, "grad_norm": 1.2163165807724, "learning_rate": 6.872318597276627e-07, "loss": 0.1223, "step": 40260 }, { "epoch": 0.9389408644458046, "grad_norm": 1.4739196300506592, "learning_rate": 6.87154137909594e-07, "loss": 0.11, "step": 40270 }, { "epoch": 0.9391740258226224, "grad_norm": 2.5015242099761963, "learning_rate": 6.870764160915252e-07, "loss": 0.13, "step": 40280 }, { "epoch": 0.9394071871994404, "grad_norm": 2.9137377738952637, "learning_rate": 6.869986942734564e-07, "loss": 0.1092, "step": 40290 }, { "epoch": 0.9396403485762583, "grad_norm": 1.4084973335266113, "learning_rate": 6.869209724553876e-07, "loss": 0.1098, "step": 40300 }, { "epoch": 0.9398735099530763, "grad_norm": 3.0058865547180176, "learning_rate": 6.868432506373188e-07, "loss": 0.1205, "step": 40310 }, { "epoch": 0.9401066713298942, "grad_norm": 2.9811086654663086, "learning_rate": 6.867655288192501e-07, "loss": 0.109, "step": 40320 }, { "epoch": 0.9403398327067122, "grad_norm": 4.499175548553467, "learning_rate": 6.866878070011813e-07, "loss": 0.1133, "step": 40330 }, { "epoch": 0.94057299408353, "grad_norm": 1.8525381088256836, "learning_rate": 6.866100851831126e-07, "loss": 0.1238, "step": 40340 }, { "epoch": 0.940806155460348, "grad_norm": 2.213452100753784, "learning_rate": 
6.865323633650439e-07, "loss": 0.1179, "step": 40350 }, { "epoch": 0.9410393168371659, "grad_norm": 1.3198356628417969, "learning_rate": 6.86454641546975e-07, "loss": 0.1092, "step": 40360 }, { "epoch": 0.9412724782139839, "grad_norm": 1.9753576517105103, "learning_rate": 6.863769197289063e-07, "loss": 0.1037, "step": 40370 }, { "epoch": 0.9415056395908018, "grad_norm": 1.723075270652771, "learning_rate": 6.862991979108374e-07, "loss": 0.1236, "step": 40380 }, { "epoch": 0.9417388009676197, "grad_norm": 2.2692267894744873, "learning_rate": 6.862214760927687e-07, "loss": 0.1273, "step": 40390 }, { "epoch": 0.9419719623444377, "grad_norm": 1.367195725440979, "learning_rate": 6.861437542747e-07, "loss": 0.1132, "step": 40400 }, { "epoch": 0.9422051237212555, "grad_norm": 1.2752375602722168, "learning_rate": 6.860738046384381e-07, "loss": 0.1169, "step": 40410 }, { "epoch": 0.9424382850980735, "grad_norm": 2.861013412475586, "learning_rate": 6.859960828203693e-07, "loss": 0.1091, "step": 40420 }, { "epoch": 0.9426714464748914, "grad_norm": 1.5969030857086182, "learning_rate": 6.859183610023005e-07, "loss": 0.1197, "step": 40430 }, { "epoch": 0.9429046078517094, "grad_norm": 1.863095998764038, "learning_rate": 6.858406391842318e-07, "loss": 0.1079, "step": 40440 }, { "epoch": 0.9431377692285273, "grad_norm": 1.7665997743606567, "learning_rate": 6.857629173661629e-07, "loss": 0.1196, "step": 40450 }, { "epoch": 0.9433709306053453, "grad_norm": 2.771472215652466, "learning_rate": 6.856851955480942e-07, "loss": 0.1105, "step": 40460 }, { "epoch": 0.9436040919821631, "grad_norm": 1.4519940614700317, "learning_rate": 6.856074737300255e-07, "loss": 0.1165, "step": 40470 }, { "epoch": 0.943837253358981, "grad_norm": 1.2026352882385254, "learning_rate": 6.855297519119567e-07, "loss": 0.1029, "step": 40480 }, { "epoch": 0.944070414735799, "grad_norm": 1.113072156906128, "learning_rate": 6.85452030093888e-07, "loss": 0.1081, "step": 40490 }, { "epoch": 0.944303576112617, 
"grad_norm": 1.5762003660202026, "learning_rate": 6.853743082758191e-07, "loss": 0.1255, "step": 40500 }, { "epoch": 0.9445367374894349, "grad_norm": 1.714988112449646, "learning_rate": 6.852965864577503e-07, "loss": 0.1161, "step": 40510 }, { "epoch": 0.9447698988662528, "grad_norm": 1.7626960277557373, "learning_rate": 6.852188646396816e-07, "loss": 0.1226, "step": 40520 }, { "epoch": 0.9450030602430707, "grad_norm": 1.8320553302764893, "learning_rate": 6.851411428216128e-07, "loss": 0.1056, "step": 40530 }, { "epoch": 0.9452362216198886, "grad_norm": 1.4610930681228638, "learning_rate": 6.850634210035441e-07, "loss": 0.1084, "step": 40540 }, { "epoch": 0.9454693829967066, "grad_norm": 2.0719118118286133, "learning_rate": 6.849856991854753e-07, "loss": 0.1207, "step": 40550 }, { "epoch": 0.9457025443735245, "grad_norm": 1.5941990613937378, "learning_rate": 6.849079773674066e-07, "loss": 0.1138, "step": 40560 }, { "epoch": 0.9459357057503425, "grad_norm": 1.3227380514144897, "learning_rate": 6.848302555493379e-07, "loss": 0.113, "step": 40570 }, { "epoch": 0.9461688671271604, "grad_norm": 1.4964702129364014, "learning_rate": 6.847525337312689e-07, "loss": 0.1168, "step": 40580 }, { "epoch": 0.9464020285039784, "grad_norm": 1.8601194620132446, "learning_rate": 6.846748119132002e-07, "loss": 0.1161, "step": 40590 }, { "epoch": 0.9466351898807962, "grad_norm": 1.754821538925171, "learning_rate": 6.845970900951314e-07, "loss": 0.1248, "step": 40600 }, { "epoch": 0.9468683512576141, "grad_norm": 3.1939971446990967, "learning_rate": 6.845193682770627e-07, "loss": 0.0944, "step": 40610 }, { "epoch": 0.9471015126344321, "grad_norm": 2.9287543296813965, "learning_rate": 6.84441646458994e-07, "loss": 0.1206, "step": 40620 }, { "epoch": 0.94733467401125, "grad_norm": 2.094985246658325, "learning_rate": 6.843639246409252e-07, "loss": 0.1116, "step": 40630 }, { "epoch": 0.947567835388068, "grad_norm": 1.7839607000350952, "learning_rate": 6.842862028228564e-07, "loss": 0.1029, 
"step": 40640 }, { "epoch": 0.9478009967648859, "grad_norm": 3.1961495876312256, "learning_rate": 6.842084810047876e-07, "loss": 0.1057, "step": 40650 }, { "epoch": 0.9480341581417038, "grad_norm": 2.364356517791748, "learning_rate": 6.841307591867189e-07, "loss": 0.1104, "step": 40660 }, { "epoch": 0.9482673195185217, "grad_norm": 2.0250072479248047, "learning_rate": 6.840530373686501e-07, "loss": 0.1137, "step": 40670 }, { "epoch": 0.9485004808953397, "grad_norm": 1.9003963470458984, "learning_rate": 6.839753155505813e-07, "loss": 0.1031, "step": 40680 }, { "epoch": 0.9487336422721576, "grad_norm": 1.3035640716552734, "learning_rate": 6.838975937325126e-07, "loss": 0.1095, "step": 40690 }, { "epoch": 0.9489668036489756, "grad_norm": 1.5855317115783691, "learning_rate": 6.838198719144437e-07, "loss": 0.109, "step": 40700 }, { "epoch": 0.9491999650257935, "grad_norm": 1.6774013042449951, "learning_rate": 6.83742150096375e-07, "loss": 0.1223, "step": 40710 }, { "epoch": 0.9494331264026115, "grad_norm": 2.3125932216644287, "learning_rate": 6.836644282783063e-07, "loss": 0.1102, "step": 40720 }, { "epoch": 0.9496662877794293, "grad_norm": 1.609035849571228, "learning_rate": 6.835867064602375e-07, "loss": 0.1169, "step": 40730 }, { "epoch": 0.9498994491562472, "grad_norm": 3.7172064781188965, "learning_rate": 6.835089846421688e-07, "loss": 0.1264, "step": 40740 }, { "epoch": 0.9501326105330652, "grad_norm": 2.188364028930664, "learning_rate": 6.834312628241e-07, "loss": 0.1067, "step": 40750 }, { "epoch": 0.9503657719098831, "grad_norm": 1.654518485069275, "learning_rate": 6.833535410060311e-07, "loss": 0.1227, "step": 40760 }, { "epoch": 0.9505989332867011, "grad_norm": 1.412645697593689, "learning_rate": 6.832758191879624e-07, "loss": 0.1234, "step": 40770 }, { "epoch": 0.950832094663519, "grad_norm": 1.9248952865600586, "learning_rate": 6.831980973698936e-07, "loss": 0.116, "step": 40780 }, { "epoch": 0.9510652560403369, "grad_norm": 2.6797873973846436, 
"learning_rate": 6.831203755518249e-07, "loss": 0.1058, "step": 40790 }, { "epoch": 0.9512984174171548, "grad_norm": 3.5488133430480957, "learning_rate": 6.830426537337562e-07, "loss": 0.1237, "step": 40800 }, { "epoch": 0.9515315787939728, "grad_norm": 2.026057481765747, "learning_rate": 6.829649319156874e-07, "loss": 0.1231, "step": 40810 }, { "epoch": 0.9517647401707907, "grad_norm": 1.2132141590118408, "learning_rate": 6.828872100976186e-07, "loss": 0.128, "step": 40820 }, { "epoch": 0.9519979015476087, "grad_norm": 1.2953654527664185, "learning_rate": 6.828094882795497e-07, "loss": 0.1127, "step": 40830 }, { "epoch": 0.9522310629244266, "grad_norm": 1.4601386785507202, "learning_rate": 6.82731766461481e-07, "loss": 0.1168, "step": 40840 }, { "epoch": 0.9524642243012444, "grad_norm": 1.8189142942428589, "learning_rate": 6.826540446434123e-07, "loss": 0.1227, "step": 40850 }, { "epoch": 0.9526973856780624, "grad_norm": 3.1157166957855225, "learning_rate": 6.825763228253435e-07, "loss": 0.1129, "step": 40860 }, { "epoch": 0.9529305470548803, "grad_norm": 2.2680840492248535, "learning_rate": 6.824986010072748e-07, "loss": 0.1142, "step": 40870 }, { "epoch": 0.9531637084316983, "grad_norm": 1.363818883895874, "learning_rate": 6.82420879189206e-07, "loss": 0.1144, "step": 40880 }, { "epoch": 0.9533968698085162, "grad_norm": 1.359098196029663, "learning_rate": 6.823431573711372e-07, "loss": 0.126, "step": 40890 }, { "epoch": 0.9536300311853342, "grad_norm": 1.2239010334014893, "learning_rate": 6.822654355530685e-07, "loss": 0.1116, "step": 40900 }, { "epoch": 0.9538631925621521, "grad_norm": 3.1912691593170166, "learning_rate": 6.821877137349996e-07, "loss": 0.1087, "step": 40910 }, { "epoch": 0.95409635393897, "grad_norm": 1.353097677230835, "learning_rate": 6.821099919169309e-07, "loss": 0.1101, "step": 40920 }, { "epoch": 0.9543295153157879, "grad_norm": 4.166326522827148, "learning_rate": 6.820322700988621e-07, "loss": 0.1209, "step": 40930 }, { "epoch": 
0.9545626766926059, "grad_norm": 1.9969825744628906, "learning_rate": 6.819545482807934e-07, "loss": 0.1047, "step": 40940 }, { "epoch": 0.9547958380694238, "grad_norm": 2.54467511177063, "learning_rate": 6.818768264627246e-07, "loss": 0.1124, "step": 40950 }, { "epoch": 0.9550289994462418, "grad_norm": 2.2815752029418945, "learning_rate": 6.817991046446558e-07, "loss": 0.1219, "step": 40960 }, { "epoch": 0.9552621608230597, "grad_norm": 1.5475733280181885, "learning_rate": 6.817213828265871e-07, "loss": 0.1142, "step": 40970 }, { "epoch": 0.9554953221998775, "grad_norm": 4.350159168243408, "learning_rate": 6.816436610085183e-07, "loss": 0.106, "step": 40980 }, { "epoch": 0.9557284835766955, "grad_norm": 2.4431653022766113, "learning_rate": 6.815659391904495e-07, "loss": 0.1134, "step": 40990 }, { "epoch": 0.9559616449535134, "grad_norm": 2.7125141620635986, "learning_rate": 6.814882173723808e-07, "loss": 0.113, "step": 41000 }, { "epoch": 0.9561948063303314, "grad_norm": 1.4104571342468262, "learning_rate": 6.814104955543119e-07, "loss": 0.1256, "step": 41010 }, { "epoch": 0.9564279677071493, "grad_norm": 1.398072361946106, "learning_rate": 6.813327737362432e-07, "loss": 0.1171, "step": 41020 }, { "epoch": 0.9566611290839673, "grad_norm": 4.3370232582092285, "learning_rate": 6.812550519181744e-07, "loss": 0.1138, "step": 41030 }, { "epoch": 0.9568942904607852, "grad_norm": 1.5378869771957397, "learning_rate": 6.811773301001057e-07, "loss": 0.1274, "step": 41040 }, { "epoch": 0.9571274518376031, "grad_norm": 3.9943439960479736, "learning_rate": 6.81099608282037e-07, "loss": 0.118, "step": 41050 }, { "epoch": 0.957360613214421, "grad_norm": 2.183445453643799, "learning_rate": 6.810218864639682e-07, "loss": 0.1195, "step": 41060 }, { "epoch": 0.957593774591239, "grad_norm": 1.1985739469528198, "learning_rate": 6.809441646458993e-07, "loss": 0.1185, "step": 41070 }, { "epoch": 0.9578269359680569, "grad_norm": 2.636791467666626, "learning_rate": 6.808664428278305e-07, 
"loss": 0.1149, "step": 41080 }, { "epoch": 0.9580600973448749, "grad_norm": 3.2678773403167725, "learning_rate": 6.807887210097618e-07, "loss": 0.1229, "step": 41090 }, { "epoch": 0.9582932587216928, "grad_norm": 2.1364784240722656, "learning_rate": 6.807109991916931e-07, "loss": 0.1069, "step": 41100 }, { "epoch": 0.9585264200985106, "grad_norm": 1.2639470100402832, "learning_rate": 6.806332773736243e-07, "loss": 0.1228, "step": 41110 }, { "epoch": 0.9587595814753286, "grad_norm": 2.8669419288635254, "learning_rate": 6.805555555555556e-07, "loss": 0.1187, "step": 41120 }, { "epoch": 0.9589927428521465, "grad_norm": 3.545149087905884, "learning_rate": 6.804778337374867e-07, "loss": 0.1103, "step": 41130 }, { "epoch": 0.9592259042289645, "grad_norm": 1.7246603965759277, "learning_rate": 6.80400111919418e-07, "loss": 0.1164, "step": 41140 }, { "epoch": 0.9594590656057824, "grad_norm": 5.942121982574463, "learning_rate": 6.803223901013492e-07, "loss": 0.1087, "step": 41150 }, { "epoch": 0.9596922269826004, "grad_norm": 1.3265442848205566, "learning_rate": 6.802446682832804e-07, "loss": 0.1096, "step": 41160 }, { "epoch": 0.9599253883594182, "grad_norm": 2.7868711948394775, "learning_rate": 6.801669464652117e-07, "loss": 0.1291, "step": 41170 }, { "epoch": 0.9601585497362362, "grad_norm": 1.9976626634597778, "learning_rate": 6.80089224647143e-07, "loss": 0.1151, "step": 41180 }, { "epoch": 0.9603917111130541, "grad_norm": 1.0583739280700684, "learning_rate": 6.800115028290741e-07, "loss": 0.1109, "step": 41190 }, { "epoch": 0.960624872489872, "grad_norm": 1.4006720781326294, "learning_rate": 6.799337810110054e-07, "loss": 0.1179, "step": 41200 }, { "epoch": 0.96085803386669, "grad_norm": 1.4114259481430054, "learning_rate": 6.798560591929366e-07, "loss": 0.109, "step": 41210 }, { "epoch": 0.961091195243508, "grad_norm": 1.0119166374206543, "learning_rate": 6.797783373748679e-07, "loss": 0.1193, "step": 41220 }, { "epoch": 0.9613243566203259, "grad_norm": 
1.5525383949279785, "learning_rate": 6.79700615556799e-07, "loss": 0.1206, "step": 41230 }, { "epoch": 0.9615575179971437, "grad_norm": 1.8728642463684082, "learning_rate": 6.796228937387303e-07, "loss": 0.1222, "step": 41240 }, { "epoch": 0.9617906793739617, "grad_norm": 1.9130691289901733, "learning_rate": 6.795451719206615e-07, "loss": 0.1184, "step": 41250 }, { "epoch": 0.9620238407507796, "grad_norm": 1.8437342643737793, "learning_rate": 6.794674501025927e-07, "loss": 0.114, "step": 41260 }, { "epoch": 0.9622570021275976, "grad_norm": 1.1885154247283936, "learning_rate": 6.79389728284524e-07, "loss": 0.1144, "step": 41270 }, { "epoch": 0.9624901635044155, "grad_norm": 1.3028279542922974, "learning_rate": 6.793120064664553e-07, "loss": 0.1122, "step": 41280 }, { "epoch": 0.9627233248812335, "grad_norm": 1.5443555116653442, "learning_rate": 6.792342846483865e-07, "loss": 0.1155, "step": 41290 }, { "epoch": 0.9629564862580513, "grad_norm": 1.3592896461486816, "learning_rate": 6.791565628303178e-07, "loss": 0.1158, "step": 41300 }, { "epoch": 0.9631896476348693, "grad_norm": 1.769754409790039, "learning_rate": 6.790788410122488e-07, "loss": 0.1124, "step": 41310 }, { "epoch": 0.9634228090116872, "grad_norm": 1.3290964365005493, "learning_rate": 6.790011191941801e-07, "loss": 0.1073, "step": 41320 }, { "epoch": 0.9636559703885051, "grad_norm": 1.1638990640640259, "learning_rate": 6.789233973761114e-07, "loss": 0.1077, "step": 41330 }, { "epoch": 0.9638891317653231, "grad_norm": 1.54939866065979, "learning_rate": 6.788456755580426e-07, "loss": 0.1157, "step": 41340 }, { "epoch": 0.964122293142141, "grad_norm": 1.601948618888855, "learning_rate": 6.787679537399739e-07, "loss": 0.1085, "step": 41350 }, { "epoch": 0.964355454518959, "grad_norm": 1.4308416843414307, "learning_rate": 6.786902319219051e-07, "loss": 0.1205, "step": 41360 }, { "epoch": 0.9645886158957768, "grad_norm": 2.179504632949829, "learning_rate": 6.786125101038364e-07, "loss": 0.1078, "step": 41370 
}, { "epoch": 0.9648217772725948, "grad_norm": 3.911921262741089, "learning_rate": 6.785347882857676e-07, "loss": 0.1289, "step": 41380 }, { "epoch": 0.9650549386494127, "grad_norm": 1.6694304943084717, "learning_rate": 6.784570664676987e-07, "loss": 0.1223, "step": 41390 }, { "epoch": 0.9652881000262307, "grad_norm": 2.008049726486206, "learning_rate": 6.7837934464963e-07, "loss": 0.1057, "step": 41400 }, { "epoch": 0.9655212614030486, "grad_norm": 2.3828682899475098, "learning_rate": 6.783016228315612e-07, "loss": 0.1119, "step": 41410 }, { "epoch": 0.9657544227798666, "grad_norm": 2.590627908706665, "learning_rate": 6.782239010134925e-07, "loss": 0.1053, "step": 41420 }, { "epoch": 0.9659875841566844, "grad_norm": 1.1349180936813354, "learning_rate": 6.781461791954238e-07, "loss": 0.107, "step": 41430 }, { "epoch": 0.9662207455335023, "grad_norm": 3.6551995277404785, "learning_rate": 6.780684573773549e-07, "loss": 0.1047, "step": 41440 }, { "epoch": 0.9664539069103203, "grad_norm": 1.4692983627319336, "learning_rate": 6.779907355592862e-07, "loss": 0.119, "step": 41450 }, { "epoch": 0.9666870682871382, "grad_norm": 1.6507073640823364, "learning_rate": 6.779130137412174e-07, "loss": 0.1341, "step": 41460 }, { "epoch": 0.9669202296639562, "grad_norm": 1.4605212211608887, "learning_rate": 6.778352919231486e-07, "loss": 0.1156, "step": 41470 }, { "epoch": 0.9671533910407741, "grad_norm": 1.4781367778778076, "learning_rate": 6.777575701050799e-07, "loss": 0.1197, "step": 41480 }, { "epoch": 0.967386552417592, "grad_norm": 1.8348321914672852, "learning_rate": 6.776798482870111e-07, "loss": 0.1128, "step": 41490 }, { "epoch": 0.9676197137944099, "grad_norm": 2.9016916751861572, "learning_rate": 6.776021264689423e-07, "loss": 0.1087, "step": 41500 }, { "epoch": 0.9678528751712279, "grad_norm": 3.5373117923736572, "learning_rate": 6.775244046508735e-07, "loss": 0.1184, "step": 41510 }, { "epoch": 0.9680860365480458, "grad_norm": 2.5050127506256104, "learning_rate": 
6.774466828328048e-07, "loss": 0.125, "step": 41520 }, { "epoch": 0.9683191979248638, "grad_norm": 1.02182936668396, "learning_rate": 6.773689610147361e-07, "loss": 0.1151, "step": 41530 }, { "epoch": 0.9685523593016817, "grad_norm": 1.320731520652771, "learning_rate": 6.772912391966673e-07, "loss": 0.1205, "step": 41540 }, { "epoch": 0.9687855206784997, "grad_norm": 1.975958228111267, "learning_rate": 6.772135173785985e-07, "loss": 0.1103, "step": 41550 }, { "epoch": 0.9690186820553175, "grad_norm": 1.2993227243423462, "learning_rate": 6.771357955605296e-07, "loss": 0.1156, "step": 41560 }, { "epoch": 0.9692518434321354, "grad_norm": 1.4767075777053833, "learning_rate": 6.770580737424609e-07, "loss": 0.1269, "step": 41570 }, { "epoch": 0.9694850048089534, "grad_norm": 1.3562591075897217, "learning_rate": 6.769803519243922e-07, "loss": 0.1125, "step": 41580 }, { "epoch": 0.9697181661857713, "grad_norm": 1.0707364082336426, "learning_rate": 6.769026301063234e-07, "loss": 0.1145, "step": 41590 }, { "epoch": 0.9699513275625893, "grad_norm": 1.8915642499923706, "learning_rate": 6.768249082882547e-07, "loss": 0.1169, "step": 41600 }, { "epoch": 0.9701844889394072, "grad_norm": 2.657057523727417, "learning_rate": 6.76747186470186e-07, "loss": 0.1159, "step": 41610 }, { "epoch": 0.9704176503162251, "grad_norm": 1.6288403272628784, "learning_rate": 6.766694646521171e-07, "loss": 0.1161, "step": 41620 }, { "epoch": 0.970650811693043, "grad_norm": 4.2083048820495605, "learning_rate": 6.765917428340483e-07, "loss": 0.1125, "step": 41630 }, { "epoch": 0.970883973069861, "grad_norm": 3.0629193782806396, "learning_rate": 6.765140210159795e-07, "loss": 0.1238, "step": 41640 }, { "epoch": 0.9711171344466789, "grad_norm": 2.9604196548461914, "learning_rate": 6.764362991979108e-07, "loss": 0.115, "step": 41650 }, { "epoch": 0.9713502958234969, "grad_norm": 1.1472073793411255, "learning_rate": 6.763585773798421e-07, "loss": 0.1192, "step": 41660 }, { "epoch": 0.9715834572003148, 
"grad_norm": 1.3732420206069946, "learning_rate": 6.762808555617733e-07, "loss": 0.1185, "step": 41670 }, { "epoch": 0.9718166185771328, "grad_norm": 1.35305917263031, "learning_rate": 6.762031337437045e-07, "loss": 0.123, "step": 41680 }, { "epoch": 0.9720497799539506, "grad_norm": 1.1686170101165771, "learning_rate": 6.761254119256357e-07, "loss": 0.1151, "step": 41690 }, { "epoch": 0.9722829413307685, "grad_norm": 1.120723009109497, "learning_rate": 6.76047690107567e-07, "loss": 0.1079, "step": 41700 }, { "epoch": 0.9725161027075865, "grad_norm": 1.8351547718048096, "learning_rate": 6.759699682894982e-07, "loss": 0.1101, "step": 41710 }, { "epoch": 0.9727492640844044, "grad_norm": 1.98857581615448, "learning_rate": 6.758922464714294e-07, "loss": 0.1265, "step": 41720 }, { "epoch": 0.9729824254612224, "grad_norm": 1.428112268447876, "learning_rate": 6.758145246533607e-07, "loss": 0.1132, "step": 41730 }, { "epoch": 0.9732155868380403, "grad_norm": 1.4222900867462158, "learning_rate": 6.757368028352918e-07, "loss": 0.1212, "step": 41740 }, { "epoch": 0.9734487482148582, "grad_norm": 2.9182512760162354, "learning_rate": 6.756590810172231e-07, "loss": 0.1027, "step": 41750 }, { "epoch": 0.9736819095916761, "grad_norm": 1.2212326526641846, "learning_rate": 6.755813591991544e-07, "loss": 0.1157, "step": 41760 }, { "epoch": 0.9739150709684941, "grad_norm": 2.250977039337158, "learning_rate": 6.755036373810856e-07, "loss": 0.1107, "step": 41770 }, { "epoch": 0.974148232345312, "grad_norm": 1.257788062095642, "learning_rate": 6.754259155630169e-07, "loss": 0.122, "step": 41780 }, { "epoch": 0.97438139372213, "grad_norm": 1.5213379859924316, "learning_rate": 6.75348193744948e-07, "loss": 0.1156, "step": 41790 }, { "epoch": 0.9746145550989479, "grad_norm": 1.9401631355285645, "learning_rate": 6.752704719268793e-07, "loss": 0.1134, "step": 41800 }, { "epoch": 0.9748477164757657, "grad_norm": 3.7433600425720215, "learning_rate": 6.751927501088105e-07, "loss": 0.1062, "step": 
41810 }, { "epoch": 0.9750808778525837, "grad_norm": 2.1913814544677734, "learning_rate": 6.751150282907417e-07, "loss": 0.1271, "step": 41820 }, { "epoch": 0.9753140392294016, "grad_norm": 2.76709246635437, "learning_rate": 6.75037306472673e-07, "loss": 0.113, "step": 41830 }, { "epoch": 0.9755472006062196, "grad_norm": 2.0878477096557617, "learning_rate": 6.749595846546042e-07, "loss": 0.1145, "step": 41840 }, { "epoch": 0.9757803619830375, "grad_norm": 1.3980005979537964, "learning_rate": 6.748818628365355e-07, "loss": 0.1221, "step": 41850 }, { "epoch": 0.9760135233598555, "grad_norm": 2.975081205368042, "learning_rate": 6.748041410184668e-07, "loss": 0.1242, "step": 41860 }, { "epoch": 0.9762466847366734, "grad_norm": 1.595451831817627, "learning_rate": 6.747264192003978e-07, "loss": 0.1249, "step": 41870 }, { "epoch": 0.9764798461134913, "grad_norm": 1.3351380825042725, "learning_rate": 6.746486973823291e-07, "loss": 0.1093, "step": 41880 }, { "epoch": 0.9767130074903092, "grad_norm": 2.059688091278076, "learning_rate": 6.745709755642603e-07, "loss": 0.1094, "step": 41890 }, { "epoch": 0.9769461688671272, "grad_norm": 1.1404783725738525, "learning_rate": 6.744932537461916e-07, "loss": 0.1223, "step": 41900 }, { "epoch": 0.9771793302439451, "grad_norm": 1.5165886878967285, "learning_rate": 6.744155319281229e-07, "loss": 0.1046, "step": 41910 }, { "epoch": 0.977412491620763, "grad_norm": 1.5962952375411987, "learning_rate": 6.743378101100541e-07, "loss": 0.1131, "step": 41920 }, { "epoch": 0.977645652997581, "grad_norm": 2.594804525375366, "learning_rate": 6.742600882919853e-07, "loss": 0.1073, "step": 41930 }, { "epoch": 0.9778788143743988, "grad_norm": 1.4882513284683228, "learning_rate": 6.741823664739165e-07, "loss": 0.1186, "step": 41940 }, { "epoch": 0.9781119757512168, "grad_norm": 1.3042584657669067, "learning_rate": 6.741046446558477e-07, "loss": 0.1191, "step": 41950 }, { "epoch": 0.9783451371280347, "grad_norm": 2.2409539222717285, "learning_rate": 
6.74026922837779e-07, "loss": 0.1126, "step": 41960 }, { "epoch": 0.9785782985048527, "grad_norm": 1.3268349170684814, "learning_rate": 6.739492010197102e-07, "loss": 0.1172, "step": 41970 }, { "epoch": 0.9788114598816706, "grad_norm": 3.1163060665130615, "learning_rate": 6.738714792016415e-07, "loss": 0.1131, "step": 41980 }, { "epoch": 0.9790446212584886, "grad_norm": 2.9339911937713623, "learning_rate": 6.737937573835726e-07, "loss": 0.1129, "step": 41990 }, { "epoch": 0.9792777826353065, "grad_norm": 1.9665155410766602, "learning_rate": 6.737160355655039e-07, "loss": 0.1043, "step": 42000 }, { "epoch": 0.9795109440121244, "grad_norm": 1.5201756954193115, "learning_rate": 6.736383137474352e-07, "loss": 0.1231, "step": 42010 }, { "epoch": 0.9797441053889423, "grad_norm": 1.7823455333709717, "learning_rate": 6.735605919293664e-07, "loss": 0.1088, "step": 42020 }, { "epoch": 0.9799772667657602, "grad_norm": 1.5063623189926147, "learning_rate": 6.734828701112976e-07, "loss": 0.1023, "step": 42030 }, { "epoch": 0.9802104281425782, "grad_norm": 1.102889060974121, "learning_rate": 6.734051482932289e-07, "loss": 0.1079, "step": 42040 }, { "epoch": 0.9804435895193961, "grad_norm": 2.0729153156280518, "learning_rate": 6.7332742647516e-07, "loss": 0.1016, "step": 42050 }, { "epoch": 0.9806767508962141, "grad_norm": 1.3170729875564575, "learning_rate": 6.732497046570913e-07, "loss": 0.11, "step": 42060 }, { "epoch": 0.9809099122730319, "grad_norm": 2.999279499053955, "learning_rate": 6.731719828390225e-07, "loss": 0.0947, "step": 42070 }, { "epoch": 0.9811430736498499, "grad_norm": 3.352458953857422, "learning_rate": 6.730942610209538e-07, "loss": 0.1206, "step": 42080 }, { "epoch": 0.9813762350266678, "grad_norm": 1.8363357782363892, "learning_rate": 6.730165392028851e-07, "loss": 0.121, "step": 42090 }, { "epoch": 0.9816093964034858, "grad_norm": 1.5879404544830322, "learning_rate": 6.729388173848163e-07, "loss": 0.1062, "step": 42100 }, { "epoch": 0.9818425577803037, 
"grad_norm": 1.402471899986267, "learning_rate": 6.728610955667474e-07, "loss": 0.1058, "step": 42110 }, { "epoch": 0.9820757191571217, "grad_norm": 2.2627034187316895, "learning_rate": 6.727833737486786e-07, "loss": 0.1189, "step": 42120 }, { "epoch": 0.9823088805339395, "grad_norm": 3.3724396228790283, "learning_rate": 6.727056519306099e-07, "loss": 0.1162, "step": 42130 }, { "epoch": 0.9825420419107574, "grad_norm": 2.3853416442871094, "learning_rate": 6.726279301125412e-07, "loss": 0.1266, "step": 42140 }, { "epoch": 0.9827752032875754, "grad_norm": 2.5791661739349365, "learning_rate": 6.725502082944724e-07, "loss": 0.1069, "step": 42150 }, { "epoch": 0.9830083646643933, "grad_norm": 1.4136332273483276, "learning_rate": 6.724724864764037e-07, "loss": 0.1101, "step": 42160 }, { "epoch": 0.9832415260412113, "grad_norm": 1.1908841133117676, "learning_rate": 6.723947646583349e-07, "loss": 0.1092, "step": 42170 }, { "epoch": 0.9834746874180292, "grad_norm": 1.379057765007019, "learning_rate": 6.723170428402661e-07, "loss": 0.1058, "step": 42180 }, { "epoch": 0.9837078487948472, "grad_norm": 2.5770487785339355, "learning_rate": 6.722393210221973e-07, "loss": 0.1203, "step": 42190 }, { "epoch": 0.983941010171665, "grad_norm": 1.3159617185592651, "learning_rate": 6.721615992041285e-07, "loss": 0.1175, "step": 42200 }, { "epoch": 0.984174171548483, "grad_norm": 2.5069942474365234, "learning_rate": 6.720838773860598e-07, "loss": 0.1194, "step": 42210 }, { "epoch": 0.9844073329253009, "grad_norm": 2.3529956340789795, "learning_rate": 6.72006155567991e-07, "loss": 0.1126, "step": 42220 }, { "epoch": 0.9846404943021189, "grad_norm": 1.3967995643615723, "learning_rate": 6.719284337499223e-07, "loss": 0.1196, "step": 42230 }, { "epoch": 0.9848736556789368, "grad_norm": 2.538994312286377, "learning_rate": 6.718507119318535e-07, "loss": 0.114, "step": 42240 }, { "epoch": 0.9851068170557548, "grad_norm": 1.3798116445541382, "learning_rate": 6.717729901137847e-07, "loss": 0.1204, 
"step": 42250 }, { "epoch": 0.9853399784325726, "grad_norm": 1.9617705345153809, "learning_rate": 6.71695268295716e-07, "loss": 0.1046, "step": 42260 }, { "epoch": 0.9855731398093905, "grad_norm": 1.2248812913894653, "learning_rate": 6.716175464776471e-07, "loss": 0.1141, "step": 42270 }, { "epoch": 0.9858063011862085, "grad_norm": 1.28922438621521, "learning_rate": 6.715398246595784e-07, "loss": 0.1203, "step": 42280 }, { "epoch": 0.9860394625630264, "grad_norm": 2.940340757369995, "learning_rate": 6.714621028415097e-07, "loss": 0.1117, "step": 42290 }, { "epoch": 0.9862726239398444, "grad_norm": 2.145414113998413, "learning_rate": 6.713843810234408e-07, "loss": 0.1205, "step": 42300 }, { "epoch": 0.9865057853166623, "grad_norm": 3.083158254623413, "learning_rate": 6.713066592053721e-07, "loss": 0.1158, "step": 42310 }, { "epoch": 0.9867389466934803, "grad_norm": 1.3157163858413696, "learning_rate": 6.712289373873033e-07, "loss": 0.1184, "step": 42320 }, { "epoch": 0.9869721080702981, "grad_norm": 1.7824070453643799, "learning_rate": 6.711512155692346e-07, "loss": 0.1035, "step": 42330 }, { "epoch": 0.9872052694471161, "grad_norm": 1.7012364864349365, "learning_rate": 6.710734937511659e-07, "loss": 0.1041, "step": 42340 }, { "epoch": 0.987438430823934, "grad_norm": 1.1842843294143677, "learning_rate": 6.70995771933097e-07, "loss": 0.1085, "step": 42350 }, { "epoch": 0.987671592200752, "grad_norm": 4.514113903045654, "learning_rate": 6.709180501150282e-07, "loss": 0.1151, "step": 42360 }, { "epoch": 0.9879047535775699, "grad_norm": 2.760007858276367, "learning_rate": 6.708403282969594e-07, "loss": 0.118, "step": 42370 }, { "epoch": 0.9881379149543879, "grad_norm": 1.7272344827651978, "learning_rate": 6.707626064788907e-07, "loss": 0.1077, "step": 42380 }, { "epoch": 0.9883710763312057, "grad_norm": 2.1141343116760254, "learning_rate": 6.70684884660822e-07, "loss": 0.1086, "step": 42390 }, { "epoch": 0.9886042377080236, "grad_norm": 1.28755521774292, 
"learning_rate": 6.706071628427532e-07, "loss": 0.0936, "step": 42400 }, { "epoch": 0.9888373990848416, "grad_norm": 1.2703933715820312, "learning_rate": 6.705294410246845e-07, "loss": 0.1091, "step": 42410 }, { "epoch": 0.9890705604616595, "grad_norm": 3.042788028717041, "learning_rate": 6.704517192066156e-07, "loss": 0.1203, "step": 42420 }, { "epoch": 0.9893037218384775, "grad_norm": 1.9172184467315674, "learning_rate": 6.703739973885468e-07, "loss": 0.1138, "step": 42430 }, { "epoch": 0.9895368832152954, "grad_norm": 1.1965126991271973, "learning_rate": 6.702962755704781e-07, "loss": 0.1264, "step": 42440 }, { "epoch": 0.9897700445921133, "grad_norm": 1.693169116973877, "learning_rate": 6.702185537524093e-07, "loss": 0.1208, "step": 42450 }, { "epoch": 0.9900032059689312, "grad_norm": 1.6743724346160889, "learning_rate": 6.701408319343406e-07, "loss": 0.1144, "step": 42460 }, { "epoch": 0.9902363673457492, "grad_norm": 1.3277995586395264, "learning_rate": 6.700631101162719e-07, "loss": 0.1088, "step": 42470 }, { "epoch": 0.9904695287225671, "grad_norm": 1.4600201845169067, "learning_rate": 6.69985388298203e-07, "loss": 0.1167, "step": 42480 }, { "epoch": 0.990702690099385, "grad_norm": 1.4937453269958496, "learning_rate": 6.699076664801343e-07, "loss": 0.1224, "step": 42490 }, { "epoch": 0.990935851476203, "grad_norm": 1.4809452295303345, "learning_rate": 6.698299446620655e-07, "loss": 0.1136, "step": 42500 }, { "epoch": 0.991169012853021, "grad_norm": 1.3513506650924683, "learning_rate": 6.697522228439967e-07, "loss": 0.1096, "step": 42510 }, { "epoch": 0.9914021742298388, "grad_norm": 1.5317589044570923, "learning_rate": 6.69674501025928e-07, "loss": 0.1251, "step": 42520 }, { "epoch": 0.9916353356066567, "grad_norm": 1.3405612707138062, "learning_rate": 6.695967792078592e-07, "loss": 0.1145, "step": 42530 }, { "epoch": 0.9918684969834747, "grad_norm": 2.292470693588257, "learning_rate": 6.695190573897904e-07, "loss": 0.1174, "step": 42540 }, { "epoch": 
0.9921016583602926, "grad_norm": 1.636466145515442, "learning_rate": 6.694413355717216e-07, "loss": 0.1164, "step": 42550 }, { "epoch": 0.9923348197371106, "grad_norm": 1.845940351486206, "learning_rate": 6.693636137536529e-07, "loss": 0.1066, "step": 42560 }, { "epoch": 0.9925679811139285, "grad_norm": 1.3995521068572998, "learning_rate": 6.692858919355842e-07, "loss": 0.11, "step": 42570 }, { "epoch": 0.9928011424907464, "grad_norm": 2.102052927017212, "learning_rate": 6.692081701175154e-07, "loss": 0.1156, "step": 42580 }, { "epoch": 0.9930343038675643, "grad_norm": 1.264116644859314, "learning_rate": 6.691304482994466e-07, "loss": 0.1202, "step": 42590 }, { "epoch": 0.9932674652443823, "grad_norm": 2.9574618339538574, "learning_rate": 6.690527264813777e-07, "loss": 0.1206, "step": 42600 }, { "epoch": 0.9935006266212002, "grad_norm": 1.3909026384353638, "learning_rate": 6.68975004663309e-07, "loss": 0.1113, "step": 42610 }, { "epoch": 0.9937337879980181, "grad_norm": 1.2650809288024902, "learning_rate": 6.688972828452403e-07, "loss": 0.1179, "step": 42620 }, { "epoch": 0.9939669493748361, "grad_norm": 1.9800901412963867, "learning_rate": 6.688195610271715e-07, "loss": 0.1103, "step": 42630 }, { "epoch": 0.994200110751654, "grad_norm": 1.286891222000122, "learning_rate": 6.687418392091028e-07, "loss": 0.1247, "step": 42640 }, { "epoch": 0.9944332721284719, "grad_norm": 1.9845548868179321, "learning_rate": 6.68664117391034e-07, "loss": 0.1299, "step": 42650 }, { "epoch": 0.9946664335052898, "grad_norm": 2.340146064758301, "learning_rate": 6.685863955729653e-07, "loss": 0.1139, "step": 42660 }, { "epoch": 0.9948995948821078, "grad_norm": 2.7770748138427734, "learning_rate": 6.685086737548964e-07, "loss": 0.1214, "step": 42670 }, { "epoch": 0.9951327562589257, "grad_norm": 1.922062873840332, "learning_rate": 6.684309519368276e-07, "loss": 0.1148, "step": 42680 }, { "epoch": 0.9953659176357437, "grad_norm": 1.5351817607879639, "learning_rate": 6.683532301187589e-07, 
"loss": 0.1108, "step": 42690 }, { "epoch": 0.9955990790125616, "grad_norm": 1.5464253425598145, "learning_rate": 6.682755083006901e-07, "loss": 0.1155, "step": 42700 }, { "epoch": 0.9958322403893795, "grad_norm": 1.3427789211273193, "learning_rate": 6.681977864826214e-07, "loss": 0.1196, "step": 42710 }, { "epoch": 0.9960654017661974, "grad_norm": 1.0989177227020264, "learning_rate": 6.681200646645527e-07, "loss": 0.1171, "step": 42720 }, { "epoch": 0.9962985631430153, "grad_norm": 2.3308229446411133, "learning_rate": 6.680423428464838e-07, "loss": 0.1274, "step": 42730 }, { "epoch": 0.9965317245198333, "grad_norm": 1.673357367515564, "learning_rate": 6.679646210284151e-07, "loss": 0.1162, "step": 42740 }, { "epoch": 0.9967648858966512, "grad_norm": 2.24039626121521, "learning_rate": 6.678868992103462e-07, "loss": 0.1132, "step": 42750 }, { "epoch": 0.9969980472734692, "grad_norm": 2.326784372329712, "learning_rate": 6.678091773922775e-07, "loss": 0.1166, "step": 42760 }, { "epoch": 0.997231208650287, "grad_norm": 1.2546441555023193, "learning_rate": 6.677314555742088e-07, "loss": 0.1145, "step": 42770 }, { "epoch": 0.997464370027105, "grad_norm": 1.5087093114852905, "learning_rate": 6.6765373375614e-07, "loss": 0.109, "step": 42780 }, { "epoch": 0.9976975314039229, "grad_norm": 3.7459962368011475, "learning_rate": 6.675760119380712e-07, "loss": 0.1237, "step": 42790 }, { "epoch": 0.9979306927807409, "grad_norm": 1.5329738855361938, "learning_rate": 6.674982901200024e-07, "loss": 0.1057, "step": 42800 }, { "epoch": 0.9981638541575588, "grad_norm": 1.7229368686676025, "learning_rate": 6.674205683019337e-07, "loss": 0.1101, "step": 42810 }, { "epoch": 0.9983970155343768, "grad_norm": 2.890960931777954, "learning_rate": 6.67342846483865e-07, "loss": 0.1051, "step": 42820 }, { "epoch": 0.9986301769111947, "grad_norm": 1.4441943168640137, "learning_rate": 6.672651246657961e-07, "loss": 0.1078, "step": 42830 }, { "epoch": 0.9988633382880125, "grad_norm": 
1.341977596282959, "learning_rate": 6.671874028477274e-07, "loss": 0.1129, "step": 42840 }, { "epoch": 0.9990964996648305, "grad_norm": 3.0293378829956055, "learning_rate": 6.671096810296585e-07, "loss": 0.1212, "step": 42850 }, { "epoch": 0.9993296610416484, "grad_norm": 1.9554506540298462, "learning_rate": 6.670319592115898e-07, "loss": 0.1194, "step": 42860 }, { "epoch": 0.9995628224184664, "grad_norm": 1.6708863973617554, "learning_rate": 6.669542373935211e-07, "loss": 0.1127, "step": 42870 }, { "epoch": 0.9997959837952843, "grad_norm": 1.0958200693130493, "learning_rate": 6.668765155754523e-07, "loss": 0.0994, "step": 42880 }, { "epoch": 1.0000466322753636, "grad_norm": 2.16703200340271, "learning_rate": 6.667987937573836e-07, "loss": 0.1135, "step": 42890 }, { "epoch": 1.0002797936521814, "grad_norm": 2.3639698028564453, "learning_rate": 6.667210719393149e-07, "loss": 0.1118, "step": 42900 }, { "epoch": 1.0005129550289995, "grad_norm": 2.8674240112304688, "learning_rate": 6.666433501212459e-07, "loss": 0.1145, "step": 42910 }, { "epoch": 1.0007461164058173, "grad_norm": 2.3955283164978027, "learning_rate": 6.665656283031772e-07, "loss": 0.1091, "step": 42920 }, { "epoch": 1.0009792777826354, "grad_norm": 2.105161666870117, "learning_rate": 6.664879064851084e-07, "loss": 0.1188, "step": 42930 }, { "epoch": 1.0012124391594532, "grad_norm": 1.5620112419128418, "learning_rate": 6.664101846670397e-07, "loss": 0.114, "step": 42940 }, { "epoch": 1.001445600536271, "grad_norm": 1.8348872661590576, "learning_rate": 6.66332462848971e-07, "loss": 0.11, "step": 42950 }, { "epoch": 1.0016787619130891, "grad_norm": 2.058525562286377, "learning_rate": 6.662547410309022e-07, "loss": 0.1151, "step": 42960 }, { "epoch": 1.001911923289907, "grad_norm": 1.7464038133621216, "learning_rate": 6.661770192128334e-07, "loss": 0.1194, "step": 42970 }, { "epoch": 1.002145084666725, "grad_norm": 1.5043333768844604, "learning_rate": 6.660992973947646e-07, "loss": 0.1039, "step": 42980 }, 
{ "epoch": 1.0023782460435429, "grad_norm": 1.8642804622650146, "learning_rate": 6.660215755766959e-07, "loss": 0.1166, "step": 42990 }, { "epoch": 1.002611407420361, "grad_norm": 2.1560580730438232, "learning_rate": 6.659438537586271e-07, "loss": 0.1141, "step": 43000 }, { "epoch": 1.0028445687971788, "grad_norm": 1.1910574436187744, "learning_rate": 6.658661319405583e-07, "loss": 0.1133, "step": 43010 }, { "epoch": 1.0030777301739966, "grad_norm": 1.3381682634353638, "learning_rate": 6.657884101224896e-07, "loss": 0.1087, "step": 43020 }, { "epoch": 1.0033108915508147, "grad_norm": 1.2544306516647339, "learning_rate": 6.657106883044207e-07, "loss": 0.1162, "step": 43030 }, { "epoch": 1.0035440529276325, "grad_norm": 3.999556541442871, "learning_rate": 6.65632966486352e-07, "loss": 0.1101, "step": 43040 }, { "epoch": 1.0037772143044505, "grad_norm": 3.798598051071167, "learning_rate": 6.655552446682833e-07, "loss": 0.106, "step": 43050 }, { "epoch": 1.0040103756812684, "grad_norm": 2.6403753757476807, "learning_rate": 6.654775228502145e-07, "loss": 0.1249, "step": 43060 }, { "epoch": 1.0042435370580862, "grad_norm": 1.2285839319229126, "learning_rate": 6.653998010321458e-07, "loss": 0.1148, "step": 43070 }, { "epoch": 1.0044766984349043, "grad_norm": 3.6402692794799805, "learning_rate": 6.653220792140769e-07, "loss": 0.1124, "step": 43080 }, { "epoch": 1.0047098598117221, "grad_norm": 1.0887529850006104, "learning_rate": 6.652443573960082e-07, "loss": 0.1099, "step": 43090 }, { "epoch": 1.0049430211885402, "grad_norm": 1.403088092803955, "learning_rate": 6.651666355779394e-07, "loss": 0.1095, "step": 43100 }, { "epoch": 1.005176182565358, "grad_norm": 3.170287609100342, "learning_rate": 6.650889137598706e-07, "loss": 0.1255, "step": 43110 }, { "epoch": 1.005409343942176, "grad_norm": 1.0789591073989868, "learning_rate": 6.650111919418019e-07, "loss": 0.1021, "step": 43120 }, { "epoch": 1.005642505318994, "grad_norm": 1.5988667011260986, "learning_rate": 
6.649334701237331e-07, "loss": 0.1029, "step": 43130 }, { "epoch": 1.0058756666958117, "grad_norm": 1.9302037954330444, "learning_rate": 6.648557483056644e-07, "loss": 0.1157, "step": 43140 }, { "epoch": 1.0061088280726298, "grad_norm": 1.752901554107666, "learning_rate": 6.647780264875957e-07, "loss": 0.1113, "step": 43150 }, { "epoch": 1.0063419894494476, "grad_norm": 1.5785280466079712, "learning_rate": 6.647003046695267e-07, "loss": 0.1154, "step": 43160 }, { "epoch": 1.0065751508262657, "grad_norm": 1.4504340887069702, "learning_rate": 6.64622582851458e-07, "loss": 0.121, "step": 43170 }, { "epoch": 1.0068083122030835, "grad_norm": 1.925700306892395, "learning_rate": 6.645448610333892e-07, "loss": 0.1229, "step": 43180 }, { "epoch": 1.0070414735799016, "grad_norm": 2.3092265129089355, "learning_rate": 6.644671392153205e-07, "loss": 0.1081, "step": 43190 }, { "epoch": 1.0072746349567194, "grad_norm": 1.5969338417053223, "learning_rate": 6.643894173972518e-07, "loss": 0.1133, "step": 43200 }, { "epoch": 1.0075077963335373, "grad_norm": 1.529039740562439, "learning_rate": 6.64311695579183e-07, "loss": 0.1239, "step": 43210 }, { "epoch": 1.0077409577103553, "grad_norm": 2.8323287963867188, "learning_rate": 6.642339737611142e-07, "loss": 0.1122, "step": 43220 }, { "epoch": 1.0079741190871732, "grad_norm": 1.5152794122695923, "learning_rate": 6.641562519430454e-07, "loss": 0.1196, "step": 43230 }, { "epoch": 1.0082072804639912, "grad_norm": 2.7120115756988525, "learning_rate": 6.640785301249766e-07, "loss": 0.1125, "step": 43240 }, { "epoch": 1.008440441840809, "grad_norm": 2.617842197418213, "learning_rate": 6.640008083069079e-07, "loss": 0.1214, "step": 43250 }, { "epoch": 1.008673603217627, "grad_norm": 1.5930954217910767, "learning_rate": 6.639230864888391e-07, "loss": 0.1256, "step": 43260 }, { "epoch": 1.008906764594445, "grad_norm": 1.2240915298461914, "learning_rate": 6.638453646707704e-07, "loss": 0.1064, "step": 43270 }, { "epoch": 1.0091399259712628, 
"grad_norm": 1.264346718788147, "learning_rate": 6.637676428527015e-07, "loss": 0.1054, "step": 43280 }, { "epoch": 1.0093730873480808, "grad_norm": 1.7120519876480103, "learning_rate": 6.636899210346328e-07, "loss": 0.1111, "step": 43290 }, { "epoch": 1.0096062487248987, "grad_norm": 1.468389868736267, "learning_rate": 6.636121992165641e-07, "loss": 0.1167, "step": 43300 }, { "epoch": 1.0098394101017167, "grad_norm": 4.424067974090576, "learning_rate": 6.635344773984953e-07, "loss": 0.1211, "step": 43310 }, { "epoch": 1.0100725714785346, "grad_norm": 3.1654930114746094, "learning_rate": 6.634567555804265e-07, "loss": 0.1066, "step": 43320 }, { "epoch": 1.0103057328553524, "grad_norm": 1.4744255542755127, "learning_rate": 6.633790337623578e-07, "loss": 0.1259, "step": 43330 }, { "epoch": 1.0105388942321705, "grad_norm": 1.9138826131820679, "learning_rate": 6.633013119442889e-07, "loss": 0.1047, "step": 43340 }, { "epoch": 1.0107720556089883, "grad_norm": 3.844572067260742, "learning_rate": 6.632235901262202e-07, "loss": 0.1084, "step": 43350 }, { "epoch": 1.0110052169858064, "grad_norm": 1.2919965982437134, "learning_rate": 6.631458683081514e-07, "loss": 0.1105, "step": 43360 }, { "epoch": 1.0112383783626242, "grad_norm": 2.280250310897827, "learning_rate": 6.630681464900827e-07, "loss": 0.1091, "step": 43370 }, { "epoch": 1.0114715397394423, "grad_norm": 1.1761999130249023, "learning_rate": 6.62990424672014e-07, "loss": 0.1117, "step": 43380 }, { "epoch": 1.01170470111626, "grad_norm": 3.8325417041778564, "learning_rate": 6.629127028539452e-07, "loss": 0.111, "step": 43390 }, { "epoch": 1.011937862493078, "grad_norm": 1.621169924736023, "learning_rate": 6.628349810358763e-07, "loss": 0.1037, "step": 43400 }, { "epoch": 1.012171023869896, "grad_norm": 1.8960325717926025, "learning_rate": 6.627572592178075e-07, "loss": 0.1145, "step": 43410 }, { "epoch": 1.0124041852467138, "grad_norm": 1.6462814807891846, "learning_rate": 6.626795373997388e-07, "loss": 0.1098, 
"step": 43420 }, { "epoch": 1.0126373466235319, "grad_norm": 1.4794871807098389, "learning_rate": 6.626018155816701e-07, "loss": 0.123, "step": 43430 }, { "epoch": 1.0128705080003497, "grad_norm": 1.2071435451507568, "learning_rate": 6.625240937636013e-07, "loss": 0.1069, "step": 43440 }, { "epoch": 1.0131036693771678, "grad_norm": 2.8961894512176514, "learning_rate": 6.624463719455326e-07, "loss": 0.1331, "step": 43450 }, { "epoch": 1.0133368307539856, "grad_norm": 1.8695087432861328, "learning_rate": 6.623686501274638e-07, "loss": 0.1173, "step": 43460 }, { "epoch": 1.0135699921308035, "grad_norm": 1.3139008283615112, "learning_rate": 6.62290928309395e-07, "loss": 0.119, "step": 43470 }, { "epoch": 1.0138031535076215, "grad_norm": 2.1521525382995605, "learning_rate": 6.622132064913262e-07, "loss": 0.1113, "step": 43480 }, { "epoch": 1.0140363148844393, "grad_norm": 1.4529484510421753, "learning_rate": 6.621354846732574e-07, "loss": 0.1216, "step": 43490 }, { "epoch": 1.0142694762612574, "grad_norm": 1.7166643142700195, "learning_rate": 6.620577628551887e-07, "loss": 0.1148, "step": 43500 }, { "epoch": 1.0145026376380752, "grad_norm": 1.4683150053024292, "learning_rate": 6.619800410371199e-07, "loss": 0.1105, "step": 43510 }, { "epoch": 1.014735799014893, "grad_norm": 1.5980366468429565, "learning_rate": 6.619023192190512e-07, "loss": 0.1096, "step": 43520 }, { "epoch": 1.0149689603917111, "grad_norm": 2.124129056930542, "learning_rate": 6.618245974009824e-07, "loss": 0.1157, "step": 43530 }, { "epoch": 1.015202121768529, "grad_norm": 1.9686585664749146, "learning_rate": 6.617468755829136e-07, "loss": 0.1043, "step": 43540 }, { "epoch": 1.015435283145347, "grad_norm": 1.5633944272994995, "learning_rate": 6.616691537648449e-07, "loss": 0.1055, "step": 43550 }, { "epoch": 1.0156684445221649, "grad_norm": 1.2662584781646729, "learning_rate": 6.61591431946776e-07, "loss": 0.1262, "step": 43560 }, { "epoch": 1.015901605898983, "grad_norm": 2.461249589920044, 
"learning_rate": 6.615137101287073e-07, "loss": 0.1071, "step": 43570 }, { "epoch": 1.0161347672758008, "grad_norm": 1.4553077220916748, "learning_rate": 6.614359883106386e-07, "loss": 0.115, "step": 43580 }, { "epoch": 1.0163679286526186, "grad_norm": 1.8589352369308472, "learning_rate": 6.613582664925697e-07, "loss": 0.1201, "step": 43590 }, { "epoch": 1.0166010900294367, "grad_norm": 1.5814833641052246, "learning_rate": 6.61280544674501e-07, "loss": 0.1154, "step": 43600 }, { "epoch": 1.0168342514062545, "grad_norm": 1.2742488384246826, "learning_rate": 6.612028228564322e-07, "loss": 0.1178, "step": 43610 }, { "epoch": 1.0170674127830726, "grad_norm": 1.3855184316635132, "learning_rate": 6.611251010383635e-07, "loss": 0.1173, "step": 43620 }, { "epoch": 1.0173005741598904, "grad_norm": 1.0944174528121948, "learning_rate": 6.610473792202948e-07, "loss": 0.1149, "step": 43630 }, { "epoch": 1.0175337355367085, "grad_norm": 1.301151990890503, "learning_rate": 6.609696574022259e-07, "loss": 0.1118, "step": 43640 }, { "epoch": 1.0177668969135263, "grad_norm": 1.6376922130584717, "learning_rate": 6.608919355841571e-07, "loss": 0.107, "step": 43650 }, { "epoch": 1.0180000582903441, "grad_norm": 1.46074640750885, "learning_rate": 6.608142137660883e-07, "loss": 0.1108, "step": 43660 }, { "epoch": 1.0182332196671622, "grad_norm": 1.5371512174606323, "learning_rate": 6.607364919480196e-07, "loss": 0.1088, "step": 43670 }, { "epoch": 1.01846638104398, "grad_norm": 3.038177490234375, "learning_rate": 6.606587701299509e-07, "loss": 0.1115, "step": 43680 }, { "epoch": 1.018699542420798, "grad_norm": 3.7840428352355957, "learning_rate": 6.605810483118821e-07, "loss": 0.1194, "step": 43690 }, { "epoch": 1.018932703797616, "grad_norm": 2.3782894611358643, "learning_rate": 6.605033264938134e-07, "loss": 0.1141, "step": 43700 }, { "epoch": 1.0191658651744337, "grad_norm": 1.3234293460845947, "learning_rate": 6.604256046757445e-07, "loss": 0.1075, "step": 43710 }, { "epoch": 
1.0193990265512518, "grad_norm": 2.9627318382263184, "learning_rate": 6.603478828576757e-07, "loss": 0.0992, "step": 43720 }, { "epoch": 1.0196321879280696, "grad_norm": 1.393823504447937, "learning_rate": 6.60270161039607e-07, "loss": 0.1111, "step": 43730 }, { "epoch": 1.0198653493048877, "grad_norm": 1.524849772453308, "learning_rate": 6.601924392215382e-07, "loss": 0.1246, "step": 43740 }, { "epoch": 1.0200985106817055, "grad_norm": 2.208177328109741, "learning_rate": 6.601147174034695e-07, "loss": 0.1109, "step": 43750 }, { "epoch": 1.0203316720585236, "grad_norm": 2.3663136959075928, "learning_rate": 6.600369955854008e-07, "loss": 0.1047, "step": 43760 }, { "epoch": 1.0205648334353414, "grad_norm": 3.1227433681488037, "learning_rate": 6.599592737673319e-07, "loss": 0.1128, "step": 43770 }, { "epoch": 1.0207979948121593, "grad_norm": 0.9188883900642395, "learning_rate": 6.598815519492632e-07, "loss": 0.111, "step": 43780 }, { "epoch": 1.0210311561889773, "grad_norm": 1.363269329071045, "learning_rate": 6.598038301311944e-07, "loss": 0.1207, "step": 43790 }, { "epoch": 1.0212643175657952, "grad_norm": 2.98071551322937, "learning_rate": 6.597261083131256e-07, "loss": 0.1136, "step": 43800 }, { "epoch": 1.0214974789426132, "grad_norm": 2.1742987632751465, "learning_rate": 6.596483864950569e-07, "loss": 0.1161, "step": 43810 }, { "epoch": 1.021730640319431, "grad_norm": 1.3472239971160889, "learning_rate": 6.595706646769881e-07, "loss": 0.1071, "step": 43820 }, { "epoch": 1.0219638016962491, "grad_norm": 2.3301339149475098, "learning_rate": 6.594929428589193e-07, "loss": 0.1087, "step": 43830 }, { "epoch": 1.022196963073067, "grad_norm": 1.6878116130828857, "learning_rate": 6.594152210408505e-07, "loss": 0.1236, "step": 43840 }, { "epoch": 1.0224301244498848, "grad_norm": 2.112302541732788, "learning_rate": 6.593374992227818e-07, "loss": 0.1238, "step": 43850 }, { "epoch": 1.0226632858267029, "grad_norm": 1.5826642513275146, "learning_rate": 6.592597774047131e-07, 
"loss": 0.1144, "step": 43860 }, { "epoch": 1.0228964472035207, "grad_norm": 2.6112468242645264, "learning_rate": 6.591820555866443e-07, "loss": 0.1103, "step": 43870 }, { "epoch": 1.0231296085803387, "grad_norm": 2.015915632247925, "learning_rate": 6.591043337685755e-07, "loss": 0.1349, "step": 43880 }, { "epoch": 1.0233627699571566, "grad_norm": 2.1971733570098877, "learning_rate": 6.590266119505066e-07, "loss": 0.1182, "step": 43890 }, { "epoch": 1.0235959313339744, "grad_norm": 1.4486303329467773, "learning_rate": 6.589488901324379e-07, "loss": 0.1148, "step": 43900 }, { "epoch": 1.0238290927107925, "grad_norm": 1.6248425245285034, "learning_rate": 6.588711683143692e-07, "loss": 0.1046, "step": 43910 }, { "epoch": 1.0240622540876103, "grad_norm": 3.376094102859497, "learning_rate": 6.587934464963004e-07, "loss": 0.1168, "step": 43920 }, { "epoch": 1.0242954154644284, "grad_norm": 2.284491777420044, "learning_rate": 6.587157246782317e-07, "loss": 0.1123, "step": 43930 }, { "epoch": 1.0245285768412462, "grad_norm": 2.657630205154419, "learning_rate": 6.586380028601629e-07, "loss": 0.1198, "step": 43940 }, { "epoch": 1.0247617382180643, "grad_norm": 1.2330808639526367, "learning_rate": 6.585602810420942e-07, "loss": 0.1125, "step": 43950 }, { "epoch": 1.024994899594882, "grad_norm": 1.1787165403366089, "learning_rate": 6.584825592240253e-07, "loss": 0.1086, "step": 43960 }, { "epoch": 1.0252280609717, "grad_norm": 1.5474187135696411, "learning_rate": 6.584048374059565e-07, "loss": 0.116, "step": 43970 }, { "epoch": 1.025461222348518, "grad_norm": 1.7870844602584839, "learning_rate": 6.583271155878878e-07, "loss": 0.119, "step": 43980 }, { "epoch": 1.0256943837253358, "grad_norm": 1.4270825386047363, "learning_rate": 6.58249393769819e-07, "loss": 0.1375, "step": 43990 }, { "epoch": 1.025927545102154, "grad_norm": 2.453911066055298, "learning_rate": 6.581716719517503e-07, "loss": 0.1153, "step": 44000 }, { "epoch": 1.0261607064789717, "grad_norm": 
1.2219394445419312, "learning_rate": 6.580939501336816e-07, "loss": 0.1149, "step": 44010 }, { "epoch": 1.0263938678557898, "grad_norm": 1.4378507137298584, "learning_rate": 6.580162283156127e-07, "loss": 0.1156, "step": 44020 }, { "epoch": 1.0266270292326076, "grad_norm": 1.6379135847091675, "learning_rate": 6.57938506497544e-07, "loss": 0.1017, "step": 44030 }, { "epoch": 1.0268601906094255, "grad_norm": 1.692382574081421, "learning_rate": 6.578607846794751e-07, "loss": 0.1105, "step": 44040 }, { "epoch": 1.0270933519862435, "grad_norm": 1.2166857719421387, "learning_rate": 6.577830628614064e-07, "loss": 0.1168, "step": 44050 }, { "epoch": 1.0273265133630614, "grad_norm": 1.5050774812698364, "learning_rate": 6.577053410433377e-07, "loss": 0.114, "step": 44060 }, { "epoch": 1.0275596747398794, "grad_norm": 1.3040757179260254, "learning_rate": 6.576276192252689e-07, "loss": 0.0986, "step": 44070 }, { "epoch": 1.0277928361166973, "grad_norm": 0.881609320640564, "learning_rate": 6.575498974072001e-07, "loss": 0.1041, "step": 44080 }, { "epoch": 1.0280259974935153, "grad_norm": 1.380578875541687, "learning_rate": 6.574721755891313e-07, "loss": 0.1138, "step": 44090 }, { "epoch": 1.0282591588703331, "grad_norm": 1.3185806274414062, "learning_rate": 6.573944537710626e-07, "loss": 0.1168, "step": 44100 }, { "epoch": 1.028492320247151, "grad_norm": 1.6568856239318848, "learning_rate": 6.573167319529939e-07, "loss": 0.1184, "step": 44110 }, { "epoch": 1.028725481623969, "grad_norm": 1.888848900794983, "learning_rate": 6.57239010134925e-07, "loss": 0.1182, "step": 44120 }, { "epoch": 1.0289586430007869, "grad_norm": 2.411245584487915, "learning_rate": 6.571612883168563e-07, "loss": 0.1151, "step": 44130 }, { "epoch": 1.029191804377605, "grad_norm": 1.7450308799743652, "learning_rate": 6.570835664987874e-07, "loss": 0.1116, "step": 44140 }, { "epoch": 1.0294249657544228, "grad_norm": 1.1281818151474, "learning_rate": 6.570058446807187e-07, "loss": 0.1116, "step": 44150 }, { 
"epoch": 1.0296581271312406, "grad_norm": 1.1509459018707275, "learning_rate": 6.5692812286265e-07, "loss": 0.1078, "step": 44160 }, { "epoch": 1.0298912885080587, "grad_norm": 1.6743226051330566, "learning_rate": 6.568504010445812e-07, "loss": 0.1163, "step": 44170 }, { "epoch": 1.0301244498848765, "grad_norm": 1.2658315896987915, "learning_rate": 6.567726792265125e-07, "loss": 0.1107, "step": 44180 }, { "epoch": 1.0303576112616946, "grad_norm": 1.4348046779632568, "learning_rate": 6.566949574084438e-07, "loss": 0.1088, "step": 44190 }, { "epoch": 1.0305907726385124, "grad_norm": 3.242455244064331, "learning_rate": 6.566172355903748e-07, "loss": 0.11, "step": 44200 }, { "epoch": 1.0308239340153305, "grad_norm": 1.2221251726150513, "learning_rate": 6.565395137723061e-07, "loss": 0.1125, "step": 44210 }, { "epoch": 1.0310570953921483, "grad_norm": 2.024548292160034, "learning_rate": 6.564617919542373e-07, "loss": 0.1088, "step": 44220 }, { "epoch": 1.0312902567689661, "grad_norm": 2.8575994968414307, "learning_rate": 6.563840701361686e-07, "loss": 0.1072, "step": 44230 }, { "epoch": 1.0315234181457842, "grad_norm": 1.5435208082199097, "learning_rate": 6.563063483180999e-07, "loss": 0.1106, "step": 44240 }, { "epoch": 1.031756579522602, "grad_norm": 1.7684680223464966, "learning_rate": 6.562286265000311e-07, "loss": 0.1142, "step": 44250 }, { "epoch": 1.03198974089942, "grad_norm": 1.4006153345108032, "learning_rate": 6.561509046819623e-07, "loss": 0.1165, "step": 44260 }, { "epoch": 1.032222902276238, "grad_norm": 1.7180392742156982, "learning_rate": 6.560731828638935e-07, "loss": 0.1245, "step": 44270 }, { "epoch": 1.032456063653056, "grad_norm": 1.4998630285263062, "learning_rate": 6.559954610458247e-07, "loss": 0.109, "step": 44280 }, { "epoch": 1.0326892250298738, "grad_norm": 1.8856053352355957, "learning_rate": 6.55917739227756e-07, "loss": 0.1108, "step": 44290 }, { "epoch": 1.0329223864066917, "grad_norm": 1.2925677299499512, "learning_rate": 
6.558400174096872e-07, "loss": 0.1053, "step": 44300 }, { "epoch": 1.0331555477835097, "grad_norm": 1.9862260818481445, "learning_rate": 6.557622955916185e-07, "loss": 0.1209, "step": 44310 }, { "epoch": 1.0333887091603275, "grad_norm": 3.300109386444092, "learning_rate": 6.556845737735497e-07, "loss": 0.1172, "step": 44320 }, { "epoch": 1.0336218705371456, "grad_norm": 1.3670377731323242, "learning_rate": 6.556068519554809e-07, "loss": 0.1187, "step": 44330 }, { "epoch": 1.0338550319139634, "grad_norm": 1.2458410263061523, "learning_rate": 6.555291301374122e-07, "loss": 0.1182, "step": 44340 }, { "epoch": 1.0340881932907813, "grad_norm": 2.5903525352478027, "learning_rate": 6.554514083193434e-07, "loss": 0.1176, "step": 44350 }, { "epoch": 1.0343213546675993, "grad_norm": 1.5237823724746704, "learning_rate": 6.553736865012746e-07, "loss": 0.1333, "step": 44360 }, { "epoch": 1.0345545160444172, "grad_norm": 1.7227647304534912, "learning_rate": 6.552959646832058e-07, "loss": 0.1149, "step": 44370 }, { "epoch": 1.0347876774212352, "grad_norm": 1.1982574462890625, "learning_rate": 6.552182428651371e-07, "loss": 0.1167, "step": 44380 }, { "epoch": 1.035020838798053, "grad_norm": 1.5229830741882324, "learning_rate": 6.551405210470683e-07, "loss": 0.133, "step": 44390 }, { "epoch": 1.0352540001748711, "grad_norm": 2.7775886058807373, "learning_rate": 6.550627992289995e-07, "loss": 0.1215, "step": 44400 }, { "epoch": 1.035487161551689, "grad_norm": 1.3347902297973633, "learning_rate": 6.549850774109308e-07, "loss": 0.1031, "step": 44410 }, { "epoch": 1.0357203229285068, "grad_norm": 2.0653090476989746, "learning_rate": 6.549151277746688e-07, "loss": 0.1137, "step": 44420 }, { "epoch": 1.0359534843053249, "grad_norm": 1.41093909740448, "learning_rate": 6.548374059566001e-07, "loss": 0.1211, "step": 44430 }, { "epoch": 1.0361866456821427, "grad_norm": 1.1081575155258179, "learning_rate": 6.547596841385313e-07, "loss": 0.119, "step": 44440 }, { "epoch": 1.0364198070589608, 
"grad_norm": 1.4327507019042969, "learning_rate": 6.546819623204626e-07, "loss": 0.1081, "step": 44450 }, { "epoch": 1.0366529684357786, "grad_norm": 2.855360984802246, "learning_rate": 6.546042405023938e-07, "loss": 0.1289, "step": 44460 }, { "epoch": 1.0368861298125966, "grad_norm": 3.262882709503174, "learning_rate": 6.545265186843251e-07, "loss": 0.1004, "step": 44470 }, { "epoch": 1.0371192911894145, "grad_norm": 2.113215684890747, "learning_rate": 6.544487968662563e-07, "loss": 0.1137, "step": 44480 }, { "epoch": 1.0373524525662323, "grad_norm": 2.2031710147857666, "learning_rate": 6.543710750481874e-07, "loss": 0.1205, "step": 44490 }, { "epoch": 1.0375856139430504, "grad_norm": 1.335964322090149, "learning_rate": 6.542933532301187e-07, "loss": 0.1185, "step": 44500 }, { "epoch": 1.0378187753198682, "grad_norm": 2.39542555809021, "learning_rate": 6.5421563141205e-07, "loss": 0.1131, "step": 44510 }, { "epoch": 1.0380519366966863, "grad_norm": 1.7702444791793823, "learning_rate": 6.541379095939812e-07, "loss": 0.1162, "step": 44520 }, { "epoch": 1.038285098073504, "grad_norm": 2.0382816791534424, "learning_rate": 6.540601877759125e-07, "loss": 0.1249, "step": 44530 }, { "epoch": 1.038518259450322, "grad_norm": 2.4400858879089355, "learning_rate": 6.539824659578436e-07, "loss": 0.1204, "step": 44540 }, { "epoch": 1.03875142082714, "grad_norm": 1.709467887878418, "learning_rate": 6.539047441397749e-07, "loss": 0.1127, "step": 44550 }, { "epoch": 1.0389845822039578, "grad_norm": 1.7519959211349487, "learning_rate": 6.538270223217062e-07, "loss": 0.1123, "step": 44560 }, { "epoch": 1.039217743580776, "grad_norm": 1.8187315464019775, "learning_rate": 6.537493005036373e-07, "loss": 0.1198, "step": 44570 }, { "epoch": 1.0394509049575937, "grad_norm": 1.5026847124099731, "learning_rate": 6.536715786855686e-07, "loss": 0.1039, "step": 44580 }, { "epoch": 1.0396840663344118, "grad_norm": 1.6006227731704712, "learning_rate": 6.535938568674998e-07, "loss": 0.1209, 
"step": 44590 }, { "epoch": 1.0399172277112296, "grad_norm": 2.523301362991333, "learning_rate": 6.53516135049431e-07, "loss": 0.1106, "step": 44600 }, { "epoch": 1.0401503890880475, "grad_norm": 1.763096570968628, "learning_rate": 6.534384132313623e-07, "loss": 0.1118, "step": 44610 }, { "epoch": 1.0403835504648655, "grad_norm": 3.5856287479400635, "learning_rate": 6.533606914132935e-07, "loss": 0.1047, "step": 44620 }, { "epoch": 1.0406167118416834, "grad_norm": 2.363948345184326, "learning_rate": 6.532829695952248e-07, "loss": 0.1117, "step": 44630 }, { "epoch": 1.0408498732185014, "grad_norm": 1.9318766593933105, "learning_rate": 6.53205247777156e-07, "loss": 0.1208, "step": 44640 }, { "epoch": 1.0410830345953193, "grad_norm": 3.1819405555725098, "learning_rate": 6.531275259590872e-07, "loss": 0.1165, "step": 44650 }, { "epoch": 1.0413161959721373, "grad_norm": 1.6583572626113892, "learning_rate": 6.530498041410184e-07, "loss": 0.1145, "step": 44660 }, { "epoch": 1.0415493573489552, "grad_norm": 2.327108383178711, "learning_rate": 6.529720823229496e-07, "loss": 0.1177, "step": 44670 }, { "epoch": 1.041782518725773, "grad_norm": 1.6838099956512451, "learning_rate": 6.528943605048809e-07, "loss": 0.1115, "step": 44680 }, { "epoch": 1.042015680102591, "grad_norm": 1.3867937326431274, "learning_rate": 6.528166386868121e-07, "loss": 0.1213, "step": 44690 }, { "epoch": 1.0422488414794089, "grad_norm": 2.283036231994629, "learning_rate": 6.527389168687434e-07, "loss": 0.1212, "step": 44700 }, { "epoch": 1.042482002856227, "grad_norm": 1.9204654693603516, "learning_rate": 6.526611950506747e-07, "loss": 0.1092, "step": 44710 }, { "epoch": 1.0427151642330448, "grad_norm": 2.6761832237243652, "learning_rate": 6.525834732326059e-07, "loss": 0.121, "step": 44720 }, { "epoch": 1.0429483256098626, "grad_norm": 1.8075443506240845, "learning_rate": 6.52505751414537e-07, "loss": 0.115, "step": 44730 }, { "epoch": 1.0431814869866807, "grad_norm": 1.1836861371994019, 
"learning_rate": 6.524280295964682e-07, "loss": 0.1136, "step": 44740 }, { "epoch": 1.0434146483634985, "grad_norm": 1.4073909521102905, "learning_rate": 6.523503077783995e-07, "loss": 0.1117, "step": 44750 }, { "epoch": 1.0436478097403166, "grad_norm": 1.157658576965332, "learning_rate": 6.522725859603308e-07, "loss": 0.1102, "step": 44760 }, { "epoch": 1.0438809711171344, "grad_norm": 3.6513068675994873, "learning_rate": 6.52194864142262e-07, "loss": 0.1154, "step": 44770 }, { "epoch": 1.0441141324939525, "grad_norm": 1.4354925155639648, "learning_rate": 6.521171423241933e-07, "loss": 0.1099, "step": 44780 }, { "epoch": 1.0443472938707703, "grad_norm": 1.2024857997894287, "learning_rate": 6.520394205061244e-07, "loss": 0.1146, "step": 44790 }, { "epoch": 1.0445804552475881, "grad_norm": 1.8121107816696167, "learning_rate": 6.519616986880557e-07, "loss": 0.1026, "step": 44800 }, { "epoch": 1.0448136166244062, "grad_norm": 1.465229868888855, "learning_rate": 6.518839768699869e-07, "loss": 0.1274, "step": 44810 }, { "epoch": 1.045046778001224, "grad_norm": 1.5294864177703857, "learning_rate": 6.518062550519181e-07, "loss": 0.1136, "step": 44820 }, { "epoch": 1.045279939378042, "grad_norm": 1.702199935913086, "learning_rate": 6.517285332338494e-07, "loss": 0.1181, "step": 44830 }, { "epoch": 1.04551310075486, "grad_norm": 1.0482215881347656, "learning_rate": 6.516508114157806e-07, "loss": 0.108, "step": 44840 }, { "epoch": 1.045746262131678, "grad_norm": 1.444652795791626, "learning_rate": 6.515730895977118e-07, "loss": 0.1059, "step": 44850 }, { "epoch": 1.0459794235084958, "grad_norm": 2.2239348888397217, "learning_rate": 6.514953677796431e-07, "loss": 0.1092, "step": 44860 }, { "epoch": 1.0462125848853137, "grad_norm": 1.4534024000167847, "learning_rate": 6.514176459615743e-07, "loss": 0.1178, "step": 44870 }, { "epoch": 1.0464457462621317, "grad_norm": 1.4896354675292969, "learning_rate": 6.513399241435056e-07, "loss": 0.1082, "step": 44880 }, { "epoch": 
1.0466789076389496, "grad_norm": 1.0505430698394775, "learning_rate": 6.512622023254367e-07, "loss": 0.1141, "step": 44890 }, { "epoch": 1.0469120690157676, "grad_norm": 1.839751958847046, "learning_rate": 6.51184480507368e-07, "loss": 0.1076, "step": 44900 }, { "epoch": 1.0471452303925854, "grad_norm": 1.5609982013702393, "learning_rate": 6.511067586892992e-07, "loss": 0.1116, "step": 44910 }, { "epoch": 1.0473783917694035, "grad_norm": 2.0508837699890137, "learning_rate": 6.510290368712304e-07, "loss": 0.1223, "step": 44920 }, { "epoch": 1.0476115531462213, "grad_norm": 1.2341504096984863, "learning_rate": 6.509513150531617e-07, "loss": 0.1106, "step": 44930 }, { "epoch": 1.0478447145230392, "grad_norm": 1.3730298280715942, "learning_rate": 6.50873593235093e-07, "loss": 0.1212, "step": 44940 }, { "epoch": 1.0480778758998572, "grad_norm": 1.4965519905090332, "learning_rate": 6.507958714170242e-07, "loss": 0.1168, "step": 44950 }, { "epoch": 1.048311037276675, "grad_norm": 1.8413920402526855, "learning_rate": 6.507181495989555e-07, "loss": 0.1185, "step": 44960 }, { "epoch": 1.0485441986534931, "grad_norm": 2.998333215713501, "learning_rate": 6.506404277808865e-07, "loss": 0.1259, "step": 44970 }, { "epoch": 1.048777360030311, "grad_norm": 1.1692636013031006, "learning_rate": 6.505627059628178e-07, "loss": 0.1155, "step": 44980 }, { "epoch": 1.0490105214071288, "grad_norm": 1.5496981143951416, "learning_rate": 6.50484984144749e-07, "loss": 0.1129, "step": 44990 }, { "epoch": 1.0492436827839469, "grad_norm": 1.619550108909607, "learning_rate": 6.504072623266803e-07, "loss": 0.1056, "step": 45000 }, { "epoch": 1.0494768441607647, "grad_norm": 1.5683002471923828, "learning_rate": 6.503295405086116e-07, "loss": 0.1086, "step": 45010 }, { "epoch": 1.0497100055375828, "grad_norm": 1.3725718259811401, "learning_rate": 6.502518186905428e-07, "loss": 0.117, "step": 45020 }, { "epoch": 1.0499431669144006, "grad_norm": 1.984971046447754, "learning_rate": 6.50174096872474e-07, 
"loss": 0.1116, "step": 45030 }, { "epoch": 1.0501763282912187, "grad_norm": 2.084779739379883, "learning_rate": 6.500963750544053e-07, "loss": 0.1048, "step": 45040 }, { "epoch": 1.0504094896680365, "grad_norm": 1.451111078262329, "learning_rate": 6.500186532363364e-07, "loss": 0.1188, "step": 45050 }, { "epoch": 1.0506426510448543, "grad_norm": 1.7914098501205444, "learning_rate": 6.499409314182677e-07, "loss": 0.116, "step": 45060 }, { "epoch": 1.0508758124216724, "grad_norm": 1.6526731252670288, "learning_rate": 6.498632096001989e-07, "loss": 0.1119, "step": 45070 }, { "epoch": 1.0511089737984902, "grad_norm": 1.590294361114502, "learning_rate": 6.497854877821302e-07, "loss": 0.103, "step": 45080 }, { "epoch": 1.0513421351753083, "grad_norm": 3.3140251636505127, "learning_rate": 6.497077659640614e-07, "loss": 0.1056, "step": 45090 }, { "epoch": 1.0515752965521261, "grad_norm": 1.6656885147094727, "learning_rate": 6.496300441459926e-07, "loss": 0.1169, "step": 45100 }, { "epoch": 1.0518084579289442, "grad_norm": 2.4739019870758057, "learning_rate": 6.495523223279239e-07, "loss": 0.1118, "step": 45110 }, { "epoch": 1.052041619305762, "grad_norm": 1.7590785026550293, "learning_rate": 6.494746005098551e-07, "loss": 0.1077, "step": 45120 }, { "epoch": 1.0522747806825798, "grad_norm": 1.4949195384979248, "learning_rate": 6.493968786917863e-07, "loss": 0.1099, "step": 45130 }, { "epoch": 1.052507942059398, "grad_norm": 3.5091652870178223, "learning_rate": 6.493191568737176e-07, "loss": 0.1073, "step": 45140 }, { "epoch": 1.0527411034362157, "grad_norm": 0.9745272397994995, "learning_rate": 6.492414350556488e-07, "loss": 0.1063, "step": 45150 }, { "epoch": 1.0529742648130338, "grad_norm": 2.024733304977417, "learning_rate": 6.4916371323758e-07, "loss": 0.1236, "step": 45160 }, { "epoch": 1.0532074261898516, "grad_norm": 1.5298601388931274, "learning_rate": 6.490859914195112e-07, "loss": 0.1123, "step": 45170 }, { "epoch": 1.0534405875666697, "grad_norm": 
1.5161375999450684, "learning_rate": 6.490082696014425e-07, "loss": 0.1219, "step": 45180 }, { "epoch": 1.0536737489434875, "grad_norm": 3.7005958557128906, "learning_rate": 6.489305477833738e-07, "loss": 0.111, "step": 45190 }, { "epoch": 1.0539069103203054, "grad_norm": 1.269973874092102, "learning_rate": 6.48852825965305e-07, "loss": 0.1016, "step": 45200 }, { "epoch": 1.0541400716971234, "grad_norm": 1.4797066450119019, "learning_rate": 6.487751041472362e-07, "loss": 0.1078, "step": 45210 }, { "epoch": 1.0543732330739413, "grad_norm": 1.581122636795044, "learning_rate": 6.486973823291673e-07, "loss": 0.1223, "step": 45220 }, { "epoch": 1.0546063944507593, "grad_norm": 1.055530309677124, "learning_rate": 6.486196605110986e-07, "loss": 0.1039, "step": 45230 }, { "epoch": 1.0548395558275772, "grad_norm": 1.5462536811828613, "learning_rate": 6.485419386930299e-07, "loss": 0.1151, "step": 45240 }, { "epoch": 1.055072717204395, "grad_norm": 1.383760929107666, "learning_rate": 6.484642168749611e-07, "loss": 0.1088, "step": 45250 }, { "epoch": 1.055305878581213, "grad_norm": 1.139367938041687, "learning_rate": 6.483864950568924e-07, "loss": 0.1126, "step": 45260 }, { "epoch": 1.055539039958031, "grad_norm": 2.1094307899475098, "learning_rate": 6.483087732388236e-07, "loss": 0.1129, "step": 45270 }, { "epoch": 1.055772201334849, "grad_norm": 2.832943916320801, "learning_rate": 6.482310514207548e-07, "loss": 0.1227, "step": 45280 }, { "epoch": 1.0560053627116668, "grad_norm": 1.2832075357437134, "learning_rate": 6.48153329602686e-07, "loss": 0.1114, "step": 45290 }, { "epoch": 1.0562385240884848, "grad_norm": 1.9959715604782104, "learning_rate": 6.480756077846172e-07, "loss": 0.1114, "step": 45300 }, { "epoch": 1.0564716854653027, "grad_norm": 1.8052388429641724, "learning_rate": 6.479978859665485e-07, "loss": 0.1152, "step": 45310 }, { "epoch": 1.0567048468421205, "grad_norm": 1.5807305574417114, "learning_rate": 6.479201641484797e-07, "loss": 0.1126, "step": 45320 }, { 
"epoch": 1.0569380082189386, "grad_norm": 2.281480312347412, "learning_rate": 6.47842442330411e-07, "loss": 0.1125, "step": 45330 }, { "epoch": 1.0571711695957564, "grad_norm": 1.1379305124282837, "learning_rate": 6.477647205123422e-07, "loss": 0.1101, "step": 45340 }, { "epoch": 1.0574043309725745, "grad_norm": 2.318420648574829, "learning_rate": 6.476869986942734e-07, "loss": 0.1021, "step": 45350 }, { "epoch": 1.0576374923493923, "grad_norm": 1.6458714008331299, "learning_rate": 6.476092768762047e-07, "loss": 0.1112, "step": 45360 }, { "epoch": 1.0578706537262104, "grad_norm": 2.417128324508667, "learning_rate": 6.475315550581358e-07, "loss": 0.1187, "step": 45370 }, { "epoch": 1.0581038151030282, "grad_norm": 1.0647648572921753, "learning_rate": 6.474538332400671e-07, "loss": 0.1114, "step": 45380 }, { "epoch": 1.058336976479846, "grad_norm": 1.2208307981491089, "learning_rate": 6.473761114219984e-07, "loss": 0.0959, "step": 45390 }, { "epoch": 1.058570137856664, "grad_norm": 1.7513294219970703, "learning_rate": 6.472983896039295e-07, "loss": 0.1188, "step": 45400 }, { "epoch": 1.058803299233482, "grad_norm": 1.7672779560089111, "learning_rate": 6.472206677858608e-07, "loss": 0.1087, "step": 45410 }, { "epoch": 1.0590364606103, "grad_norm": 2.5037708282470703, "learning_rate": 6.47142945967792e-07, "loss": 0.1227, "step": 45420 }, { "epoch": 1.0592696219871178, "grad_norm": 1.4115054607391357, "learning_rate": 6.470652241497233e-07, "loss": 0.1124, "step": 45430 }, { "epoch": 1.0595027833639357, "grad_norm": 1.3663074970245361, "learning_rate": 6.469875023316546e-07, "loss": 0.1166, "step": 45440 }, { "epoch": 1.0597359447407537, "grad_norm": 1.6748695373535156, "learning_rate": 6.469097805135857e-07, "loss": 0.112, "step": 45450 }, { "epoch": 1.0599691061175716, "grad_norm": 3.0224368572235107, "learning_rate": 6.468320586955169e-07, "loss": 0.1135, "step": 45460 }, { "epoch": 1.0602022674943896, "grad_norm": 2.027961015701294, "learning_rate": 
6.467543368774482e-07, "loss": 0.1057, "step": 45470 }, { "epoch": 1.0604354288712075, "grad_norm": 1.292020559310913, "learning_rate": 6.466766150593794e-07, "loss": 0.1086, "step": 45480 }, { "epoch": 1.0606685902480255, "grad_norm": 1.3989064693450928, "learning_rate": 6.465988932413107e-07, "loss": 0.1203, "step": 45490 }, { "epoch": 1.0609017516248433, "grad_norm": 1.6335963010787964, "learning_rate": 6.465211714232419e-07, "loss": 0.1277, "step": 45500 }, { "epoch": 1.0611349130016612, "grad_norm": 1.4152333736419678, "learning_rate": 6.464434496051732e-07, "loss": 0.1161, "step": 45510 }, { "epoch": 1.0613680743784792, "grad_norm": 2.255457878112793, "learning_rate": 6.463657277871045e-07, "loss": 0.1159, "step": 45520 }, { "epoch": 1.061601235755297, "grad_norm": 1.2700965404510498, "learning_rate": 6.462880059690355e-07, "loss": 0.1133, "step": 45530 }, { "epoch": 1.0618343971321151, "grad_norm": 2.4973256587982178, "learning_rate": 6.462102841509668e-07, "loss": 0.1144, "step": 45540 }, { "epoch": 1.062067558508933, "grad_norm": 2.744426727294922, "learning_rate": 6.46132562332898e-07, "loss": 0.1169, "step": 45550 }, { "epoch": 1.062300719885751, "grad_norm": 2.132676362991333, "learning_rate": 6.460548405148293e-07, "loss": 0.1013, "step": 45560 }, { "epoch": 1.0625338812625689, "grad_norm": 1.2025562524795532, "learning_rate": 6.459771186967606e-07, "loss": 0.1058, "step": 45570 }, { "epoch": 1.0627670426393867, "grad_norm": 1.6987452507019043, "learning_rate": 6.458993968786918e-07, "loss": 0.1191, "step": 45580 }, { "epoch": 1.0630002040162048, "grad_norm": 2.8090980052948, "learning_rate": 6.45821675060623e-07, "loss": 0.1203, "step": 45590 }, { "epoch": 1.0632333653930226, "grad_norm": 1.172167181968689, "learning_rate": 6.457439532425542e-07, "loss": 0.116, "step": 45600 }, { "epoch": 1.0634665267698407, "grad_norm": 1.2037959098815918, "learning_rate": 6.456662314244854e-07, "loss": 0.1132, "step": 45610 }, { "epoch": 1.0636996881466585, 
"grad_norm": 1.5645196437835693, "learning_rate": 6.455885096064167e-07, "loss": 0.0989, "step": 45620 }, { "epoch": 1.0639328495234763, "grad_norm": 1.2489609718322754, "learning_rate": 6.455107877883479e-07, "loss": 0.1238, "step": 45630 }, { "epoch": 1.0641660109002944, "grad_norm": 1.496058702468872, "learning_rate": 6.454330659702792e-07, "loss": 0.1051, "step": 45640 }, { "epoch": 1.0643991722771122, "grad_norm": 1.7895981073379517, "learning_rate": 6.453553441522103e-07, "loss": 0.1179, "step": 45650 }, { "epoch": 1.0646323336539303, "grad_norm": 1.7551101446151733, "learning_rate": 6.452776223341416e-07, "loss": 0.1137, "step": 45660 }, { "epoch": 1.0648654950307481, "grad_norm": 1.7277415990829468, "learning_rate": 6.451999005160729e-07, "loss": 0.1113, "step": 45670 }, { "epoch": 1.0650986564075662, "grad_norm": 1.208975076675415, "learning_rate": 6.451221786980041e-07, "loss": 0.1066, "step": 45680 }, { "epoch": 1.065331817784384, "grad_norm": 2.890282392501831, "learning_rate": 6.450444568799354e-07, "loss": 0.1162, "step": 45690 }, { "epoch": 1.0655649791612019, "grad_norm": 2.098273515701294, "learning_rate": 6.449667350618665e-07, "loss": 0.1182, "step": 45700 }, { "epoch": 1.06579814053802, "grad_norm": 1.8060415983200073, "learning_rate": 6.448890132437977e-07, "loss": 0.1165, "step": 45710 }, { "epoch": 1.0660313019148377, "grad_norm": 2.2379150390625, "learning_rate": 6.44811291425729e-07, "loss": 0.1153, "step": 45720 }, { "epoch": 1.0662644632916558, "grad_norm": 1.6425235271453857, "learning_rate": 6.447335696076602e-07, "loss": 0.1088, "step": 45730 }, { "epoch": 1.0664976246684736, "grad_norm": 1.1846970319747925, "learning_rate": 6.446558477895915e-07, "loss": 0.1182, "step": 45740 }, { "epoch": 1.0667307860452917, "grad_norm": 1.2876808643341064, "learning_rate": 6.445781259715227e-07, "loss": 0.1121, "step": 45750 }, { "epoch": 1.0669639474221095, "grad_norm": 1.2656034231185913, "learning_rate": 6.44500404153454e-07, "loss": 0.1136, 
"step": 45760 }, { "epoch": 1.0671971087989274, "grad_norm": 1.5431346893310547, "learning_rate": 6.444226823353852e-07, "loss": 0.112, "step": 45770 }, { "epoch": 1.0674302701757454, "grad_norm": 3.5791726112365723, "learning_rate": 6.443449605173163e-07, "loss": 0.107, "step": 45780 }, { "epoch": 1.0676634315525633, "grad_norm": 1.3563786745071411, "learning_rate": 6.442672386992476e-07, "loss": 0.104, "step": 45790 }, { "epoch": 1.0678965929293813, "grad_norm": 1.7864720821380615, "learning_rate": 6.441895168811788e-07, "loss": 0.1154, "step": 45800 }, { "epoch": 1.0681297543061992, "grad_norm": 1.1612541675567627, "learning_rate": 6.441117950631101e-07, "loss": 0.0994, "step": 45810 }, { "epoch": 1.068362915683017, "grad_norm": 1.3528157472610474, "learning_rate": 6.440340732450414e-07, "loss": 0.1251, "step": 45820 }, { "epoch": 1.068596077059835, "grad_norm": 3.7525875568389893, "learning_rate": 6.439563514269725e-07, "loss": 0.1064, "step": 45830 }, { "epoch": 1.068829238436653, "grad_norm": 1.888352632522583, "learning_rate": 6.438786296089038e-07, "loss": 0.1043, "step": 45840 }, { "epoch": 1.069062399813471, "grad_norm": 1.7437832355499268, "learning_rate": 6.438009077908351e-07, "loss": 0.1232, "step": 45850 }, { "epoch": 1.0692955611902888, "grad_norm": 1.6605356931686401, "learning_rate": 6.437231859727662e-07, "loss": 0.1015, "step": 45860 }, { "epoch": 1.0695287225671068, "grad_norm": 1.3665649890899658, "learning_rate": 6.436454641546975e-07, "loss": 0.1069, "step": 45870 }, { "epoch": 1.0697618839439247, "grad_norm": 2.385838747024536, "learning_rate": 6.435677423366287e-07, "loss": 0.1097, "step": 45880 }, { "epoch": 1.0699950453207425, "grad_norm": 2.302088737487793, "learning_rate": 6.434900205185599e-07, "loss": 0.1119, "step": 45890 }, { "epoch": 1.0702282066975606, "grad_norm": 2.047966241836548, "learning_rate": 6.434122987004912e-07, "loss": 0.1189, "step": 45900 }, { "epoch": 1.0704613680743784, "grad_norm": 1.3964240550994873, 
"learning_rate": 6.433345768824224e-07, "loss": 0.1214, "step": 45910 }, { "epoch": 1.0706945294511965, "grad_norm": 1.7103646993637085, "learning_rate": 6.432568550643537e-07, "loss": 0.1168, "step": 45920 }, { "epoch": 1.0709276908280143, "grad_norm": 1.9058862924575806, "learning_rate": 6.431791332462849e-07, "loss": 0.1087, "step": 45930 }, { "epoch": 1.0711608522048324, "grad_norm": 1.4557853937149048, "learning_rate": 6.431014114282161e-07, "loss": 0.1273, "step": 45940 }, { "epoch": 1.0713940135816502, "grad_norm": 1.488467812538147, "learning_rate": 6.430236896101473e-07, "loss": 0.1168, "step": 45950 }, { "epoch": 1.071627174958468, "grad_norm": 2.2648136615753174, "learning_rate": 6.429459677920785e-07, "loss": 0.1083, "step": 45960 }, { "epoch": 1.071860336335286, "grad_norm": 3.2629470825195312, "learning_rate": 6.428682459740098e-07, "loss": 0.1176, "step": 45970 }, { "epoch": 1.072093497712104, "grad_norm": 1.9954882860183716, "learning_rate": 6.42790524155941e-07, "loss": 0.1174, "step": 45980 }, { "epoch": 1.072326659088922, "grad_norm": 1.1968982219696045, "learning_rate": 6.427128023378723e-07, "loss": 0.1102, "step": 45990 }, { "epoch": 1.0725598204657398, "grad_norm": 1.5581029653549194, "learning_rate": 6.426350805198036e-07, "loss": 0.1061, "step": 46000 }, { "epoch": 1.0727929818425577, "grad_norm": 1.2064553499221802, "learning_rate": 6.425573587017348e-07, "loss": 0.111, "step": 46010 }, { "epoch": 1.0730261432193757, "grad_norm": 1.1638894081115723, "learning_rate": 6.424796368836659e-07, "loss": 0.1045, "step": 46020 }, { "epoch": 1.0732593045961936, "grad_norm": 1.6195390224456787, "learning_rate": 6.424019150655971e-07, "loss": 0.1238, "step": 46030 }, { "epoch": 1.0734924659730116, "grad_norm": 1.4437416791915894, "learning_rate": 6.423241932475284e-07, "loss": 0.1217, "step": 46040 }, { "epoch": 1.0737256273498295, "grad_norm": 2.402043104171753, "learning_rate": 6.422464714294597e-07, "loss": 0.1132, "step": 46050 }, { "epoch": 
1.0739587887266475, "grad_norm": 1.5726667642593384, "learning_rate": 6.421687496113909e-07, "loss": 0.1102, "step": 46060 }, { "epoch": 1.0741919501034654, "grad_norm": 2.089341878890991, "learning_rate": 6.420910277933222e-07, "loss": 0.1196, "step": 46070 }, { "epoch": 1.0744251114802832, "grad_norm": 1.3555387258529663, "learning_rate": 6.420133059752533e-07, "loss": 0.1104, "step": 46080 }, { "epoch": 1.0746582728571012, "grad_norm": 1.4283040761947632, "learning_rate": 6.419355841571846e-07, "loss": 0.1093, "step": 46090 }, { "epoch": 1.074891434233919, "grad_norm": 1.3199251890182495, "learning_rate": 6.418578623391158e-07, "loss": 0.1142, "step": 46100 }, { "epoch": 1.0751245956107371, "grad_norm": 1.2128064632415771, "learning_rate": 6.41780140521047e-07, "loss": 0.1033, "step": 46110 }, { "epoch": 1.075357756987555, "grad_norm": 2.1192779541015625, "learning_rate": 6.417024187029783e-07, "loss": 0.1039, "step": 46120 }, { "epoch": 1.075590918364373, "grad_norm": 2.00921893119812, "learning_rate": 6.416246968849095e-07, "loss": 0.1036, "step": 46130 }, { "epoch": 1.0758240797411909, "grad_norm": 1.4485703706741333, "learning_rate": 6.415469750668407e-07, "loss": 0.1262, "step": 46140 }, { "epoch": 1.0760572411180087, "grad_norm": 2.2270758152008057, "learning_rate": 6.41469253248772e-07, "loss": 0.115, "step": 46150 }, { "epoch": 1.0762904024948268, "grad_norm": 1.2808618545532227, "learning_rate": 6.413915314307032e-07, "loss": 0.12, "step": 46160 }, { "epoch": 1.0765235638716446, "grad_norm": 1.7776660919189453, "learning_rate": 6.413138096126345e-07, "loss": 0.1065, "step": 46170 }, { "epoch": 1.0767567252484627, "grad_norm": 2.568817138671875, "learning_rate": 6.412360877945656e-07, "loss": 0.1089, "step": 46180 }, { "epoch": 1.0769898866252805, "grad_norm": 1.9754468202590942, "learning_rate": 6.411583659764969e-07, "loss": 0.1121, "step": 46190 }, { "epoch": 1.0772230480020983, "grad_norm": 1.3292651176452637, "learning_rate": 6.410806441584281e-07, 
"loss": 0.106, "step": 46200 }, { "epoch": 1.0774562093789164, "grad_norm": 1.133172869682312, "learning_rate": 6.410029223403593e-07, "loss": 0.1103, "step": 46210 }, { "epoch": 1.0776893707557342, "grad_norm": 1.6242691278457642, "learning_rate": 6.409252005222906e-07, "loss": 0.1185, "step": 46220 }, { "epoch": 1.0779225321325523, "grad_norm": 1.5482118129730225, "learning_rate": 6.408474787042219e-07, "loss": 0.1156, "step": 46230 }, { "epoch": 1.0781556935093701, "grad_norm": 1.744780421257019, "learning_rate": 6.407697568861531e-07, "loss": 0.1141, "step": 46240 }, { "epoch": 1.0783888548861882, "grad_norm": 3.0050880908966064, "learning_rate": 6.406920350680844e-07, "loss": 0.1139, "step": 46250 }, { "epoch": 1.078622016263006, "grad_norm": 3.074352741241455, "learning_rate": 6.406143132500154e-07, "loss": 0.1101, "step": 46260 }, { "epoch": 1.0788551776398239, "grad_norm": 3.201072931289673, "learning_rate": 6.405365914319467e-07, "loss": 0.109, "step": 46270 }, { "epoch": 1.079088339016642, "grad_norm": 1.3144491910934448, "learning_rate": 6.40458869613878e-07, "loss": 0.1091, "step": 46280 }, { "epoch": 1.0793215003934598, "grad_norm": 1.3383278846740723, "learning_rate": 6.403811477958092e-07, "loss": 0.1117, "step": 46290 }, { "epoch": 1.0795546617702778, "grad_norm": 1.353831171989441, "learning_rate": 6.403034259777405e-07, "loss": 0.1316, "step": 46300 }, { "epoch": 1.0797878231470956, "grad_norm": 2.6064324378967285, "learning_rate": 6.402257041596717e-07, "loss": 0.1097, "step": 46310 }, { "epoch": 1.0800209845239137, "grad_norm": 1.137412190437317, "learning_rate": 6.401479823416029e-07, "loss": 0.0995, "step": 46320 }, { "epoch": 1.0802541459007315, "grad_norm": 2.097684621810913, "learning_rate": 6.400702605235342e-07, "loss": 0.1234, "step": 46330 }, { "epoch": 1.0804873072775494, "grad_norm": 2.217923402786255, "learning_rate": 6.399925387054653e-07, "loss": 0.1072, "step": 46340 }, { "epoch": 1.0807204686543674, "grad_norm": 
2.200488805770874, "learning_rate": 6.399148168873966e-07, "loss": 0.1036, "step": 46350 }, { "epoch": 1.0809536300311853, "grad_norm": 1.668283224105835, "learning_rate": 6.398370950693278e-07, "loss": 0.1109, "step": 46360 }, { "epoch": 1.0811867914080033, "grad_norm": 2.225203514099121, "learning_rate": 6.397593732512591e-07, "loss": 0.1107, "step": 46370 }, { "epoch": 1.0814199527848212, "grad_norm": 1.6004786491394043, "learning_rate": 6.396816514331903e-07, "loss": 0.1182, "step": 46380 }, { "epoch": 1.081653114161639, "grad_norm": 1.284532070159912, "learning_rate": 6.396039296151215e-07, "loss": 0.118, "step": 46390 }, { "epoch": 1.081886275538457, "grad_norm": 1.3452483415603638, "learning_rate": 6.395262077970528e-07, "loss": 0.1082, "step": 46400 }, { "epoch": 1.082119436915275, "grad_norm": 1.917757511138916, "learning_rate": 6.39448485978984e-07, "loss": 0.1256, "step": 46410 }, { "epoch": 1.082352598292093, "grad_norm": 2.5155820846557617, "learning_rate": 6.393707641609152e-07, "loss": 0.1074, "step": 46420 }, { "epoch": 1.0825857596689108, "grad_norm": 2.556055784225464, "learning_rate": 6.392930423428465e-07, "loss": 0.1187, "step": 46430 }, { "epoch": 1.0828189210457289, "grad_norm": 1.4028669595718384, "learning_rate": 6.392153205247777e-07, "loss": 0.1213, "step": 46440 }, { "epoch": 1.0830520824225467, "grad_norm": 3.6735520362854004, "learning_rate": 6.391453708885158e-07, "loss": 0.1101, "step": 46450 }, { "epoch": 1.0832852437993647, "grad_norm": 1.9934861660003662, "learning_rate": 6.390676490704471e-07, "loss": 0.0983, "step": 46460 }, { "epoch": 1.0835184051761826, "grad_norm": 2.0436851978302, "learning_rate": 6.389899272523783e-07, "loss": 0.1089, "step": 46470 }, { "epoch": 1.0837515665530004, "grad_norm": 1.4691343307495117, "learning_rate": 6.389122054343094e-07, "loss": 0.1128, "step": 46480 }, { "epoch": 1.0839847279298185, "grad_norm": 1.4367237091064453, "learning_rate": 6.388344836162407e-07, "loss": 0.1217, "step": 46490 }, { 
"epoch": 1.0842178893066363, "grad_norm": 5.482792377471924, "learning_rate": 6.387567617981719e-07, "loss": 0.1164, "step": 46500 }, { "epoch": 1.0844510506834544, "grad_norm": 1.4014617204666138, "learning_rate": 6.386790399801032e-07, "loss": 0.1113, "step": 46510 }, { "epoch": 1.0846842120602722, "grad_norm": 2.8686985969543457, "learning_rate": 6.386013181620345e-07, "loss": 0.1222, "step": 46520 }, { "epoch": 1.08491737343709, "grad_norm": 1.3266617059707642, "learning_rate": 6.385235963439657e-07, "loss": 0.1064, "step": 46530 }, { "epoch": 1.085150534813908, "grad_norm": 1.2950725555419922, "learning_rate": 6.384458745258969e-07, "loss": 0.1198, "step": 46540 }, { "epoch": 1.085383696190726, "grad_norm": 2.1201741695404053, "learning_rate": 6.38368152707828e-07, "loss": 0.1192, "step": 46550 }, { "epoch": 1.085616857567544, "grad_norm": 2.0965635776519775, "learning_rate": 6.382904308897593e-07, "loss": 0.1084, "step": 46560 }, { "epoch": 1.0858500189443618, "grad_norm": 1.3826504945755005, "learning_rate": 6.382127090716906e-07, "loss": 0.1116, "step": 46570 }, { "epoch": 1.08608318032118, "grad_norm": 1.2856802940368652, "learning_rate": 6.381349872536218e-07, "loss": 0.1166, "step": 46580 }, { "epoch": 1.0863163416979977, "grad_norm": 1.641489863395691, "learning_rate": 6.380572654355531e-07, "loss": 0.1154, "step": 46590 }, { "epoch": 1.0865495030748156, "grad_norm": 1.2780003547668457, "learning_rate": 6.379795436174842e-07, "loss": 0.1079, "step": 46600 }, { "epoch": 1.0867826644516336, "grad_norm": 1.820907473564148, "learning_rate": 6.379018217994155e-07, "loss": 0.1149, "step": 46610 }, { "epoch": 1.0870158258284515, "grad_norm": 1.8933345079421997, "learning_rate": 6.378240999813468e-07, "loss": 0.1171, "step": 46620 }, { "epoch": 1.0872489872052695, "grad_norm": 1.3307849168777466, "learning_rate": 6.377463781632779e-07, "loss": 0.1288, "step": 46630 }, { "epoch": 1.0874821485820874, "grad_norm": 2.9945263862609863, "learning_rate": 
6.376686563452092e-07, "loss": 0.1196, "step": 46640 }, { "epoch": 1.0877153099589054, "grad_norm": 1.298638105392456, "learning_rate": 6.375909345271405e-07, "loss": 0.1116, "step": 46650 }, { "epoch": 1.0879484713357233, "grad_norm": 1.2352566719055176, "learning_rate": 6.375132127090716e-07, "loss": 0.1098, "step": 46660 }, { "epoch": 1.088181632712541, "grad_norm": 1.6759308576583862, "learning_rate": 6.374354908910029e-07, "loss": 0.113, "step": 46670 }, { "epoch": 1.0884147940893591, "grad_norm": 2.3645505905151367, "learning_rate": 6.373577690729341e-07, "loss": 0.1058, "step": 46680 }, { "epoch": 1.088647955466177, "grad_norm": 2.015742540359497, "learning_rate": 6.372800472548654e-07, "loss": 0.1065, "step": 46690 }, { "epoch": 1.088881116842995, "grad_norm": 1.4858436584472656, "learning_rate": 6.372023254367967e-07, "loss": 0.1232, "step": 46700 }, { "epoch": 1.0891142782198129, "grad_norm": 1.7862575054168701, "learning_rate": 6.371246036187278e-07, "loss": 0.1151, "step": 46710 }, { "epoch": 1.0893474395966307, "grad_norm": 2.6268086433410645, "learning_rate": 6.37046881800659e-07, "loss": 0.1114, "step": 46720 }, { "epoch": 1.0895806009734488, "grad_norm": 1.8145521879196167, "learning_rate": 6.369691599825902e-07, "loss": 0.1159, "step": 46730 }, { "epoch": 1.0898137623502666, "grad_norm": 3.5066933631896973, "learning_rate": 6.368914381645215e-07, "loss": 0.1148, "step": 46740 }, { "epoch": 1.0900469237270847, "grad_norm": 2.579432487487793, "learning_rate": 6.368214885282597e-07, "loss": 0.112, "step": 46750 }, { "epoch": 1.0902800851039025, "grad_norm": 2.8358826637268066, "learning_rate": 6.367437667101908e-07, "loss": 0.1017, "step": 46760 }, { "epoch": 1.0905132464807206, "grad_norm": 1.7445755004882812, "learning_rate": 6.36666044892122e-07, "loss": 0.1113, "step": 46770 }, { "epoch": 1.0907464078575384, "grad_norm": 1.4769986867904663, "learning_rate": 6.365883230740533e-07, "loss": 0.1201, "step": 46780 }, { "epoch": 1.0909795692343562, 
"grad_norm": 1.3468486070632935, "learning_rate": 6.365106012559846e-07, "loss": 0.1144, "step": 46790 }, { "epoch": 1.0912127306111743, "grad_norm": 3.4156413078308105, "learning_rate": 6.364328794379158e-07, "loss": 0.1038, "step": 46800 }, { "epoch": 1.0914458919879921, "grad_norm": 3.4929940700531006, "learning_rate": 6.363551576198471e-07, "loss": 0.1048, "step": 46810 }, { "epoch": 1.0916790533648102, "grad_norm": 2.0423431396484375, "learning_rate": 6.362774358017782e-07, "loss": 0.1202, "step": 46820 }, { "epoch": 1.091912214741628, "grad_norm": 1.159523367881775, "learning_rate": 6.361997139837095e-07, "loss": 0.098, "step": 46830 }, { "epoch": 1.092145376118446, "grad_norm": 3.3343687057495117, "learning_rate": 6.361219921656407e-07, "loss": 0.1243, "step": 46840 }, { "epoch": 1.092378537495264, "grad_norm": 1.9682157039642334, "learning_rate": 6.360442703475719e-07, "loss": 0.1196, "step": 46850 }, { "epoch": 1.0926116988720818, "grad_norm": 1.521065354347229, "learning_rate": 6.359665485295032e-07, "loss": 0.117, "step": 46860 }, { "epoch": 1.0928448602488998, "grad_norm": 1.1563094854354858, "learning_rate": 6.358888267114344e-07, "loss": 0.1171, "step": 46870 }, { "epoch": 1.0930780216257177, "grad_norm": 1.426308512687683, "learning_rate": 6.358111048933656e-07, "loss": 0.1109, "step": 46880 }, { "epoch": 1.0933111830025357, "grad_norm": 1.3020095825195312, "learning_rate": 6.357333830752969e-07, "loss": 0.1131, "step": 46890 }, { "epoch": 1.0935443443793536, "grad_norm": 1.1363791227340698, "learning_rate": 6.356556612572281e-07, "loss": 0.1056, "step": 46900 }, { "epoch": 1.0937775057561714, "grad_norm": 1.7841291427612305, "learning_rate": 6.355779394391594e-07, "loss": 0.1112, "step": 46910 }, { "epoch": 1.0940106671329894, "grad_norm": 1.1509348154067993, "learning_rate": 6.355002176210905e-07, "loss": 0.111, "step": 46920 }, { "epoch": 1.0942438285098073, "grad_norm": 1.1433066129684448, "learning_rate": 6.354224958030218e-07, "loss": 0.0993, 
"step": 46930 }, { "epoch": 1.0944769898866253, "grad_norm": 1.8816578388214111, "learning_rate": 6.35344773984953e-07, "loss": 0.1113, "step": 46940 }, { "epoch": 1.0947101512634432, "grad_norm": 1.2542181015014648, "learning_rate": 6.352670521668842e-07, "loss": 0.1049, "step": 46950 }, { "epoch": 1.0949433126402612, "grad_norm": 1.4601441621780396, "learning_rate": 6.351893303488155e-07, "loss": 0.1214, "step": 46960 }, { "epoch": 1.095176474017079, "grad_norm": 2.1797025203704834, "learning_rate": 6.351116085307468e-07, "loss": 0.1126, "step": 46970 }, { "epoch": 1.095409635393897, "grad_norm": 2.6053340435028076, "learning_rate": 6.35033886712678e-07, "loss": 0.106, "step": 46980 }, { "epoch": 1.095642796770715, "grad_norm": 2.364985704421997, "learning_rate": 6.349561648946093e-07, "loss": 0.1062, "step": 46990 }, { "epoch": 1.0958759581475328, "grad_norm": 1.392897367477417, "learning_rate": 6.348784430765403e-07, "loss": 0.1183, "step": 47000 }, { "epoch": 1.0961091195243509, "grad_norm": 1.363795518875122, "learning_rate": 6.348007212584716e-07, "loss": 0.1132, "step": 47010 }, { "epoch": 1.0963422809011687, "grad_norm": 3.6988143920898438, "learning_rate": 6.347229994404029e-07, "loss": 0.1123, "step": 47020 }, { "epoch": 1.0965754422779868, "grad_norm": 2.858147144317627, "learning_rate": 6.346452776223341e-07, "loss": 0.1077, "step": 47030 }, { "epoch": 1.0968086036548046, "grad_norm": 1.6635977029800415, "learning_rate": 6.345675558042654e-07, "loss": 0.1251, "step": 47040 }, { "epoch": 1.0970417650316224, "grad_norm": 2.9285361766815186, "learning_rate": 6.344898339861966e-07, "loss": 0.1091, "step": 47050 }, { "epoch": 1.0972749264084405, "grad_norm": 3.5509824752807617, "learning_rate": 6.344121121681278e-07, "loss": 0.118, "step": 47060 }, { "epoch": 1.0975080877852583, "grad_norm": 3.6487152576446533, "learning_rate": 6.343343903500591e-07, "loss": 0.1099, "step": 47070 }, { "epoch": 1.0977412491620764, "grad_norm": 1.6482609510421753, 
"learning_rate": 6.342566685319902e-07, "loss": 0.1054, "step": 47080 }, { "epoch": 1.0979744105388942, "grad_norm": 2.1241626739501953, "learning_rate": 6.341789467139215e-07, "loss": 0.1062, "step": 47090 }, { "epoch": 1.098207571915712, "grad_norm": 1.3625248670578003, "learning_rate": 6.341012248958527e-07, "loss": 0.1083, "step": 47100 }, { "epoch": 1.0984407332925301, "grad_norm": 1.945969820022583, "learning_rate": 6.34023503077784e-07, "loss": 0.1164, "step": 47110 }, { "epoch": 1.098673894669348, "grad_norm": 2.895866870880127, "learning_rate": 6.339457812597152e-07, "loss": 0.13, "step": 47120 }, { "epoch": 1.098907056046166, "grad_norm": 1.1004410982131958, "learning_rate": 6.338680594416464e-07, "loss": 0.1062, "step": 47130 }, { "epoch": 1.0991402174229838, "grad_norm": 1.074930191040039, "learning_rate": 6.337903376235777e-07, "loss": 0.106, "step": 47140 }, { "epoch": 1.099373378799802, "grad_norm": 1.9989049434661865, "learning_rate": 6.337126158055089e-07, "loss": 0.1161, "step": 47150 }, { "epoch": 1.0996065401766197, "grad_norm": 1.7310218811035156, "learning_rate": 6.336348939874401e-07, "loss": 0.1056, "step": 47160 }, { "epoch": 1.0998397015534376, "grad_norm": 1.9112998247146606, "learning_rate": 6.335571721693714e-07, "loss": 0.1099, "step": 47170 }, { "epoch": 1.1000728629302556, "grad_norm": 1.7668532133102417, "learning_rate": 6.334794503513025e-07, "loss": 0.1129, "step": 47180 }, { "epoch": 1.1003060243070735, "grad_norm": 1.357543706893921, "learning_rate": 6.334017285332338e-07, "loss": 0.1062, "step": 47190 }, { "epoch": 1.1005391856838915, "grad_norm": 1.586696743965149, "learning_rate": 6.33324006715165e-07, "loss": 0.1099, "step": 47200 }, { "epoch": 1.1007723470607094, "grad_norm": 1.232572078704834, "learning_rate": 6.332462848970963e-07, "loss": 0.1113, "step": 47210 }, { "epoch": 1.1010055084375274, "grad_norm": 1.6000021696090698, "learning_rate": 6.331685630790276e-07, "loss": 0.1113, "step": 47220 }, { "epoch": 
1.1012386698143453, "grad_norm": 1.4905577898025513, "learning_rate": 6.330908412609588e-07, "loss": 0.102, "step": 47230 }, { "epoch": 1.101471831191163, "grad_norm": 1.2099158763885498, "learning_rate": 6.3301311944289e-07, "loss": 0.1072, "step": 47240 }, { "epoch": 1.1017049925679812, "grad_norm": 1.3969314098358154, "learning_rate": 6.329353976248211e-07, "loss": 0.1024, "step": 47250 }, { "epoch": 1.101938153944799, "grad_norm": 1.3676989078521729, "learning_rate": 6.328576758067524e-07, "loss": 0.1118, "step": 47260 }, { "epoch": 1.102171315321617, "grad_norm": 1.3792967796325684, "learning_rate": 6.327799539886837e-07, "loss": 0.1141, "step": 47270 }, { "epoch": 1.1024044766984349, "grad_norm": 1.7440359592437744, "learning_rate": 6.327022321706149e-07, "loss": 0.1234, "step": 47280 }, { "epoch": 1.1026376380752527, "grad_norm": 1.5299979448318481, "learning_rate": 6.326245103525462e-07, "loss": 0.1061, "step": 47290 }, { "epoch": 1.1028707994520708, "grad_norm": 2.028294563293457, "learning_rate": 6.325467885344775e-07, "loss": 0.1004, "step": 47300 }, { "epoch": 1.1031039608288886, "grad_norm": 2.6638498306274414, "learning_rate": 6.324690667164086e-07, "loss": 0.1117, "step": 47310 }, { "epoch": 1.1033371222057067, "grad_norm": 1.399686336517334, "learning_rate": 6.323913448983398e-07, "loss": 0.1166, "step": 47320 }, { "epoch": 1.1035702835825245, "grad_norm": 3.330094814300537, "learning_rate": 6.32313623080271e-07, "loss": 0.1122, "step": 47330 }, { "epoch": 1.1038034449593426, "grad_norm": 1.7540687322616577, "learning_rate": 6.322359012622023e-07, "loss": 0.1177, "step": 47340 }, { "epoch": 1.1040366063361604, "grad_norm": 0.9911400675773621, "learning_rate": 6.321581794441335e-07, "loss": 0.1048, "step": 47350 }, { "epoch": 1.1042697677129782, "grad_norm": 1.592724084854126, "learning_rate": 6.320804576260648e-07, "loss": 0.1107, "step": 47360 }, { "epoch": 1.1045029290897963, "grad_norm": 1.4645808935165405, "learning_rate": 6.32002735807996e-07, 
"loss": 0.1075, "step": 47370 }, { "epoch": 1.1047360904666141, "grad_norm": 1.8607364892959595, "learning_rate": 6.319250139899272e-07, "loss": 0.1038, "step": 47380 }, { "epoch": 1.1049692518434322, "grad_norm": 2.794891834259033, "learning_rate": 6.318472921718585e-07, "loss": 0.1125, "step": 47390 }, { "epoch": 1.10520241322025, "grad_norm": 1.763384461402893, "learning_rate": 6.317695703537896e-07, "loss": 0.1116, "step": 47400 }, { "epoch": 1.105435574597068, "grad_norm": 2.383317708969116, "learning_rate": 6.316918485357209e-07, "loss": 0.1198, "step": 47410 }, { "epoch": 1.105668735973886, "grad_norm": 1.5261741876602173, "learning_rate": 6.316141267176522e-07, "loss": 0.1077, "step": 47420 }, { "epoch": 1.1059018973507038, "grad_norm": 1.3235958814620972, "learning_rate": 6.315364048995833e-07, "loss": 0.1154, "step": 47430 }, { "epoch": 1.1061350587275218, "grad_norm": 1.2231531143188477, "learning_rate": 6.314586830815146e-07, "loss": 0.1169, "step": 47440 }, { "epoch": 1.1063682201043397, "grad_norm": 1.8439115285873413, "learning_rate": 6.313809612634459e-07, "loss": 0.1196, "step": 47450 }, { "epoch": 1.1066013814811577, "grad_norm": 1.2878104448318481, "learning_rate": 6.313032394453771e-07, "loss": 0.1118, "step": 47460 }, { "epoch": 1.1068345428579756, "grad_norm": 2.265812635421753, "learning_rate": 6.312255176273084e-07, "loss": 0.1203, "step": 47470 }, { "epoch": 1.1070677042347934, "grad_norm": 1.2318804264068604, "learning_rate": 6.311477958092395e-07, "loss": 0.1039, "step": 47480 }, { "epoch": 1.1073008656116115, "grad_norm": 1.631874680519104, "learning_rate": 6.310700739911707e-07, "loss": 0.1206, "step": 47490 }, { "epoch": 1.1075340269884293, "grad_norm": 1.221512794494629, "learning_rate": 6.30992352173102e-07, "loss": 0.1194, "step": 47500 }, { "epoch": 1.1077671883652473, "grad_norm": 1.3602508306503296, "learning_rate": 6.309146303550332e-07, "loss": 0.1205, "step": 47510 }, { "epoch": 1.1080003497420652, "grad_norm": 
1.281855583190918, "learning_rate": 6.308369085369645e-07, "loss": 0.1117, "step": 47520 }, { "epoch": 1.1082335111188832, "grad_norm": 1.7942957878112793, "learning_rate": 6.307591867188957e-07, "loss": 0.1137, "step": 47530 }, { "epoch": 1.108466672495701, "grad_norm": 2.0347213745117188, "learning_rate": 6.30681464900827e-07, "loss": 0.1113, "step": 47540 }, { "epoch": 1.108699833872519, "grad_norm": 2.268110990524292, "learning_rate": 6.306037430827583e-07, "loss": 0.1055, "step": 47550 }, { "epoch": 1.108932995249337, "grad_norm": 1.4347851276397705, "learning_rate": 6.305260212646893e-07, "loss": 0.1134, "step": 47560 }, { "epoch": 1.1091661566261548, "grad_norm": 1.2583215236663818, "learning_rate": 6.304482994466206e-07, "loss": 0.1103, "step": 47570 }, { "epoch": 1.1093993180029729, "grad_norm": 1.8036423921585083, "learning_rate": 6.303705776285518e-07, "loss": 0.1165, "step": 47580 }, { "epoch": 1.1096324793797907, "grad_norm": 1.42302668094635, "learning_rate": 6.302928558104831e-07, "loss": 0.1172, "step": 47590 }, { "epoch": 1.1098656407566088, "grad_norm": 1.7357391119003296, "learning_rate": 6.302151339924144e-07, "loss": 0.1139, "step": 47600 }, { "epoch": 1.1100988021334266, "grad_norm": 1.131770372390747, "learning_rate": 6.301374121743456e-07, "loss": 0.1216, "step": 47610 }, { "epoch": 1.1103319635102444, "grad_norm": 1.268467664718628, "learning_rate": 6.300596903562768e-07, "loss": 0.1105, "step": 47620 }, { "epoch": 1.1105651248870625, "grad_norm": 1.4227375984191895, "learning_rate": 6.29981968538208e-07, "loss": 0.1192, "step": 47630 }, { "epoch": 1.1107982862638803, "grad_norm": 2.386364221572876, "learning_rate": 6.299042467201392e-07, "loss": 0.1144, "step": 47640 }, { "epoch": 1.1110314476406984, "grad_norm": 1.5645095109939575, "learning_rate": 6.298265249020705e-07, "loss": 0.1131, "step": 47650 }, { "epoch": 1.1112646090175162, "grad_norm": 1.375757098197937, "learning_rate": 6.297488030840017e-07, "loss": 0.1229, "step": 47660 }, { 
"epoch": 1.111497770394334, "grad_norm": 1.2659454345703125, "learning_rate": 6.29671081265933e-07, "loss": 0.1093, "step": 47670 }, { "epoch": 1.1117309317711521, "grad_norm": 2.6707165241241455, "learning_rate": 6.295933594478641e-07, "loss": 0.1189, "step": 47680 }, { "epoch": 1.11196409314797, "grad_norm": 1.2587796449661255, "learning_rate": 6.295156376297954e-07, "loss": 0.1099, "step": 47690 }, { "epoch": 1.112197254524788, "grad_norm": 2.906158208847046, "learning_rate": 6.294379158117267e-07, "loss": 0.1091, "step": 47700 }, { "epoch": 1.1124304159016059, "grad_norm": 1.7899713516235352, "learning_rate": 6.293601939936579e-07, "loss": 0.0996, "step": 47710 }, { "epoch": 1.112663577278424, "grad_norm": 1.2930529117584229, "learning_rate": 6.292824721755891e-07, "loss": 0.1189, "step": 47720 }, { "epoch": 1.1128967386552417, "grad_norm": 2.7330117225646973, "learning_rate": 6.292047503575203e-07, "loss": 0.1199, "step": 47730 }, { "epoch": 1.1131299000320598, "grad_norm": 2.4486453533172607, "learning_rate": 6.291270285394515e-07, "loss": 0.1139, "step": 47740 }, { "epoch": 1.1133630614088776, "grad_norm": 2.1499507427215576, "learning_rate": 6.290493067213828e-07, "loss": 0.1126, "step": 47750 }, { "epoch": 1.1135962227856955, "grad_norm": 1.8576350212097168, "learning_rate": 6.28971584903314e-07, "loss": 0.1066, "step": 47760 }, { "epoch": 1.1138293841625135, "grad_norm": 1.9916070699691772, "learning_rate": 6.288938630852453e-07, "loss": 0.1176, "step": 47770 }, { "epoch": 1.1140625455393314, "grad_norm": 1.3093727827072144, "learning_rate": 6.288161412671766e-07, "loss": 0.1105, "step": 47780 }, { "epoch": 1.1142957069161494, "grad_norm": 1.918686032295227, "learning_rate": 6.287384194491078e-07, "loss": 0.1184, "step": 47790 }, { "epoch": 1.1145288682929673, "grad_norm": 2.7663960456848145, "learning_rate": 6.286606976310389e-07, "loss": 0.1255, "step": 47800 }, { "epoch": 1.114762029669785, "grad_norm": 2.248660087585449, "learning_rate": 
6.285829758129701e-07, "loss": 0.1116, "step": 47810 }, { "epoch": 1.1149951910466032, "grad_norm": 2.0423102378845215, "learning_rate": 6.285052539949014e-07, "loss": 0.1196, "step": 47820 }, { "epoch": 1.115228352423421, "grad_norm": 1.2828938961029053, "learning_rate": 6.284275321768327e-07, "loss": 0.1142, "step": 47830 }, { "epoch": 1.115461513800239, "grad_norm": 2.309253692626953, "learning_rate": 6.283498103587639e-07, "loss": 0.1007, "step": 47840 }, { "epoch": 1.115694675177057, "grad_norm": 1.2803577184677124, "learning_rate": 6.282720885406952e-07, "loss": 0.1142, "step": 47850 }, { "epoch": 1.115927836553875, "grad_norm": 1.513163447380066, "learning_rate": 6.281943667226263e-07, "loss": 0.1077, "step": 47860 }, { "epoch": 1.1161609979306928, "grad_norm": 2.4813413619995117, "learning_rate": 6.281166449045576e-07, "loss": 0.1046, "step": 47870 }, { "epoch": 1.1163941593075106, "grad_norm": 1.2054983377456665, "learning_rate": 6.280389230864887e-07, "loss": 0.1123, "step": 47880 }, { "epoch": 1.1166273206843287, "grad_norm": 4.157181262969971, "learning_rate": 6.2796120126842e-07, "loss": 0.1161, "step": 47890 }, { "epoch": 1.1168604820611465, "grad_norm": 1.1089106798171997, "learning_rate": 6.278834794503513e-07, "loss": 0.0988, "step": 47900 }, { "epoch": 1.1170936434379646, "grad_norm": 4.084828853607178, "learning_rate": 6.278057576322825e-07, "loss": 0.1225, "step": 47910 }, { "epoch": 1.1173268048147824, "grad_norm": 1.769121527671814, "learning_rate": 6.277280358142137e-07, "loss": 0.1154, "step": 47920 }, { "epoch": 1.1175599661916005, "grad_norm": 1.5021061897277832, "learning_rate": 6.27650313996145e-07, "loss": 0.1093, "step": 47930 }, { "epoch": 1.1177931275684183, "grad_norm": 1.665950059890747, "learning_rate": 6.275725921780762e-07, "loss": 0.1195, "step": 47940 }, { "epoch": 1.1180262889452361, "grad_norm": 1.25807523727417, "learning_rate": 6.274948703600075e-07, "loss": 0.1079, "step": 47950 }, { "epoch": 1.1182594503220542, 
"grad_norm": 2.582106590270996, "learning_rate": 6.274171485419386e-07, "loss": 0.1133, "step": 47960 }, { "epoch": 1.118492611698872, "grad_norm": 1.7901932001113892, "learning_rate": 6.273394267238699e-07, "loss": 0.1115, "step": 47970 }, { "epoch": 1.11872577307569, "grad_norm": 1.627585768699646, "learning_rate": 6.272617049058011e-07, "loss": 0.1145, "step": 47980 }, { "epoch": 1.118958934452508, "grad_norm": 1.8215407133102417, "learning_rate": 6.271839830877323e-07, "loss": 0.1074, "step": 47990 }, { "epoch": 1.1191920958293258, "grad_norm": 1.5103257894515991, "learning_rate": 6.271062612696636e-07, "loss": 0.1016, "step": 48000 }, { "epoch": 1.1194252572061438, "grad_norm": 2.4047422409057617, "learning_rate": 6.270285394515948e-07, "loss": 0.1181, "step": 48010 }, { "epoch": 1.1196584185829617, "grad_norm": 1.3751415014266968, "learning_rate": 6.269508176335261e-07, "loss": 0.1191, "step": 48020 }, { "epoch": 1.1198915799597797, "grad_norm": 3.9641408920288086, "learning_rate": 6.268730958154574e-07, "loss": 0.1213, "step": 48030 }, { "epoch": 1.1201247413365976, "grad_norm": 1.836718201637268, "learning_rate": 6.267953739973884e-07, "loss": 0.1076, "step": 48040 }, { "epoch": 1.1203579027134156, "grad_norm": 2.3597588539123535, "learning_rate": 6.267176521793197e-07, "loss": 0.1049, "step": 48050 }, { "epoch": 1.1205910640902335, "grad_norm": 1.41895592212677, "learning_rate": 6.266399303612509e-07, "loss": 0.1119, "step": 48060 }, { "epoch": 1.1208242254670513, "grad_norm": 1.6464312076568604, "learning_rate": 6.265622085431822e-07, "loss": 0.1035, "step": 48070 }, { "epoch": 1.1210573868438694, "grad_norm": 1.1407718658447266, "learning_rate": 6.264844867251135e-07, "loss": 0.1086, "step": 48080 }, { "epoch": 1.1212905482206872, "grad_norm": 2.608006715774536, "learning_rate": 6.264067649070447e-07, "loss": 0.1114, "step": 48090 }, { "epoch": 1.1215237095975052, "grad_norm": 1.8852388858795166, "learning_rate": 6.26329043088976e-07, "loss": 0.1132, 
"step": 48100 }, { "epoch": 1.121756870974323, "grad_norm": 1.4008375406265259, "learning_rate": 6.262513212709071e-07, "loss": 0.1177, "step": 48110 }, { "epoch": 1.1219900323511411, "grad_norm": 1.0749149322509766, "learning_rate": 6.261735994528383e-07, "loss": 0.107, "step": 48120 }, { "epoch": 1.122223193727959, "grad_norm": 3.2539079189300537, "learning_rate": 6.260958776347696e-07, "loss": 0.1118, "step": 48130 }, { "epoch": 1.1224563551047768, "grad_norm": 1.5908863544464111, "learning_rate": 6.260181558167008e-07, "loss": 0.1172, "step": 48140 }, { "epoch": 1.1226895164815949, "grad_norm": 1.4602580070495605, "learning_rate": 6.259404339986321e-07, "loss": 0.1088, "step": 48150 }, { "epoch": 1.1229226778584127, "grad_norm": 2.48707914352417, "learning_rate": 6.258627121805633e-07, "loss": 0.1041, "step": 48160 }, { "epoch": 1.1231558392352308, "grad_norm": 2.5406932830810547, "learning_rate": 6.257849903624945e-07, "loss": 0.1116, "step": 48170 }, { "epoch": 1.1233890006120486, "grad_norm": 1.7622593641281128, "learning_rate": 6.257072685444258e-07, "loss": 0.1078, "step": 48180 }, { "epoch": 1.1236221619888664, "grad_norm": 1.4021426439285278, "learning_rate": 6.25629546726357e-07, "loss": 0.1167, "step": 48190 }, { "epoch": 1.1238553233656845, "grad_norm": 1.776004433631897, "learning_rate": 6.255518249082882e-07, "loss": 0.1062, "step": 48200 }, { "epoch": 1.1240884847425023, "grad_norm": 1.1964572668075562, "learning_rate": 6.254741030902194e-07, "loss": 0.116, "step": 48210 }, { "epoch": 1.1243216461193204, "grad_norm": 1.1738243103027344, "learning_rate": 6.253963812721507e-07, "loss": 0.1097, "step": 48220 }, { "epoch": 1.1245548074961382, "grad_norm": 1.174342393875122, "learning_rate": 6.253186594540819e-07, "loss": 0.1085, "step": 48230 }, { "epoch": 1.1247879688729563, "grad_norm": 1.8129422664642334, "learning_rate": 6.252409376360131e-07, "loss": 0.1136, "step": 48240 }, { "epoch": 1.1250211302497741, "grad_norm": 1.3509891033172607, 
"learning_rate": 6.251632158179444e-07, "loss": 0.1147, "step": 48250 }, { "epoch": 1.125254291626592, "grad_norm": 1.406446099281311, "learning_rate": 6.250854939998757e-07, "loss": 0.1128, "step": 48260 }, { "epoch": 1.12548745300341, "grad_norm": 1.7928537130355835, "learning_rate": 6.250077721818069e-07, "loss": 0.1184, "step": 48270 }, { "epoch": 1.1257206143802279, "grad_norm": 1.1249737739562988, "learning_rate": 6.249300503637381e-07, "loss": 0.0985, "step": 48280 }, { "epoch": 1.125953775757046, "grad_norm": 2.3505728244781494, "learning_rate": 6.248523285456692e-07, "loss": 0.109, "step": 48290 }, { "epoch": 1.1261869371338638, "grad_norm": 1.6565990447998047, "learning_rate": 6.247746067276005e-07, "loss": 0.119, "step": 48300 }, { "epoch": 1.1264200985106818, "grad_norm": 1.1860146522521973, "learning_rate": 6.246968849095318e-07, "loss": 0.1014, "step": 48310 }, { "epoch": 1.1266532598874996, "grad_norm": 1.2171682119369507, "learning_rate": 6.24619163091463e-07, "loss": 0.0957, "step": 48320 }, { "epoch": 1.1268864212643175, "grad_norm": 1.293540120124817, "learning_rate": 6.245414412733943e-07, "loss": 0.1049, "step": 48330 }, { "epoch": 1.1271195826411355, "grad_norm": 1.5403443574905396, "learning_rate": 6.244637194553255e-07, "loss": 0.1094, "step": 48340 }, { "epoch": 1.1273527440179534, "grad_norm": 2.6439738273620605, "learning_rate": 6.243859976372567e-07, "loss": 0.1151, "step": 48350 }, { "epoch": 1.1275859053947714, "grad_norm": 1.724259614944458, "learning_rate": 6.243082758191879e-07, "loss": 0.1079, "step": 48360 }, { "epoch": 1.1278190667715893, "grad_norm": 1.0447484254837036, "learning_rate": 6.242305540011191e-07, "loss": 0.1075, "step": 48370 }, { "epoch": 1.128052228148407, "grad_norm": 1.7389048337936401, "learning_rate": 6.241528321830504e-07, "loss": 0.1156, "step": 48380 }, { "epoch": 1.1282853895252252, "grad_norm": 1.4013437032699585, "learning_rate": 6.240751103649816e-07, "loss": 0.1239, "step": 48390 }, { "epoch": 
1.128518550902043, "grad_norm": 1.8282002210617065, "learning_rate": 6.239973885469129e-07, "loss": 0.1062, "step": 48400 }, { "epoch": 1.128751712278861, "grad_norm": 1.0365517139434814, "learning_rate": 6.239196667288441e-07, "loss": 0.1292, "step": 48410 }, { "epoch": 1.128984873655679, "grad_norm": 1.5515130758285522, "learning_rate": 6.238419449107753e-07, "loss": 0.1191, "step": 48420 }, { "epoch": 1.129218035032497, "grad_norm": 2.266587495803833, "learning_rate": 6.237642230927066e-07, "loss": 0.1119, "step": 48430 }, { "epoch": 1.1294511964093148, "grad_norm": 1.1863751411437988, "learning_rate": 6.236865012746377e-07, "loss": 0.112, "step": 48440 }, { "epoch": 1.1296843577861326, "grad_norm": 1.47722327709198, "learning_rate": 6.23608779456569e-07, "loss": 0.1138, "step": 48450 }, { "epoch": 1.1299175191629507, "grad_norm": 2.0615060329437256, "learning_rate": 6.235310576385003e-07, "loss": 0.1129, "step": 48460 }, { "epoch": 1.1301506805397685, "grad_norm": 2.003758668899536, "learning_rate": 6.234533358204315e-07, "loss": 0.1123, "step": 48470 }, { "epoch": 1.1303838419165866, "grad_norm": 1.5508546829223633, "learning_rate": 6.233756140023627e-07, "loss": 0.1066, "step": 48480 }, { "epoch": 1.1306170032934044, "grad_norm": 2.30901837348938, "learning_rate": 6.232978921842939e-07, "loss": 0.0979, "step": 48490 }, { "epoch": 1.1308501646702225, "grad_norm": 1.6553243398666382, "learning_rate": 6.232201703662252e-07, "loss": 0.1063, "step": 48500 }, { "epoch": 1.1310833260470403, "grad_norm": 3.53181791305542, "learning_rate": 6.231424485481565e-07, "loss": 0.115, "step": 48510 }, { "epoch": 1.1313164874238582, "grad_norm": 2.2148067951202393, "learning_rate": 6.230647267300877e-07, "loss": 0.1134, "step": 48520 }, { "epoch": 1.1315496488006762, "grad_norm": 1.5544015169143677, "learning_rate": 6.229870049120189e-07, "loss": 0.1211, "step": 48530 }, { "epoch": 1.131782810177494, "grad_norm": 1.5932892560958862, "learning_rate": 6.2290928309395e-07, 
"loss": 0.1077, "step": 48540 }, { "epoch": 1.132015971554312, "grad_norm": 1.275418758392334, "learning_rate": 6.228315612758813e-07, "loss": 0.1022, "step": 48550 }, { "epoch": 1.13224913293113, "grad_norm": 1.8274344205856323, "learning_rate": 6.227538394578126e-07, "loss": 0.1176, "step": 48560 }, { "epoch": 1.1324822943079478, "grad_norm": 1.5946986675262451, "learning_rate": 6.226761176397438e-07, "loss": 0.118, "step": 48570 }, { "epoch": 1.1327154556847658, "grad_norm": 1.5809483528137207, "learning_rate": 6.225983958216751e-07, "loss": 0.1112, "step": 48580 }, { "epoch": 1.1329486170615837, "grad_norm": 2.1160836219787598, "learning_rate": 6.225206740036064e-07, "loss": 0.1164, "step": 48590 }, { "epoch": 1.1331817784384017, "grad_norm": 1.3476589918136597, "learning_rate": 6.224429521855375e-07, "loss": 0.1074, "step": 48600 }, { "epoch": 1.1334149398152196, "grad_norm": 1.7192704677581787, "learning_rate": 6.223652303674687e-07, "loss": 0.1103, "step": 48610 }, { "epoch": 1.1336481011920376, "grad_norm": 1.3801010847091675, "learning_rate": 6.222875085493999e-07, "loss": 0.1237, "step": 48620 }, { "epoch": 1.1338812625688555, "grad_norm": 1.8158683776855469, "learning_rate": 6.222097867313312e-07, "loss": 0.125, "step": 48630 }, { "epoch": 1.1341144239456735, "grad_norm": 1.079262375831604, "learning_rate": 6.221320649132624e-07, "loss": 0.1131, "step": 48640 }, { "epoch": 1.1343475853224914, "grad_norm": 1.4678881168365479, "learning_rate": 6.220543430951937e-07, "loss": 0.1302, "step": 48650 }, { "epoch": 1.1345807466993092, "grad_norm": 1.0942339897155762, "learning_rate": 6.219766212771249e-07, "loss": 0.1107, "step": 48660 }, { "epoch": 1.1348139080761273, "grad_norm": 1.1532868146896362, "learning_rate": 6.218988994590561e-07, "loss": 0.1089, "step": 48670 }, { "epoch": 1.135047069452945, "grad_norm": 1.3819187879562378, "learning_rate": 6.218211776409874e-07, "loss": 0.1129, "step": 48680 }, { "epoch": 1.1352802308297631, "grad_norm": 
1.2728863954544067, "learning_rate": 6.217434558229185e-07, "loss": 0.1175, "step": 48690 }, { "epoch": 1.135513392206581, "grad_norm": 1.2878528833389282, "learning_rate": 6.216657340048498e-07, "loss": 0.1189, "step": 48700 }, { "epoch": 1.1357465535833988, "grad_norm": 2.1669912338256836, "learning_rate": 6.215880121867811e-07, "loss": 0.1212, "step": 48710 }, { "epoch": 1.1359797149602169, "grad_norm": 1.2052676677703857, "learning_rate": 6.215102903687122e-07, "loss": 0.1098, "step": 48720 }, { "epoch": 1.1362128763370347, "grad_norm": 2.078897714614868, "learning_rate": 6.214325685506435e-07, "loss": 0.1082, "step": 48730 }, { "epoch": 1.1364460377138528, "grad_norm": 1.2666383981704712, "learning_rate": 6.213548467325748e-07, "loss": 0.106, "step": 48740 }, { "epoch": 1.1366791990906706, "grad_norm": 1.6519172191619873, "learning_rate": 6.21277124914506e-07, "loss": 0.1066, "step": 48750 }, { "epoch": 1.1369123604674884, "grad_norm": 1.222792148590088, "learning_rate": 6.211994030964373e-07, "loss": 0.1033, "step": 48760 }, { "epoch": 1.1371455218443065, "grad_norm": 2.3654963970184326, "learning_rate": 6.211216812783684e-07, "loss": 0.1191, "step": 48770 }, { "epoch": 1.1373786832211243, "grad_norm": 1.430993676185608, "learning_rate": 6.210439594602996e-07, "loss": 0.1161, "step": 48780 }, { "epoch": 1.1376118445979424, "grad_norm": 2.121284246444702, "learning_rate": 6.209662376422309e-07, "loss": 0.1209, "step": 48790 }, { "epoch": 1.1378450059747602, "grad_norm": 1.6655045747756958, "learning_rate": 6.208885158241621e-07, "loss": 0.1186, "step": 48800 }, { "epoch": 1.1380781673515783, "grad_norm": 2.549365997314453, "learning_rate": 6.208107940060934e-07, "loss": 0.1202, "step": 48810 }, { "epoch": 1.1383113287283961, "grad_norm": 1.5270990133285522, "learning_rate": 6.207330721880246e-07, "loss": 0.1157, "step": 48820 }, { "epoch": 1.1385444901052142, "grad_norm": 1.2994110584259033, "learning_rate": 6.206553503699559e-07, "loss": 0.1045, "step": 48830 
}, { "epoch": 1.138777651482032, "grad_norm": 2.0812718868255615, "learning_rate": 6.205776285518872e-07, "loss": 0.1074, "step": 48840 }, { "epoch": 1.1390108128588499, "grad_norm": 1.426590919494629, "learning_rate": 6.204999067338182e-07, "loss": 0.1162, "step": 48850 }, { "epoch": 1.139243974235668, "grad_norm": 2.1445469856262207, "learning_rate": 6.204221849157495e-07, "loss": 0.1171, "step": 48860 }, { "epoch": 1.1394771356124858, "grad_norm": 1.4203828573226929, "learning_rate": 6.203444630976807e-07, "loss": 0.1137, "step": 48870 }, { "epoch": 1.1397102969893038, "grad_norm": 1.466369867324829, "learning_rate": 6.20266741279612e-07, "loss": 0.1046, "step": 48880 }, { "epoch": 1.1399434583661217, "grad_norm": 2.049186944961548, "learning_rate": 6.201890194615433e-07, "loss": 0.1045, "step": 48890 }, { "epoch": 1.1401766197429395, "grad_norm": 2.2695412635803223, "learning_rate": 6.201112976434745e-07, "loss": 0.1208, "step": 48900 }, { "epoch": 1.1404097811197575, "grad_norm": 1.4139792919158936, "learning_rate": 6.200335758254057e-07, "loss": 0.1089, "step": 48910 }, { "epoch": 1.1406429424965754, "grad_norm": 2.3291561603546143, "learning_rate": 6.199558540073369e-07, "loss": 0.112, "step": 48920 }, { "epoch": 1.1408761038733934, "grad_norm": 2.1379005908966064, "learning_rate": 6.198781321892681e-07, "loss": 0.1193, "step": 48930 }, { "epoch": 1.1411092652502113, "grad_norm": 1.8302475214004517, "learning_rate": 6.198004103711994e-07, "loss": 0.1123, "step": 48940 }, { "epoch": 1.1413424266270291, "grad_norm": 2.850820541381836, "learning_rate": 6.197226885531306e-07, "loss": 0.1142, "step": 48950 }, { "epoch": 1.1415755880038472, "grad_norm": 1.6644577980041504, "learning_rate": 6.196449667350619e-07, "loss": 0.1219, "step": 48960 }, { "epoch": 1.141808749380665, "grad_norm": 2.216641664505005, "learning_rate": 6.19567244916993e-07, "loss": 0.1085, "step": 48970 }, { "epoch": 1.142041910757483, "grad_norm": 2.3906643390655518, "learning_rate": 
6.194895230989243e-07, "loss": 0.1215, "step": 48980 }, { "epoch": 1.142275072134301, "grad_norm": 1.525431513786316, "learning_rate": 6.194118012808556e-07, "loss": 0.1084, "step": 48990 }, { "epoch": 1.142508233511119, "grad_norm": 1.5411564111709595, "learning_rate": 6.193340794627868e-07, "loss": 0.1093, "step": 49000 }, { "epoch": 1.1427413948879368, "grad_norm": 1.3506604433059692, "learning_rate": 6.19256357644718e-07, "loss": 0.1043, "step": 49010 }, { "epoch": 1.1429745562647549, "grad_norm": 1.7251721620559692, "learning_rate": 6.191864080084561e-07, "loss": 0.1129, "step": 49020 }, { "epoch": 1.1432077176415727, "grad_norm": 1.9148873090744019, "learning_rate": 6.191086861903874e-07, "loss": 0.106, "step": 49030 }, { "epoch": 1.1434408790183905, "grad_norm": 1.2697923183441162, "learning_rate": 6.190309643723186e-07, "loss": 0.1101, "step": 49040 }, { "epoch": 1.1436740403952086, "grad_norm": 1.5297038555145264, "learning_rate": 6.189532425542499e-07, "loss": 0.105, "step": 49050 }, { "epoch": 1.1439072017720264, "grad_norm": 5.292664051055908, "learning_rate": 6.188755207361809e-07, "loss": 0.1212, "step": 49060 }, { "epoch": 1.1441403631488445, "grad_norm": 1.5182126760482788, "learning_rate": 6.187977989181122e-07, "loss": 0.1085, "step": 49070 }, { "epoch": 1.1443735245256623, "grad_norm": 1.3310091495513916, "learning_rate": 6.187200771000435e-07, "loss": 0.1166, "step": 49080 }, { "epoch": 1.1446066859024802, "grad_norm": 1.1979836225509644, "learning_rate": 6.186423552819747e-07, "loss": 0.1185, "step": 49090 }, { "epoch": 1.1448398472792982, "grad_norm": 1.0798845291137695, "learning_rate": 6.18564633463906e-07, "loss": 0.1109, "step": 49100 }, { "epoch": 1.145073008656116, "grad_norm": 1.141135334968567, "learning_rate": 6.184869116458373e-07, "loss": 0.114, "step": 49110 }, { "epoch": 1.1453061700329341, "grad_norm": 1.4418801069259644, "learning_rate": 6.184091898277684e-07, "loss": 0.1176, "step": 49120 }, { "epoch": 1.145539331409752, 
"grad_norm": 1.3034225702285767, "learning_rate": 6.183314680096997e-07, "loss": 0.1161, "step": 49130 }, { "epoch": 1.1457724927865698, "grad_norm": 1.9224680662155151, "learning_rate": 6.182537461916308e-07, "loss": 0.1258, "step": 49140 }, { "epoch": 1.1460056541633878, "grad_norm": 2.2834346294403076, "learning_rate": 6.181760243735621e-07, "loss": 0.1055, "step": 49150 }, { "epoch": 1.1462388155402057, "grad_norm": 1.4414925575256348, "learning_rate": 6.180983025554934e-07, "loss": 0.1213, "step": 49160 }, { "epoch": 1.1464719769170237, "grad_norm": 1.6857340335845947, "learning_rate": 6.180205807374246e-07, "loss": 0.1043, "step": 49170 }, { "epoch": 1.1467051382938416, "grad_norm": 2.9385883808135986, "learning_rate": 6.179428589193558e-07, "loss": 0.1093, "step": 49180 }, { "epoch": 1.1469382996706596, "grad_norm": 1.5983537435531616, "learning_rate": 6.17865137101287e-07, "loss": 0.1069, "step": 49190 }, { "epoch": 1.1471714610474775, "grad_norm": 2.1412179470062256, "learning_rate": 6.177874152832183e-07, "loss": 0.1205, "step": 49200 }, { "epoch": 1.1474046224242955, "grad_norm": 1.203059196472168, "learning_rate": 6.177096934651496e-07, "loss": 0.1112, "step": 49210 }, { "epoch": 1.1476377838011134, "grad_norm": 1.5632729530334473, "learning_rate": 6.176319716470807e-07, "loss": 0.117, "step": 49220 }, { "epoch": 1.1478709451779312, "grad_norm": 1.9523890018463135, "learning_rate": 6.17554249829012e-07, "loss": 0.1039, "step": 49230 }, { "epoch": 1.1481041065547493, "grad_norm": 1.5694650411605835, "learning_rate": 6.174765280109431e-07, "loss": 0.1026, "step": 49240 }, { "epoch": 1.148337267931567, "grad_norm": 1.3347225189208984, "learning_rate": 6.173988061928744e-07, "loss": 0.1222, "step": 49250 }, { "epoch": 1.1485704293083852, "grad_norm": 1.6760644912719727, "learning_rate": 6.173210843748057e-07, "loss": 0.1125, "step": 49260 }, { "epoch": 1.148803590685203, "grad_norm": 2.160133123397827, "learning_rate": 6.172433625567369e-07, "loss": 0.1092, 
"step": 49270 }, { "epoch": 1.1490367520620208, "grad_norm": 1.3003133535385132, "learning_rate": 6.171656407386682e-07, "loss": 0.1014, "step": 49280 }, { "epoch": 1.1492699134388389, "grad_norm": 1.7198657989501953, "learning_rate": 6.170879189205994e-07, "loss": 0.1056, "step": 49290 }, { "epoch": 1.1495030748156567, "grad_norm": 3.3628017902374268, "learning_rate": 6.170101971025305e-07, "loss": 0.1112, "step": 49300 }, { "epoch": 1.1497362361924748, "grad_norm": 1.7662147283554077, "learning_rate": 6.169324752844618e-07, "loss": 0.1125, "step": 49310 }, { "epoch": 1.1499693975692926, "grad_norm": 2.741089105606079, "learning_rate": 6.16854753466393e-07, "loss": 0.0979, "step": 49320 }, { "epoch": 1.1502025589461105, "grad_norm": 2.62892746925354, "learning_rate": 6.167770316483243e-07, "loss": 0.1186, "step": 49330 }, { "epoch": 1.1504357203229285, "grad_norm": 2.2863872051239014, "learning_rate": 6.166993098302555e-07, "loss": 0.1155, "step": 49340 }, { "epoch": 1.1506688816997463, "grad_norm": 1.3244706392288208, "learning_rate": 6.166215880121868e-07, "loss": 0.1127, "step": 49350 }, { "epoch": 1.1509020430765644, "grad_norm": 2.387615919113159, "learning_rate": 6.165438661941181e-07, "loss": 0.1065, "step": 49360 }, { "epoch": 1.1511352044533822, "grad_norm": 2.427928924560547, "learning_rate": 6.164661443760492e-07, "loss": 0.1147, "step": 49370 }, { "epoch": 1.1513683658302003, "grad_norm": 1.6947325468063354, "learning_rate": 6.163884225579804e-07, "loss": 0.1012, "step": 49380 }, { "epoch": 1.1516015272070181, "grad_norm": 2.229458808898926, "learning_rate": 6.163107007399116e-07, "loss": 0.1176, "step": 49390 }, { "epoch": 1.1518346885838362, "grad_norm": 1.1952656507492065, "learning_rate": 6.162329789218429e-07, "loss": 0.1035, "step": 49400 }, { "epoch": 1.152067849960654, "grad_norm": 1.2956082820892334, "learning_rate": 6.161552571037742e-07, "loss": 0.1184, "step": 49410 }, { "epoch": 1.1523010113374719, "grad_norm": 1.7306565046310425, 
"learning_rate": 6.160775352857054e-07, "loss": 0.1165, "step": 49420 }, { "epoch": 1.15253417271429, "grad_norm": 3.6568644046783447, "learning_rate": 6.159998134676366e-07, "loss": 0.1236, "step": 49430 }, { "epoch": 1.1527673340911078, "grad_norm": 2.783514976501465, "learning_rate": 6.159220916495678e-07, "loss": 0.1124, "step": 49440 }, { "epoch": 1.1530004954679258, "grad_norm": 3.8458433151245117, "learning_rate": 6.158443698314991e-07, "loss": 0.1039, "step": 49450 }, { "epoch": 1.1532336568447437, "grad_norm": 3.9784128665924072, "learning_rate": 6.157666480134303e-07, "loss": 0.1118, "step": 49460 }, { "epoch": 1.1534668182215615, "grad_norm": 1.174857497215271, "learning_rate": 6.156889261953615e-07, "loss": 0.1102, "step": 49470 }, { "epoch": 1.1536999795983796, "grad_norm": 1.574774146080017, "learning_rate": 6.156112043772928e-07, "loss": 0.1079, "step": 49480 }, { "epoch": 1.1539331409751974, "grad_norm": 3.7778244018554688, "learning_rate": 6.15533482559224e-07, "loss": 0.1268, "step": 49490 }, { "epoch": 1.1541663023520154, "grad_norm": 1.5923808813095093, "learning_rate": 6.154557607411552e-07, "loss": 0.1205, "step": 49500 }, { "epoch": 1.1543994637288333, "grad_norm": 1.333211064338684, "learning_rate": 6.153780389230865e-07, "loss": 0.1042, "step": 49510 }, { "epoch": 1.1546326251056513, "grad_norm": 1.3574241399765015, "learning_rate": 6.153003171050177e-07, "loss": 0.1214, "step": 49520 }, { "epoch": 1.1548657864824692, "grad_norm": 1.3254891633987427, "learning_rate": 6.15222595286949e-07, "loss": 0.1181, "step": 49530 }, { "epoch": 1.155098947859287, "grad_norm": 1.4842267036437988, "learning_rate": 6.151448734688802e-07, "loss": 0.1036, "step": 49540 }, { "epoch": 1.155332109236105, "grad_norm": 0.944917619228363, "learning_rate": 6.150671516508113e-07, "loss": 0.1, "step": 49550 }, { "epoch": 1.155565270612923, "grad_norm": 1.74746572971344, "learning_rate": 6.149894298327426e-07, "loss": 0.1074, "step": 49560 }, { "epoch": 
1.155798431989741, "grad_norm": 1.2587436437606812, "learning_rate": 6.149117080146738e-07, "loss": 0.1027, "step": 49570 }, { "epoch": 1.1560315933665588, "grad_norm": 3.997040271759033, "learning_rate": 6.148339861966051e-07, "loss": 0.1191, "step": 49580 }, { "epoch": 1.1562647547433769, "grad_norm": 2.151353120803833, "learning_rate": 6.147562643785364e-07, "loss": 0.1124, "step": 49590 }, { "epoch": 1.1564979161201947, "grad_norm": 1.350037932395935, "learning_rate": 6.146785425604676e-07, "loss": 0.1173, "step": 49600 }, { "epoch": 1.1567310774970125, "grad_norm": 1.3717867136001587, "learning_rate": 6.146008207423988e-07, "loss": 0.1068, "step": 49610 }, { "epoch": 1.1569642388738306, "grad_norm": 1.651954174041748, "learning_rate": 6.145230989243299e-07, "loss": 0.1224, "step": 49620 }, { "epoch": 1.1571974002506484, "grad_norm": 2.017245054244995, "learning_rate": 6.144453771062612e-07, "loss": 0.1037, "step": 49630 }, { "epoch": 1.1574305616274665, "grad_norm": 1.2688733339309692, "learning_rate": 6.143676552881925e-07, "loss": 0.1125, "step": 49640 }, { "epoch": 1.1576637230042843, "grad_norm": 1.6340762376785278, "learning_rate": 6.142899334701237e-07, "loss": 0.118, "step": 49650 }, { "epoch": 1.1578968843811022, "grad_norm": 1.4513885974884033, "learning_rate": 6.14212211652055e-07, "loss": 0.1045, "step": 49660 }, { "epoch": 1.1581300457579202, "grad_norm": 1.5833888053894043, "learning_rate": 6.141344898339862e-07, "loss": 0.1265, "step": 49670 }, { "epoch": 1.158363207134738, "grad_norm": 1.2588869333267212, "learning_rate": 6.140567680159174e-07, "loss": 0.1154, "step": 49680 }, { "epoch": 1.1585963685115561, "grad_norm": 2.4696762561798096, "learning_rate": 6.139790461978487e-07, "loss": 0.112, "step": 49690 }, { "epoch": 1.158829529888374, "grad_norm": 1.7287124395370483, "learning_rate": 6.139013243797798e-07, "loss": 0.1055, "step": 49700 }, { "epoch": 1.159062691265192, "grad_norm": 2.5976462364196777, "learning_rate": 6.138236025617111e-07, 
"loss": 0.1159, "step": 49710 }, { "epoch": 1.1592958526420098, "grad_norm": 1.1239999532699585, "learning_rate": 6.137458807436423e-07, "loss": 0.1149, "step": 49720 }, { "epoch": 1.1595290140188277, "grad_norm": 2.0208911895751953, "learning_rate": 6.136681589255736e-07, "loss": 0.1223, "step": 49730 }, { "epoch": 1.1597621753956457, "grad_norm": 1.4385560750961304, "learning_rate": 6.135904371075048e-07, "loss": 0.106, "step": 49740 }, { "epoch": 1.1599953367724636, "grad_norm": 1.4267370700836182, "learning_rate": 6.13512715289436e-07, "loss": 0.1121, "step": 49750 }, { "epoch": 1.1602284981492816, "grad_norm": 1.6370168924331665, "learning_rate": 6.134349934713673e-07, "loss": 0.1215, "step": 49760 }, { "epoch": 1.1604616595260995, "grad_norm": 1.2770084142684937, "learning_rate": 6.133572716532985e-07, "loss": 0.1022, "step": 49770 }, { "epoch": 1.1606948209029175, "grad_norm": 1.1418417692184448, "learning_rate": 6.132795498352297e-07, "loss": 0.1019, "step": 49780 }, { "epoch": 1.1609279822797354, "grad_norm": 1.5854642391204834, "learning_rate": 6.13201828017161e-07, "loss": 0.1068, "step": 49790 }, { "epoch": 1.1611611436565532, "grad_norm": 1.9813286066055298, "learning_rate": 6.131241061990921e-07, "loss": 0.0994, "step": 49800 }, { "epoch": 1.1613943050333713, "grad_norm": 1.9735054969787598, "learning_rate": 6.130463843810234e-07, "loss": 0.1103, "step": 49810 }, { "epoch": 1.161627466410189, "grad_norm": 1.2204458713531494, "learning_rate": 6.129686625629546e-07, "loss": 0.1095, "step": 49820 }, { "epoch": 1.1618606277870072, "grad_norm": 2.8556835651397705, "learning_rate": 6.128909407448859e-07, "loss": 0.1212, "step": 49830 }, { "epoch": 1.162093789163825, "grad_norm": 1.7099010944366455, "learning_rate": 6.128132189268172e-07, "loss": 0.12, "step": 49840 }, { "epoch": 1.1623269505406428, "grad_norm": 3.4345481395721436, "learning_rate": 6.127354971087484e-07, "loss": 0.1149, "step": 49850 }, { "epoch": 1.162560111917461, "grad_norm": 
1.810557246208191, "learning_rate": 6.126577752906795e-07, "loss": 0.112, "step": 49860 }, { "epoch": 1.1627932732942787, "grad_norm": 1.9952526092529297, "learning_rate": 6.125800534726107e-07, "loss": 0.1022, "step": 49870 }, { "epoch": 1.1630264346710968, "grad_norm": 2.3808670043945312, "learning_rate": 6.12502331654542e-07, "loss": 0.1135, "step": 49880 }, { "epoch": 1.1632595960479146, "grad_norm": 1.4612581729888916, "learning_rate": 6.124246098364733e-07, "loss": 0.1293, "step": 49890 }, { "epoch": 1.1634927574247327, "grad_norm": 1.3273770809173584, "learning_rate": 6.123468880184045e-07, "loss": 0.1107, "step": 49900 }, { "epoch": 1.1637259188015505, "grad_norm": 1.3869067430496216, "learning_rate": 6.122691662003358e-07, "loss": 0.1124, "step": 49910 }, { "epoch": 1.1639590801783686, "grad_norm": 1.2107607126235962, "learning_rate": 6.12191444382267e-07, "loss": 0.1154, "step": 49920 }, { "epoch": 1.1641922415551864, "grad_norm": 1.8569689989089966, "learning_rate": 6.121137225641982e-07, "loss": 0.096, "step": 49930 }, { "epoch": 1.1644254029320042, "grad_norm": 1.762398362159729, "learning_rate": 6.120360007461294e-07, "loss": 0.1171, "step": 49940 }, { "epoch": 1.1646585643088223, "grad_norm": 2.4644641876220703, "learning_rate": 6.119582789280606e-07, "loss": 0.1198, "step": 49950 }, { "epoch": 1.1648917256856401, "grad_norm": 2.0010697841644287, "learning_rate": 6.118805571099919e-07, "loss": 0.113, "step": 49960 }, { "epoch": 1.1651248870624582, "grad_norm": 1.3869497776031494, "learning_rate": 6.118028352919232e-07, "loss": 0.1142, "step": 49970 }, { "epoch": 1.165358048439276, "grad_norm": 2.1993277072906494, "learning_rate": 6.117251134738543e-07, "loss": 0.1085, "step": 49980 }, { "epoch": 1.1655912098160939, "grad_norm": 1.6999784708023071, "learning_rate": 6.116473916557856e-07, "loss": 0.1205, "step": 49990 }, { "epoch": 1.165824371192912, "grad_norm": 1.825229525566101, "learning_rate": 6.115696698377168e-07, "loss": 0.1148, "step": 50000 
}, { "epoch": 1.165824371192912, "eval_accuracy": 0.9468806798993179, "eval_f1": 0.9620986206337224, "eval_loss": 0.13716034591197968, "eval_runtime": 4514.163, "eval_samples_per_second": 405.373, "eval_steps_per_second": 50.672, "step": 50000 }, { "epoch": 1.1660575325697298, "grad_norm": 1.498810887336731, "learning_rate": 6.114919480196481e-07, "loss": 0.1117, "step": 50010 }, { "epoch": 1.1662906939465478, "grad_norm": 1.3721061944961548, "learning_rate": 6.114142262015793e-07, "loss": 0.1143, "step": 50020 }, { "epoch": 1.1665238553233657, "grad_norm": 2.5547127723693848, "learning_rate": 6.113365043835105e-07, "loss": 0.1118, "step": 50030 }, { "epoch": 1.1667570167001835, "grad_norm": 1.1354044675827026, "learning_rate": 6.112587825654417e-07, "loss": 0.1167, "step": 50040 }, { "epoch": 1.1669901780770016, "grad_norm": 1.5293611288070679, "learning_rate": 6.111810607473729e-07, "loss": 0.1135, "step": 50050 }, { "epoch": 1.1672233394538194, "grad_norm": 1.3992260694503784, "learning_rate": 6.111033389293042e-07, "loss": 0.1173, "step": 50060 }, { "epoch": 1.1674565008306375, "grad_norm": 1.3662559986114502, "learning_rate": 6.110256171112355e-07, "loss": 0.1032, "step": 50070 }, { "epoch": 1.1676896622074553, "grad_norm": 1.7529782056808472, "learning_rate": 6.109478952931667e-07, "loss": 0.1172, "step": 50080 }, { "epoch": 1.1679228235842734, "grad_norm": 2.3285622596740723, "learning_rate": 6.10870173475098e-07, "loss": 0.1051, "step": 50090 }, { "epoch": 1.1681559849610912, "grad_norm": 1.4998937845230103, "learning_rate": 6.10792451657029e-07, "loss": 0.1125, "step": 50100 }, { "epoch": 1.1683891463379092, "grad_norm": 1.9963796138763428, "learning_rate": 6.107147298389603e-07, "loss": 0.1019, "step": 50110 }, { "epoch": 1.168622307714727, "grad_norm": 1.411171793937683, "learning_rate": 6.106370080208916e-07, "loss": 0.1194, "step": 50120 }, { "epoch": 1.168855469091545, "grad_norm": 3.600179433822632, "learning_rate": 6.105592862028228e-07, "loss": 
0.1248, "step": 50130 }, { "epoch": 1.169088630468363, "grad_norm": 4.104859352111816, "learning_rate": 6.104815643847541e-07, "loss": 0.1109, "step": 50140 }, { "epoch": 1.1693217918451808, "grad_norm": 1.4074666500091553, "learning_rate": 6.104038425666853e-07, "loss": 0.1142, "step": 50150 }, { "epoch": 1.1695549532219989, "grad_norm": 2.5803744792938232, "learning_rate": 6.103261207486166e-07, "loss": 0.1161, "step": 50160 }, { "epoch": 1.1697881145988167, "grad_norm": 2.8092215061187744, "learning_rate": 6.102483989305478e-07, "loss": 0.1157, "step": 50170 }, { "epoch": 1.1700212759756345, "grad_norm": 1.5509827136993408, "learning_rate": 6.101706771124789e-07, "loss": 0.1118, "step": 50180 }, { "epoch": 1.1702544373524526, "grad_norm": 1.7472001314163208, "learning_rate": 6.100929552944102e-07, "loss": 0.1163, "step": 50190 }, { "epoch": 1.1704875987292704, "grad_norm": 1.9868093729019165, "learning_rate": 6.100152334763414e-07, "loss": 0.1095, "step": 50200 }, { "epoch": 1.1707207601060885, "grad_norm": 1.5934562683105469, "learning_rate": 6.099375116582727e-07, "loss": 0.1215, "step": 50210 }, { "epoch": 1.1709539214829063, "grad_norm": 4.311050891876221, "learning_rate": 6.09859789840204e-07, "loss": 0.112, "step": 50220 }, { "epoch": 1.1711870828597242, "grad_norm": 1.2184914350509644, "learning_rate": 6.097820680221351e-07, "loss": 0.1156, "step": 50230 }, { "epoch": 1.1714202442365422, "grad_norm": 1.0928373336791992, "learning_rate": 6.097043462040664e-07, "loss": 0.1255, "step": 50240 }, { "epoch": 1.17165340561336, "grad_norm": 2.977290391921997, "learning_rate": 6.096266243859976e-07, "loss": 0.113, "step": 50250 }, { "epoch": 1.1718865669901781, "grad_norm": 1.9184948205947876, "learning_rate": 6.095489025679288e-07, "loss": 0.1151, "step": 50260 }, { "epoch": 1.172119728366996, "grad_norm": 2.6050240993499756, "learning_rate": 6.094711807498601e-07, "loss": 0.1038, "step": 50270 }, { "epoch": 1.172352889743814, "grad_norm": 1.3787033557891846, 
"learning_rate": 6.093934589317913e-07, "loss": 0.1126, "step": 50280 }, { "epoch": 1.1725860511206319, "grad_norm": 1.1239674091339111, "learning_rate": 6.093157371137225e-07, "loss": 0.1062, "step": 50290 }, { "epoch": 1.17281921249745, "grad_norm": 1.316489577293396, "learning_rate": 6.092380152956537e-07, "loss": 0.1136, "step": 50300 }, { "epoch": 1.1730523738742678, "grad_norm": 1.2372052669525146, "learning_rate": 6.09160293477585e-07, "loss": 0.1, "step": 50310 }, { "epoch": 1.1732855352510856, "grad_norm": 1.3897478580474854, "learning_rate": 6.090825716595163e-07, "loss": 0.1022, "step": 50320 }, { "epoch": 1.1735186966279036, "grad_norm": 1.2087078094482422, "learning_rate": 6.090048498414475e-07, "loss": 0.1178, "step": 50330 }, { "epoch": 1.1737518580047215, "grad_norm": 1.5395313501358032, "learning_rate": 6.089271280233787e-07, "loss": 0.1185, "step": 50340 }, { "epoch": 1.1739850193815395, "grad_norm": 2.0426623821258545, "learning_rate": 6.088494062053098e-07, "loss": 0.101, "step": 50350 }, { "epoch": 1.1742181807583574, "grad_norm": 1.7219206094741821, "learning_rate": 6.087716843872411e-07, "loss": 0.1091, "step": 50360 }, { "epoch": 1.1744513421351752, "grad_norm": 1.163806438446045, "learning_rate": 6.086939625691724e-07, "loss": 0.1189, "step": 50370 }, { "epoch": 1.1746845035119933, "grad_norm": 1.9878560304641724, "learning_rate": 6.086162407511036e-07, "loss": 0.1245, "step": 50380 }, { "epoch": 1.174917664888811, "grad_norm": 1.6800631284713745, "learning_rate": 6.085385189330349e-07, "loss": 0.1037, "step": 50390 }, { "epoch": 1.1751508262656292, "grad_norm": 1.8686786890029907, "learning_rate": 6.084607971149662e-07, "loss": 0.1051, "step": 50400 }, { "epoch": 1.175383987642447, "grad_norm": 3.20951247215271, "learning_rate": 6.083830752968973e-07, "loss": 0.1149, "step": 50410 }, { "epoch": 1.1756171490192648, "grad_norm": 1.4508394002914429, "learning_rate": 6.083053534788285e-07, "loss": 0.1069, "step": 50420 }, { "epoch": 
1.175850310396083, "grad_norm": 1.2478491067886353, "learning_rate": 6.082276316607597e-07, "loss": 0.117, "step": 50430 }, { "epoch": 1.1760834717729007, "grad_norm": 1.4580843448638916, "learning_rate": 6.08149909842691e-07, "loss": 0.1101, "step": 50440 }, { "epoch": 1.1763166331497188, "grad_norm": 1.6965125799179077, "learning_rate": 6.080721880246223e-07, "loss": 0.1084, "step": 50450 }, { "epoch": 1.1765497945265366, "grad_norm": 1.3406599760055542, "learning_rate": 6.079944662065535e-07, "loss": 0.1057, "step": 50460 }, { "epoch": 1.1767829559033547, "grad_norm": 1.1358182430267334, "learning_rate": 6.079167443884847e-07, "loss": 0.0985, "step": 50470 }, { "epoch": 1.1770161172801725, "grad_norm": 1.2471941709518433, "learning_rate": 6.078390225704159e-07, "loss": 0.1067, "step": 50480 }, { "epoch": 1.1772492786569906, "grad_norm": 3.5418386459350586, "learning_rate": 6.077613007523472e-07, "loss": 0.1156, "step": 50490 }, { "epoch": 1.1774824400338084, "grad_norm": 1.8524069786071777, "learning_rate": 6.076835789342784e-07, "loss": 0.1053, "step": 50500 }, { "epoch": 1.1777156014106263, "grad_norm": 1.208648443222046, "learning_rate": 6.076058571162096e-07, "loss": 0.1128, "step": 50510 }, { "epoch": 1.1779487627874443, "grad_norm": 2.3427207469940186, "learning_rate": 6.075281352981409e-07, "loss": 0.1274, "step": 50520 }, { "epoch": 1.1781819241642622, "grad_norm": 1.552996039390564, "learning_rate": 6.07450413480072e-07, "loss": 0.1151, "step": 50530 }, { "epoch": 1.1784150855410802, "grad_norm": 6.229344367980957, "learning_rate": 6.073726916620033e-07, "loss": 0.1042, "step": 50540 }, { "epoch": 1.178648246917898, "grad_norm": 1.0757346153259277, "learning_rate": 6.072949698439346e-07, "loss": 0.1046, "step": 50550 }, { "epoch": 1.1788814082947159, "grad_norm": 1.9616628885269165, "learning_rate": 6.072172480258658e-07, "loss": 0.1021, "step": 50560 }, { "epoch": 1.179114569671534, "grad_norm": 1.4948660135269165, "learning_rate": 
6.071395262077971e-07, "loss": 0.1091, "step": 50570 }, { "epoch": 1.1793477310483518, "grad_norm": 1.6807029247283936, "learning_rate": 6.070618043897282e-07, "loss": 0.1208, "step": 50580 }, { "epoch": 1.1795808924251698, "grad_norm": 1.1399580240249634, "learning_rate": 6.069840825716595e-07, "loss": 0.1107, "step": 50590 }, { "epoch": 1.1798140538019877, "grad_norm": 1.4608538150787354, "learning_rate": 6.069063607535907e-07, "loss": 0.1127, "step": 50600 }, { "epoch": 1.1800472151788055, "grad_norm": 1.8673533201217651, "learning_rate": 6.068286389355219e-07, "loss": 0.1153, "step": 50610 }, { "epoch": 1.1802803765556236, "grad_norm": 3.146599054336548, "learning_rate": 6.067509171174532e-07, "loss": 0.1144, "step": 50620 }, { "epoch": 1.1805135379324414, "grad_norm": 2.7680623531341553, "learning_rate": 6.066731952993844e-07, "loss": 0.0984, "step": 50630 }, { "epoch": 1.1807466993092595, "grad_norm": 1.6513553857803345, "learning_rate": 6.065954734813157e-07, "loss": 0.1116, "step": 50640 }, { "epoch": 1.1809798606860773, "grad_norm": 1.473517894744873, "learning_rate": 6.06517751663247e-07, "loss": 0.1185, "step": 50650 }, { "epoch": 1.1812130220628954, "grad_norm": 3.0922210216522217, "learning_rate": 6.06440029845178e-07, "loss": 0.1249, "step": 50660 }, { "epoch": 1.1814461834397132, "grad_norm": 2.1789395809173584, "learning_rate": 6.063623080271093e-07, "loss": 0.0996, "step": 50670 }, { "epoch": 1.1816793448165313, "grad_norm": 2.9236881732940674, "learning_rate": 6.062845862090405e-07, "loss": 0.1166, "step": 50680 }, { "epoch": 1.181912506193349, "grad_norm": 2.069641590118408, "learning_rate": 6.062068643909718e-07, "loss": 0.1102, "step": 50690 }, { "epoch": 1.182145667570167, "grad_norm": 1.3771089315414429, "learning_rate": 6.061291425729031e-07, "loss": 0.1162, "step": 50700 }, { "epoch": 1.182378828946985, "grad_norm": 1.0521526336669922, "learning_rate": 6.060514207548343e-07, "loss": 0.1169, "step": 50710 }, { "epoch": 1.1826119903238028, 
"grad_norm": 2.8988356590270996, "learning_rate": 6.059736989367655e-07, "loss": 0.1022, "step": 50720 }, { "epoch": 1.1828451517006209, "grad_norm": 1.2202682495117188, "learning_rate": 6.058959771186967e-07, "loss": 0.1133, "step": 50730 }, { "epoch": 1.1830783130774387, "grad_norm": 1.2609179019927979, "learning_rate": 6.058182553006279e-07, "loss": 0.1005, "step": 50740 }, { "epoch": 1.1833114744542566, "grad_norm": 1.2757641077041626, "learning_rate": 6.057405334825592e-07, "loss": 0.1043, "step": 50750 }, { "epoch": 1.1835446358310746, "grad_norm": 1.2773066759109497, "learning_rate": 6.056628116644904e-07, "loss": 0.1129, "step": 50760 }, { "epoch": 1.1837777972078924, "grad_norm": 1.3315703868865967, "learning_rate": 6.055850898464217e-07, "loss": 0.1113, "step": 50770 }, { "epoch": 1.1840109585847105, "grad_norm": 2.5108771324157715, "learning_rate": 6.055073680283528e-07, "loss": 0.1048, "step": 50780 }, { "epoch": 1.1842441199615283, "grad_norm": 1.3639826774597168, "learning_rate": 6.054296462102841e-07, "loss": 0.1043, "step": 50790 }, { "epoch": 1.1844772813383464, "grad_norm": 1.3135571479797363, "learning_rate": 6.053519243922154e-07, "loss": 0.1022, "step": 50800 }, { "epoch": 1.1847104427151642, "grad_norm": 1.698096513748169, "learning_rate": 6.052742025741466e-07, "loss": 0.1211, "step": 50810 }, { "epoch": 1.184943604091982, "grad_norm": 1.5615562200546265, "learning_rate": 6.051964807560778e-07, "loss": 0.1142, "step": 50820 }, { "epoch": 1.1851767654688001, "grad_norm": 1.19321870803833, "learning_rate": 6.051187589380091e-07, "loss": 0.1103, "step": 50830 }, { "epoch": 1.185409926845618, "grad_norm": 1.7488504648208618, "learning_rate": 6.050410371199402e-07, "loss": 0.121, "step": 50840 }, { "epoch": 1.185643088222436, "grad_norm": 1.2171710729599, "learning_rate": 6.049633153018715e-07, "loss": 0.112, "step": 50850 }, { "epoch": 1.1858762495992539, "grad_norm": 2.783168315887451, "learning_rate": 6.048855934838027e-07, "loss": 0.0997, 
"step": 50860 }, { "epoch": 1.186109410976072, "grad_norm": 3.758138656616211, "learning_rate": 6.04807871665734e-07, "loss": 0.1019, "step": 50870 }, { "epoch": 1.1863425723528898, "grad_norm": 1.5458990335464478, "learning_rate": 6.047301498476653e-07, "loss": 0.1185, "step": 50880 }, { "epoch": 1.1865757337297076, "grad_norm": 2.567002773284912, "learning_rate": 6.046524280295965e-07, "loss": 0.1155, "step": 50890 }, { "epoch": 1.1868088951065257, "grad_norm": 2.2204196453094482, "learning_rate": 6.045747062115276e-07, "loss": 0.1155, "step": 50900 }, { "epoch": 1.1870420564833435, "grad_norm": 1.1104930639266968, "learning_rate": 6.044969843934588e-07, "loss": 0.1103, "step": 50910 }, { "epoch": 1.1872752178601615, "grad_norm": 1.2635488510131836, "learning_rate": 6.044192625753901e-07, "loss": 0.1194, "step": 50920 }, { "epoch": 1.1875083792369794, "grad_norm": 3.2624192237854004, "learning_rate": 6.043415407573214e-07, "loss": 0.1079, "step": 50930 }, { "epoch": 1.1877415406137972, "grad_norm": 2.9343204498291016, "learning_rate": 6.042638189392526e-07, "loss": 0.1122, "step": 50940 }, { "epoch": 1.1879747019906153, "grad_norm": 1.168594241142273, "learning_rate": 6.041860971211839e-07, "loss": 0.1075, "step": 50950 }, { "epoch": 1.1882078633674331, "grad_norm": 1.42741060256958, "learning_rate": 6.041083753031151e-07, "loss": 0.1205, "step": 50960 }, { "epoch": 1.1884410247442512, "grad_norm": 1.9623924493789673, "learning_rate": 6.040306534850463e-07, "loss": 0.1123, "step": 50970 }, { "epoch": 1.188674186121069, "grad_norm": 1.668548345565796, "learning_rate": 6.039529316669775e-07, "loss": 0.116, "step": 50980 }, { "epoch": 1.188907347497887, "grad_norm": 1.5978784561157227, "learning_rate": 6.038752098489087e-07, "loss": 0.1106, "step": 50990 }, { "epoch": 1.189140508874705, "grad_norm": 1.134394884109497, "learning_rate": 6.0379748803084e-07, "loss": 0.1156, "step": 51000 }, { "epoch": 1.1893736702515227, "grad_norm": 1.6444967985153198, 
"learning_rate": 6.037197662127712e-07, "loss": 0.1097, "step": 51010 }, { "epoch": 1.1896068316283408, "grad_norm": 2.1607534885406494, "learning_rate": 6.036420443947025e-07, "loss": 0.1176, "step": 51020 }, { "epoch": 1.1898399930051586, "grad_norm": 1.5793830156326294, "learning_rate": 6.035643225766337e-07, "loss": 0.1025, "step": 51030 }, { "epoch": 1.1900731543819767, "grad_norm": 2.2563669681549072, "learning_rate": 6.034866007585649e-07, "loss": 0.1183, "step": 51040 }, { "epoch": 1.1903063157587945, "grad_norm": 1.733221173286438, "learning_rate": 6.034088789404962e-07, "loss": 0.0906, "step": 51050 }, { "epoch": 1.1905394771356126, "grad_norm": 1.537595510482788, "learning_rate": 6.033311571224273e-07, "loss": 0.1173, "step": 51060 }, { "epoch": 1.1907726385124304, "grad_norm": 1.3472354412078857, "learning_rate": 6.032534353043586e-07, "loss": 0.1063, "step": 51070 }, { "epoch": 1.1910057998892483, "grad_norm": 2.192552089691162, "learning_rate": 6.031757134862899e-07, "loss": 0.1165, "step": 51080 }, { "epoch": 1.1912389612660663, "grad_norm": 2.643568754196167, "learning_rate": 6.03097991668221e-07, "loss": 0.1124, "step": 51090 }, { "epoch": 1.1914721226428842, "grad_norm": 1.2684600353240967, "learning_rate": 6.030202698501523e-07, "loss": 0.1179, "step": 51100 }, { "epoch": 1.1917052840197022, "grad_norm": 2.648867607116699, "learning_rate": 6.029425480320835e-07, "loss": 0.1021, "step": 51110 }, { "epoch": 1.19193844539652, "grad_norm": 1.8272327184677124, "learning_rate": 6.028648262140148e-07, "loss": 0.118, "step": 51120 }, { "epoch": 1.1921716067733379, "grad_norm": 1.8567583560943604, "learning_rate": 6.027871043959461e-07, "loss": 0.1097, "step": 51130 }, { "epoch": 1.192404768150156, "grad_norm": 2.204904794692993, "learning_rate": 6.027093825778772e-07, "loss": 0.1089, "step": 51140 }, { "epoch": 1.1926379295269738, "grad_norm": 1.6979809999465942, "learning_rate": 6.026316607598084e-07, "loss": 0.1005, "step": 51150 }, { "epoch": 
1.1928710909037918, "grad_norm": 2.1998770236968994, "learning_rate": 6.025539389417396e-07, "loss": 0.1148, "step": 51160 }, { "epoch": 1.1931042522806097, "grad_norm": 2.0185139179229736, "learning_rate": 6.024762171236709e-07, "loss": 0.1004, "step": 51170 }, { "epoch": 1.1933374136574277, "grad_norm": 3.8382511138916016, "learning_rate": 6.023984953056022e-07, "loss": 0.1072, "step": 51180 }, { "epoch": 1.1935705750342456, "grad_norm": 2.0446038246154785, "learning_rate": 6.023207734875334e-07, "loss": 0.1141, "step": 51190 }, { "epoch": 1.1938037364110636, "grad_norm": 1.8602745532989502, "learning_rate": 6.022430516694647e-07, "loss": 0.1125, "step": 51200 }, { "epoch": 1.1940368977878815, "grad_norm": 1.8033568859100342, "learning_rate": 6.021653298513959e-07, "loss": 0.1123, "step": 51210 }, { "epoch": 1.1942700591646993, "grad_norm": 1.9314638376235962, "learning_rate": 6.020876080333271e-07, "loss": 0.118, "step": 51220 }, { "epoch": 1.1945032205415174, "grad_norm": 1.6125128269195557, "learning_rate": 6.020098862152583e-07, "loss": 0.1124, "step": 51230 }, { "epoch": 1.1947363819183352, "grad_norm": 1.323623776435852, "learning_rate": 6.019321643971895e-07, "loss": 0.1173, "step": 51240 }, { "epoch": 1.1949695432951533, "grad_norm": 1.433990478515625, "learning_rate": 6.018544425791208e-07, "loss": 0.1193, "step": 51250 }, { "epoch": 1.195202704671971, "grad_norm": 1.2417711019515991, "learning_rate": 6.017767207610521e-07, "loss": 0.107, "step": 51260 }, { "epoch": 1.195435866048789, "grad_norm": 2.6550416946411133, "learning_rate": 6.016989989429832e-07, "loss": 0.1111, "step": 51270 }, { "epoch": 1.195669027425607, "grad_norm": 2.0987584590911865, "learning_rate": 6.016212771249145e-07, "loss": 0.1111, "step": 51280 }, { "epoch": 1.1959021888024248, "grad_norm": 1.2966876029968262, "learning_rate": 6.015435553068457e-07, "loss": 0.1117, "step": 51290 }, { "epoch": 1.1961353501792429, "grad_norm": 1.185380220413208, "learning_rate": 
6.01465833488777e-07, "loss": 0.1035, "step": 51300 }, { "epoch": 1.1963685115560607, "grad_norm": 2.1864066123962402, "learning_rate": 6.013881116707082e-07, "loss": 0.1122, "step": 51310 }, { "epoch": 1.1966016729328786, "grad_norm": 1.5365244150161743, "learning_rate": 6.013103898526394e-07, "loss": 0.1026, "step": 51320 }, { "epoch": 1.1968348343096966, "grad_norm": 1.4485079050064087, "learning_rate": 6.012326680345706e-07, "loss": 0.1044, "step": 51330 }, { "epoch": 1.1970679956865145, "grad_norm": 1.3205069303512573, "learning_rate": 6.011549462165018e-07, "loss": 0.1196, "step": 51340 }, { "epoch": 1.1973011570633325, "grad_norm": 1.5435576438903809, "learning_rate": 6.010772243984331e-07, "loss": 0.1207, "step": 51350 }, { "epoch": 1.1975343184401503, "grad_norm": 1.6521122455596924, "learning_rate": 6.009995025803644e-07, "loss": 0.1161, "step": 51360 }, { "epoch": 1.1977674798169684, "grad_norm": 1.496517539024353, "learning_rate": 6.009217807622956e-07, "loss": 0.1012, "step": 51370 }, { "epoch": 1.1980006411937862, "grad_norm": 2.8846182823181152, "learning_rate": 6.008440589442269e-07, "loss": 0.1063, "step": 51380 }, { "epoch": 1.1982338025706043, "grad_norm": 4.066763401031494, "learning_rate": 6.007663371261579e-07, "loss": 0.116, "step": 51390 }, { "epoch": 1.1984669639474221, "grad_norm": 1.280965805053711, "learning_rate": 6.006886153080892e-07, "loss": 0.1119, "step": 51400 }, { "epoch": 1.19870012532424, "grad_norm": 1.278515100479126, "learning_rate": 6.006108934900205e-07, "loss": 0.1052, "step": 51410 }, { "epoch": 1.198933286701058, "grad_norm": 1.6007518768310547, "learning_rate": 6.005331716719517e-07, "loss": 0.1104, "step": 51420 }, { "epoch": 1.1991664480778759, "grad_norm": 1.5299015045166016, "learning_rate": 6.00455449853883e-07, "loss": 0.1029, "step": 51430 }, { "epoch": 1.199399609454694, "grad_norm": 1.5170460939407349, "learning_rate": 6.003777280358142e-07, "loss": 0.115, "step": 51440 }, { "epoch": 1.1996327708315118, 
"grad_norm": 1.5407319068908691, "learning_rate": 6.003000062177455e-07, "loss": 0.1182, "step": 51450 }, { "epoch": 1.1998659322083296, "grad_norm": 1.2002109289169312, "learning_rate": 6.002222843996767e-07, "loss": 0.1094, "step": 51460 }, { "epoch": 1.2000990935851477, "grad_norm": 1.4476854801177979, "learning_rate": 6.001445625816078e-07, "loss": 0.1101, "step": 51470 }, { "epoch": 1.2003322549619655, "grad_norm": 2.2050819396972656, "learning_rate": 6.000668407635391e-07, "loss": 0.1109, "step": 51480 }, { "epoch": 1.2005654163387836, "grad_norm": 1.6656180620193481, "learning_rate": 5.999891189454703e-07, "loss": 0.1134, "step": 51490 }, { "epoch": 1.2007985777156014, "grad_norm": 2.379924774169922, "learning_rate": 5.999113971274016e-07, "loss": 0.1059, "step": 51500 }, { "epoch": 1.2010317390924192, "grad_norm": 1.380813479423523, "learning_rate": 5.998336753093329e-07, "loss": 0.1138, "step": 51510 }, { "epoch": 1.2012649004692373, "grad_norm": 3.470552921295166, "learning_rate": 5.99755953491264e-07, "loss": 0.1184, "step": 51520 }, { "epoch": 1.2014980618460551, "grad_norm": 1.2814478874206543, "learning_rate": 5.996782316731953e-07, "loss": 0.1207, "step": 51530 }, { "epoch": 1.2017312232228732, "grad_norm": 1.7228069305419922, "learning_rate": 5.996005098551265e-07, "loss": 0.1064, "step": 51540 }, { "epoch": 1.201964384599691, "grad_norm": 1.5549379587173462, "learning_rate": 5.995227880370577e-07, "loss": 0.1098, "step": 51550 }, { "epoch": 1.202197545976509, "grad_norm": 1.7151658535003662, "learning_rate": 5.99445066218989e-07, "loss": 0.0946, "step": 51560 }, { "epoch": 1.202430707353327, "grad_norm": 2.021358013153076, "learning_rate": 5.993673444009202e-07, "loss": 0.1092, "step": 51570 }, { "epoch": 1.202663868730145, "grad_norm": 2.3276705741882324, "learning_rate": 5.992896225828514e-07, "loss": 0.119, "step": 51580 }, { "epoch": 1.2028970301069628, "grad_norm": 1.3223360776901245, "learning_rate": 5.992119007647826e-07, "loss": 0.101, 
"step": 51590 }, { "epoch": 1.2031301914837806, "grad_norm": 2.4450719356536865, "learning_rate": 5.991341789467139e-07, "loss": 0.1076, "step": 51600 }, { "epoch": 1.2033633528605987, "grad_norm": 1.6337857246398926, "learning_rate": 5.990564571286452e-07, "loss": 0.1008, "step": 51610 }, { "epoch": 1.2035965142374165, "grad_norm": 2.242722511291504, "learning_rate": 5.989787353105764e-07, "loss": 0.1076, "step": 51620 }, { "epoch": 1.2038296756142346, "grad_norm": 1.849930763244629, "learning_rate": 5.989010134925076e-07, "loss": 0.1101, "step": 51630 }, { "epoch": 1.2040628369910524, "grad_norm": 1.8820548057556152, "learning_rate": 5.988232916744387e-07, "loss": 0.1189, "step": 51640 }, { "epoch": 1.2042959983678703, "grad_norm": 4.6510515213012695, "learning_rate": 5.9874556985637e-07, "loss": 0.1073, "step": 51650 }, { "epoch": 1.2045291597446883, "grad_norm": 1.8083997964859009, "learning_rate": 5.986678480383013e-07, "loss": 0.1127, "step": 51660 }, { "epoch": 1.2047623211215062, "grad_norm": 3.644314765930176, "learning_rate": 5.985901262202325e-07, "loss": 0.1216, "step": 51670 }, { "epoch": 1.2049954824983242, "grad_norm": 1.4774576425552368, "learning_rate": 5.985124044021638e-07, "loss": 0.1076, "step": 51680 }, { "epoch": 1.205228643875142, "grad_norm": 1.8100700378417969, "learning_rate": 5.984346825840951e-07, "loss": 0.1122, "step": 51690 }, { "epoch": 1.20546180525196, "grad_norm": 1.3925071954727173, "learning_rate": 5.983569607660262e-07, "loss": 0.1064, "step": 51700 }, { "epoch": 1.205694966628778, "grad_norm": 1.0864845514297485, "learning_rate": 5.982792389479574e-07, "loss": 0.1036, "step": 51710 }, { "epoch": 1.2059281280055958, "grad_norm": 1.548910140991211, "learning_rate": 5.982015171298886e-07, "loss": 0.1068, "step": 51720 }, { "epoch": 1.2061612893824138, "grad_norm": 2.2098448276519775, "learning_rate": 5.981237953118199e-07, "loss": 0.115, "step": 51730 }, { "epoch": 1.2063944507592317, "grad_norm": 1.5883418321609497, 
"learning_rate": 5.980460734937512e-07, "loss": 0.1111, "step": 51740 }, { "epoch": 1.2066276121360497, "grad_norm": 1.9930392503738403, "learning_rate": 5.979683516756824e-07, "loss": 0.1118, "step": 51750 }, { "epoch": 1.2068607735128676, "grad_norm": 1.696752905845642, "learning_rate": 5.978906298576136e-07, "loss": 0.1096, "step": 51760 }, { "epoch": 1.2070939348896856, "grad_norm": 3.468144655227661, "learning_rate": 5.978129080395448e-07, "loss": 0.1067, "step": 51770 }, { "epoch": 1.2073270962665035, "grad_norm": 4.640987873077393, "learning_rate": 5.977351862214761e-07, "loss": 0.1148, "step": 51780 }, { "epoch": 1.2075602576433213, "grad_norm": 3.5557854175567627, "learning_rate": 5.976574644034073e-07, "loss": 0.1082, "step": 51790 }, { "epoch": 1.2077934190201394, "grad_norm": 1.17855966091156, "learning_rate": 5.975797425853385e-07, "loss": 0.1057, "step": 51800 }, { "epoch": 1.2080265803969572, "grad_norm": 1.5138152837753296, "learning_rate": 5.975020207672698e-07, "loss": 0.1108, "step": 51810 }, { "epoch": 1.2082597417737753, "grad_norm": 2.29082989692688, "learning_rate": 5.97424298949201e-07, "loss": 0.0967, "step": 51820 }, { "epoch": 1.208492903150593, "grad_norm": 1.8838335275650024, "learning_rate": 5.973465771311322e-07, "loss": 0.117, "step": 51830 }, { "epoch": 1.208726064527411, "grad_norm": 1.837520956993103, "learning_rate": 5.972688553130635e-07, "loss": 0.1064, "step": 51840 }, { "epoch": 1.208959225904229, "grad_norm": 1.5268663167953491, "learning_rate": 5.971911334949947e-07, "loss": 0.113, "step": 51850 }, { "epoch": 1.2091923872810468, "grad_norm": 1.9795371294021606, "learning_rate": 5.97113411676926e-07, "loss": 0.1087, "step": 51860 }, { "epoch": 1.209425548657865, "grad_norm": 1.1118407249450684, "learning_rate": 5.970356898588571e-07, "loss": 0.1074, "step": 51870 }, { "epoch": 1.2096587100346827, "grad_norm": 1.7295278310775757, "learning_rate": 5.969579680407884e-07, "loss": 0.1011, "step": 51880 }, { "epoch": 
1.2098918714115006, "grad_norm": 2.2239882946014404, "learning_rate": 5.968802462227196e-07, "loss": 0.1035, "step": 51890 }, { "epoch": 1.2101250327883186, "grad_norm": 2.566014289855957, "learning_rate": 5.968025244046508e-07, "loss": 0.1163, "step": 51900 }, { "epoch": 1.2103581941651365, "grad_norm": 2.4708352088928223, "learning_rate": 5.967248025865821e-07, "loss": 0.1167, "step": 51910 }, { "epoch": 1.2105913555419545, "grad_norm": 1.965147852897644, "learning_rate": 5.966470807685133e-07, "loss": 0.11, "step": 51920 }, { "epoch": 1.2108245169187724, "grad_norm": 1.3540350198745728, "learning_rate": 5.965693589504446e-07, "loss": 0.1151, "step": 51930 }, { "epoch": 1.2110576782955904, "grad_norm": 1.2776813507080078, "learning_rate": 5.964916371323759e-07, "loss": 0.1213, "step": 51940 }, { "epoch": 1.2112908396724082, "grad_norm": 1.7969416379928589, "learning_rate": 5.964139153143069e-07, "loss": 0.1005, "step": 51950 }, { "epoch": 1.2115240010492263, "grad_norm": 1.7027713060379028, "learning_rate": 5.963361934962382e-07, "loss": 0.1149, "step": 51960 }, { "epoch": 1.2117571624260441, "grad_norm": 1.177150845527649, "learning_rate": 5.962584716781694e-07, "loss": 0.1035, "step": 51970 }, { "epoch": 1.211990323802862, "grad_norm": 1.317775845527649, "learning_rate": 5.961807498601007e-07, "loss": 0.1098, "step": 51980 }, { "epoch": 1.21222348517968, "grad_norm": 1.2047532796859741, "learning_rate": 5.96103028042032e-07, "loss": 0.0999, "step": 51990 }, { "epoch": 1.2124566465564979, "grad_norm": 1.798043966293335, "learning_rate": 5.960253062239632e-07, "loss": 0.1115, "step": 52000 }, { "epoch": 1.212689807933316, "grad_norm": 1.1778218746185303, "learning_rate": 5.959475844058944e-07, "loss": 0.1185, "step": 52010 }, { "epoch": 1.2129229693101338, "grad_norm": 1.2005271911621094, "learning_rate": 5.958698625878256e-07, "loss": 0.1151, "step": 52020 }, { "epoch": 1.2131561306869516, "grad_norm": 1.0237867832183838, "learning_rate": 5.957921407697568e-07, 
"loss": 0.112, "step": 52030 }, { "epoch": 1.2133892920637697, "grad_norm": 1.0035165548324585, "learning_rate": 5.957144189516881e-07, "loss": 0.1157, "step": 52040 }, { "epoch": 1.2136224534405875, "grad_norm": 2.7004992961883545, "learning_rate": 5.956366971336193e-07, "loss": 0.1109, "step": 52050 }, { "epoch": 1.2138556148174056, "grad_norm": 1.4957633018493652, "learning_rate": 5.955589753155506e-07, "loss": 0.1201, "step": 52060 }, { "epoch": 1.2140887761942234, "grad_norm": 3.4010376930236816, "learning_rate": 5.954812534974817e-07, "loss": 0.1186, "step": 52070 }, { "epoch": 1.2143219375710412, "grad_norm": 2.107649326324463, "learning_rate": 5.95403531679413e-07, "loss": 0.1036, "step": 52080 }, { "epoch": 1.2145550989478593, "grad_norm": 1.639875054359436, "learning_rate": 5.953258098613443e-07, "loss": 0.0962, "step": 52090 }, { "epoch": 1.2147882603246771, "grad_norm": 1.2962307929992676, "learning_rate": 5.952480880432755e-07, "loss": 0.117, "step": 52100 }, { "epoch": 1.2150214217014952, "grad_norm": 2.250830888748169, "learning_rate": 5.951703662252067e-07, "loss": 0.0982, "step": 52110 }, { "epoch": 1.215254583078313, "grad_norm": 2.1540441513061523, "learning_rate": 5.95092644407138e-07, "loss": 0.1041, "step": 52120 }, { "epoch": 1.215487744455131, "grad_norm": 2.9144320487976074, "learning_rate": 5.950149225890691e-07, "loss": 0.1161, "step": 52130 }, { "epoch": 1.215720905831949, "grad_norm": 2.599914789199829, "learning_rate": 5.949372007710004e-07, "loss": 0.1217, "step": 52140 }, { "epoch": 1.215954067208767, "grad_norm": 1.4435486793518066, "learning_rate": 5.948594789529316e-07, "loss": 0.1218, "step": 52150 }, { "epoch": 1.2161872285855848, "grad_norm": 1.8916839361190796, "learning_rate": 5.947817571348629e-07, "loss": 0.1042, "step": 52160 }, { "epoch": 1.2164203899624026, "grad_norm": 1.291285753250122, "learning_rate": 5.947040353167942e-07, "loss": 0.1081, "step": 52170 }, { "epoch": 1.2166535513392207, "grad_norm": 
1.3145314455032349, "learning_rate": 5.946263134987254e-07, "loss": 0.1144, "step": 52180 }, { "epoch": 1.2168867127160385, "grad_norm": 1.7871285676956177, "learning_rate": 5.945485916806565e-07, "loss": 0.1225, "step": 52190 }, { "epoch": 1.2171198740928566, "grad_norm": 2.0464494228363037, "learning_rate": 5.944708698625877e-07, "loss": 0.1123, "step": 52200 }, { "epoch": 1.2173530354696744, "grad_norm": 1.8499600887298584, "learning_rate": 5.94393148044519e-07, "loss": 0.0982, "step": 52210 }, { "epoch": 1.2175861968464923, "grad_norm": 1.7698286771774292, "learning_rate": 5.943154262264503e-07, "loss": 0.1009, "step": 52220 }, { "epoch": 1.2178193582233103, "grad_norm": 1.6468961238861084, "learning_rate": 5.942377044083815e-07, "loss": 0.1118, "step": 52230 }, { "epoch": 1.2180525196001282, "grad_norm": 1.3307255506515503, "learning_rate": 5.941599825903128e-07, "loss": 0.1005, "step": 52240 }, { "epoch": 1.2182856809769462, "grad_norm": 2.237133264541626, "learning_rate": 5.94082260772244e-07, "loss": 0.1099, "step": 52250 }, { "epoch": 1.218518842353764, "grad_norm": 1.671994924545288, "learning_rate": 5.940045389541752e-07, "loss": 0.1075, "step": 52260 }, { "epoch": 1.2187520037305821, "grad_norm": 2.04046893119812, "learning_rate": 5.939268171361064e-07, "loss": 0.1111, "step": 52270 }, { "epoch": 1.2189851651074, "grad_norm": 4.038487434387207, "learning_rate": 5.938490953180376e-07, "loss": 0.1245, "step": 52280 }, { "epoch": 1.2192183264842178, "grad_norm": 1.7590786218643188, "learning_rate": 5.937713734999689e-07, "loss": 0.1075, "step": 52290 }, { "epoch": 1.2194514878610359, "grad_norm": 2.635974884033203, "learning_rate": 5.936936516819001e-07, "loss": 0.1113, "step": 52300 }, { "epoch": 1.2196846492378537, "grad_norm": 1.2765305042266846, "learning_rate": 5.936159298638314e-07, "loss": 0.1062, "step": 52310 }, { "epoch": 1.2199178106146717, "grad_norm": 1.6213078498840332, "learning_rate": 5.935382080457626e-07, "loss": 0.1009, "step": 52320 }, 
{ "epoch": 1.2201509719914896, "grad_norm": 1.4370601177215576, "learning_rate": 5.934604862276938e-07, "loss": 0.1142, "step": 52330 }, { "epoch": 1.2203841333683076, "grad_norm": 2.824409246444702, "learning_rate": 5.933827644096251e-07, "loss": 0.1056, "step": 52340 }, { "epoch": 1.2206172947451255, "grad_norm": 1.6687684059143066, "learning_rate": 5.933050425915562e-07, "loss": 0.1065, "step": 52350 }, { "epoch": 1.2208504561219433, "grad_norm": 1.5511009693145752, "learning_rate": 5.932273207734875e-07, "loss": 0.1188, "step": 52360 }, { "epoch": 1.2210836174987614, "grad_norm": 1.688998818397522, "learning_rate": 5.931495989554188e-07, "loss": 0.1163, "step": 52370 }, { "epoch": 1.2213167788755792, "grad_norm": 1.6367568969726562, "learning_rate": 5.930718771373499e-07, "loss": 0.1247, "step": 52380 }, { "epoch": 1.2215499402523973, "grad_norm": 1.226211428642273, "learning_rate": 5.929941553192812e-07, "loss": 0.1231, "step": 52390 }, { "epoch": 1.221783101629215, "grad_norm": 1.3093087673187256, "learning_rate": 5.929164335012124e-07, "loss": 0.1111, "step": 52400 }, { "epoch": 1.222016263006033, "grad_norm": 1.5150046348571777, "learning_rate": 5.928387116831437e-07, "loss": 0.1173, "step": 52410 }, { "epoch": 1.222249424382851, "grad_norm": 2.198889970779419, "learning_rate": 5.92760989865075e-07, "loss": 0.1056, "step": 52420 }, { "epoch": 1.2224825857596688, "grad_norm": 2.108884572982788, "learning_rate": 5.926832680470061e-07, "loss": 0.1213, "step": 52430 }, { "epoch": 1.222715747136487, "grad_norm": 1.1537383794784546, "learning_rate": 5.926055462289373e-07, "loss": 0.1059, "step": 52440 }, { "epoch": 1.2229489085133047, "grad_norm": 1.0541695356369019, "learning_rate": 5.925278244108685e-07, "loss": 0.111, "step": 52450 }, { "epoch": 1.2231820698901228, "grad_norm": 1.4405397176742554, "learning_rate": 5.924501025927998e-07, "loss": 0.1129, "step": 52460 }, { "epoch": 1.2234152312669406, "grad_norm": 1.9864598512649536, "learning_rate": 
5.923723807747311e-07, "loss": 0.1247, "step": 52470 }, { "epoch": 1.2236483926437585, "grad_norm": 1.4972686767578125, "learning_rate": 5.922946589566623e-07, "loss": 0.1091, "step": 52480 }, { "epoch": 1.2238815540205765, "grad_norm": 1.5533266067504883, "learning_rate": 5.922169371385936e-07, "loss": 0.1077, "step": 52490 }, { "epoch": 1.2241147153973944, "grad_norm": 1.566794753074646, "learning_rate": 5.921392153205247e-07, "loss": 0.1045, "step": 52500 }, { "epoch": 1.2243478767742124, "grad_norm": 1.6545394659042358, "learning_rate": 5.920614935024559e-07, "loss": 0.1131, "step": 52510 }, { "epoch": 1.2245810381510303, "grad_norm": 2.234588384628296, "learning_rate": 5.919837716843872e-07, "loss": 0.1092, "step": 52520 }, { "epoch": 1.2248141995278483, "grad_norm": 3.135100841522217, "learning_rate": 5.919060498663184e-07, "loss": 0.1112, "step": 52530 }, { "epoch": 1.2250473609046661, "grad_norm": 1.6847587823867798, "learning_rate": 5.918283280482497e-07, "loss": 0.1044, "step": 52540 }, { "epoch": 1.225280522281484, "grad_norm": 2.9182188510894775, "learning_rate": 5.91750606230181e-07, "loss": 0.0984, "step": 52550 }, { "epoch": 1.225513683658302, "grad_norm": 3.450510025024414, "learning_rate": 5.916728844121121e-07, "loss": 0.1151, "step": 52560 }, { "epoch": 1.2257468450351199, "grad_norm": 2.415220022201538, "learning_rate": 5.915951625940434e-07, "loss": 0.1034, "step": 52570 }, { "epoch": 1.225980006411938, "grad_norm": 1.5876716375350952, "learning_rate": 5.915174407759746e-07, "loss": 0.1161, "step": 52580 }, { "epoch": 1.2262131677887558, "grad_norm": 1.409439206123352, "learning_rate": 5.914397189579058e-07, "loss": 0.1071, "step": 52590 }, { "epoch": 1.2264463291655736, "grad_norm": 2.519435405731201, "learning_rate": 5.913619971398371e-07, "loss": 0.1089, "step": 52600 }, { "epoch": 1.2266794905423917, "grad_norm": 2.2902212142944336, "learning_rate": 5.912842753217683e-07, "loss": 0.1191, "step": 52610 }, { "epoch": 1.2269126519192095, 
"grad_norm": 1.7752935886383057, "learning_rate": 5.912065535036995e-07, "loss": 0.1039, "step": 52620 }, { "epoch": 1.2271458132960276, "grad_norm": 1.5787625312805176, "learning_rate": 5.911288316856307e-07, "loss": 0.107, "step": 52630 }, { "epoch": 1.2273789746728454, "grad_norm": 1.1918821334838867, "learning_rate": 5.91051109867562e-07, "loss": 0.1085, "step": 52640 }, { "epoch": 1.2276121360496635, "grad_norm": 2.162797212600708, "learning_rate": 5.909733880494933e-07, "loss": 0.1183, "step": 52650 }, { "epoch": 1.2278452974264813, "grad_norm": 2.075009822845459, "learning_rate": 5.908956662314245e-07, "loss": 0.1137, "step": 52660 }, { "epoch": 1.2280784588032994, "grad_norm": 1.6899367570877075, "learning_rate": 5.908179444133557e-07, "loss": 0.1123, "step": 52670 }, { "epoch": 1.2283116201801172, "grad_norm": 1.3967899084091187, "learning_rate": 5.907402225952868e-07, "loss": 0.1069, "step": 52680 }, { "epoch": 1.228544781556935, "grad_norm": 3.1342036724090576, "learning_rate": 5.906625007772181e-07, "loss": 0.1211, "step": 52690 }, { "epoch": 1.228777942933753, "grad_norm": 1.6470961570739746, "learning_rate": 5.905847789591494e-07, "loss": 0.1103, "step": 52700 }, { "epoch": 1.229011104310571, "grad_norm": 3.1796951293945312, "learning_rate": 5.905070571410806e-07, "loss": 0.1076, "step": 52710 }, { "epoch": 1.229244265687389, "grad_norm": 1.1869709491729736, "learning_rate": 5.904293353230119e-07, "loss": 0.1023, "step": 52720 }, { "epoch": 1.2294774270642068, "grad_norm": 1.4914250373840332, "learning_rate": 5.903516135049431e-07, "loss": 0.1063, "step": 52730 }, { "epoch": 1.2297105884410247, "grad_norm": 2.399733066558838, "learning_rate": 5.902738916868744e-07, "loss": 0.1071, "step": 52740 }, { "epoch": 1.2299437498178427, "grad_norm": 1.2338368892669678, "learning_rate": 5.901961698688055e-07, "loss": 0.1127, "step": 52750 }, { "epoch": 1.2301769111946605, "grad_norm": 2.605588912963867, "learning_rate": 5.901184480507367e-07, "loss": 0.1089, 
"step": 52760 }, { "epoch": 1.2304100725714786, "grad_norm": 1.7438852787017822, "learning_rate": 5.90040726232668e-07, "loss": 0.1041, "step": 52770 }, { "epoch": 1.2306432339482964, "grad_norm": 2.416097640991211, "learning_rate": 5.899630044145992e-07, "loss": 0.117, "step": 52780 }, { "epoch": 1.2308763953251143, "grad_norm": 1.1495648622512817, "learning_rate": 5.898852825965305e-07, "loss": 0.1036, "step": 52790 }, { "epoch": 1.2311095567019323, "grad_norm": 1.5753873586654663, "learning_rate": 5.898075607784618e-07, "loss": 0.0913, "step": 52800 }, { "epoch": 1.2313427180787502, "grad_norm": 2.3348445892333984, "learning_rate": 5.897298389603929e-07, "loss": 0.1215, "step": 52810 }, { "epoch": 1.2315758794555682, "grad_norm": 1.3235784769058228, "learning_rate": 5.896521171423242e-07, "loss": 0.1167, "step": 52820 }, { "epoch": 1.231809040832386, "grad_norm": 1.5163782835006714, "learning_rate": 5.895743953242553e-07, "loss": 0.1086, "step": 52830 }, { "epoch": 1.2320422022092041, "grad_norm": 1.65091073513031, "learning_rate": 5.894966735061866e-07, "loss": 0.0944, "step": 52840 }, { "epoch": 1.232275363586022, "grad_norm": 3.23051381111145, "learning_rate": 5.894189516881179e-07, "loss": 0.1085, "step": 52850 }, { "epoch": 1.23250852496284, "grad_norm": 2.6219003200531006, "learning_rate": 5.893412298700491e-07, "loss": 0.1058, "step": 52860 }, { "epoch": 1.2327416863396579, "grad_norm": 1.748266339302063, "learning_rate": 5.892635080519803e-07, "loss": 0.1223, "step": 52870 }, { "epoch": 1.2329748477164757, "grad_norm": 1.3849639892578125, "learning_rate": 5.891857862339115e-07, "loss": 0.1035, "step": 52880 }, { "epoch": 1.2332080090932938, "grad_norm": 1.645351767539978, "learning_rate": 5.891080644158428e-07, "loss": 0.1078, "step": 52890 }, { "epoch": 1.2334411704701116, "grad_norm": 2.5184998512268066, "learning_rate": 5.890303425977741e-07, "loss": 0.1153, "step": 52900 }, { "epoch": 1.2336743318469297, "grad_norm": 2.555931329727173, 
"learning_rate": 5.889526207797052e-07, "loss": 0.1123, "step": 52910 }, { "epoch": 1.2339074932237475, "grad_norm": 1.4293925762176514, "learning_rate": 5.888748989616365e-07, "loss": 0.1046, "step": 52920 }, { "epoch": 1.2341406546005653, "grad_norm": 1.4063785076141357, "learning_rate": 5.887971771435676e-07, "loss": 0.1065, "step": 52930 }, { "epoch": 1.2343738159773834, "grad_norm": 1.7101207971572876, "learning_rate": 5.887194553254989e-07, "loss": 0.1112, "step": 52940 }, { "epoch": 1.2346069773542012, "grad_norm": 1.2774245738983154, "learning_rate": 5.886417335074302e-07, "loss": 0.1186, "step": 52950 }, { "epoch": 1.2348401387310193, "grad_norm": 1.7783528566360474, "learning_rate": 5.885640116893614e-07, "loss": 0.1064, "step": 52960 }, { "epoch": 1.2350733001078371, "grad_norm": 2.1701648235321045, "learning_rate": 5.884862898712927e-07, "loss": 0.1085, "step": 52970 }, { "epoch": 1.235306461484655, "grad_norm": 3.725310802459717, "learning_rate": 5.884163402350307e-07, "loss": 0.1195, "step": 52980 }, { "epoch": 1.235539622861473, "grad_norm": 2.2067530155181885, "learning_rate": 5.88338618416962e-07, "loss": 0.1042, "step": 52990 }, { "epoch": 1.2357727842382908, "grad_norm": 3.853386640548706, "learning_rate": 5.882608965988932e-07, "loss": 0.1181, "step": 53000 }, { "epoch": 1.236005945615109, "grad_norm": 1.5814266204833984, "learning_rate": 5.881831747808245e-07, "loss": 0.1187, "step": 53010 }, { "epoch": 1.2362391069919267, "grad_norm": 2.0799620151519775, "learning_rate": 5.881054529627558e-07, "loss": 0.1158, "step": 53020 }, { "epoch": 1.2364722683687448, "grad_norm": 1.6054127216339111, "learning_rate": 5.880277311446869e-07, "loss": 0.1203, "step": 53030 }, { "epoch": 1.2367054297455626, "grad_norm": 1.2753901481628418, "learning_rate": 5.879500093266181e-07, "loss": 0.1078, "step": 53040 }, { "epoch": 1.2369385911223807, "grad_norm": 1.7592068910598755, "learning_rate": 5.878722875085493e-07, "loss": 0.1088, "step": 53050 }, { "epoch": 
1.2371717524991985, "grad_norm": 2.394059419631958, "learning_rate": 5.877945656904806e-07, "loss": 0.1107, "step": 53060 }, { "epoch": 1.2374049138760164, "grad_norm": 1.4605231285095215, "learning_rate": 5.877168438724119e-07, "loss": 0.1207, "step": 53070 }, { "epoch": 1.2376380752528344, "grad_norm": 1.2545802593231201, "learning_rate": 5.876391220543431e-07, "loss": 0.121, "step": 53080 }, { "epoch": 1.2378712366296523, "grad_norm": 2.269103527069092, "learning_rate": 5.875614002362743e-07, "loss": 0.1226, "step": 53090 }, { "epoch": 1.2381043980064703, "grad_norm": 1.3621093034744263, "learning_rate": 5.874836784182055e-07, "loss": 0.1098, "step": 53100 }, { "epoch": 1.2383375593832882, "grad_norm": 1.7366594076156616, "learning_rate": 5.874059566001368e-07, "loss": 0.1091, "step": 53110 }, { "epoch": 1.238570720760106, "grad_norm": 3.2177412509918213, "learning_rate": 5.87328234782068e-07, "loss": 0.1204, "step": 53120 }, { "epoch": 1.238803882136924, "grad_norm": 2.0816776752471924, "learning_rate": 5.872505129639992e-07, "loss": 0.1128, "step": 53130 }, { "epoch": 1.2390370435137419, "grad_norm": 2.2264013290405273, "learning_rate": 5.871727911459305e-07, "loss": 0.1183, "step": 53140 }, { "epoch": 1.23927020489056, "grad_norm": 2.2989416122436523, "learning_rate": 5.870950693278616e-07, "loss": 0.1101, "step": 53150 }, { "epoch": 1.2395033662673778, "grad_norm": 1.654383659362793, "learning_rate": 5.870173475097929e-07, "loss": 0.1121, "step": 53160 }, { "epoch": 1.2397365276441956, "grad_norm": 1.339009404182434, "learning_rate": 5.869396256917242e-07, "loss": 0.1172, "step": 53170 }, { "epoch": 1.2399696890210137, "grad_norm": 3.8933393955230713, "learning_rate": 5.868619038736554e-07, "loss": 0.1151, "step": 53180 }, { "epoch": 1.2402028503978315, "grad_norm": 1.5002321004867554, "learning_rate": 5.867841820555867e-07, "loss": 0.11, "step": 53190 }, { "epoch": 1.2404360117746496, "grad_norm": 2.597775936126709, "learning_rate": 5.867064602375178e-07, 
"loss": 0.1125, "step": 53200 }, { "epoch": 1.2406691731514674, "grad_norm": 1.1978951692581177, "learning_rate": 5.86628738419449e-07, "loss": 0.1128, "step": 53210 }, { "epoch": 1.2409023345282855, "grad_norm": 2.2997539043426514, "learning_rate": 5.865510166013803e-07, "loss": 0.0985, "step": 53220 }, { "epoch": 1.2411354959051033, "grad_norm": 1.3897401094436646, "learning_rate": 5.864732947833115e-07, "loss": 0.1036, "step": 53230 }, { "epoch": 1.2413686572819214, "grad_norm": 1.2752366065979004, "learning_rate": 5.863955729652428e-07, "loss": 0.1074, "step": 53240 }, { "epoch": 1.2416018186587392, "grad_norm": 1.2115927934646606, "learning_rate": 5.86317851147174e-07, "loss": 0.1059, "step": 53250 }, { "epoch": 1.241834980035557, "grad_norm": 1.2282675504684448, "learning_rate": 5.862401293291053e-07, "loss": 0.1099, "step": 53260 }, { "epoch": 1.242068141412375, "grad_norm": 3.72350811958313, "learning_rate": 5.861624075110365e-07, "loss": 0.1037, "step": 53270 }, { "epoch": 1.242301302789193, "grad_norm": 1.719397783279419, "learning_rate": 5.860846856929676e-07, "loss": 0.1079, "step": 53280 }, { "epoch": 1.242534464166011, "grad_norm": 1.4813672304153442, "learning_rate": 5.860069638748989e-07, "loss": 0.1128, "step": 53290 }, { "epoch": 1.2427676255428288, "grad_norm": 2.3215694427490234, "learning_rate": 5.859292420568301e-07, "loss": 0.1066, "step": 53300 }, { "epoch": 1.2430007869196467, "grad_norm": 1.7164512872695923, "learning_rate": 5.858515202387614e-07, "loss": 0.0967, "step": 53310 }, { "epoch": 1.2432339482964647, "grad_norm": 1.2757728099822998, "learning_rate": 5.857737984206927e-07, "loss": 0.1082, "step": 53320 }, { "epoch": 1.2434671096732826, "grad_norm": 2.0916194915771484, "learning_rate": 5.856960766026238e-07, "loss": 0.1144, "step": 53330 }, { "epoch": 1.2437002710501006, "grad_norm": 1.508705735206604, "learning_rate": 5.856183547845551e-07, "loss": 0.1048, "step": 53340 }, { "epoch": 1.2439334324269185, "grad_norm": 
1.5387953519821167, "learning_rate": 5.855406329664864e-07, "loss": 0.1215, "step": 53350 }, { "epoch": 1.2441665938037363, "grad_norm": 1.889032006263733, "learning_rate": 5.854629111484175e-07, "loss": 0.1165, "step": 53360 }, { "epoch": 1.2443997551805543, "grad_norm": 1.122999906539917, "learning_rate": 5.853851893303488e-07, "loss": 0.1094, "step": 53370 }, { "epoch": 1.2446329165573722, "grad_norm": 1.9323824644088745, "learning_rate": 5.8530746751228e-07, "loss": 0.1106, "step": 53380 }, { "epoch": 1.2448660779341902, "grad_norm": 2.34393310546875, "learning_rate": 5.852297456942112e-07, "loss": 0.1088, "step": 53390 }, { "epoch": 1.245099239311008, "grad_norm": 3.032163619995117, "learning_rate": 5.851520238761425e-07, "loss": 0.1127, "step": 53400 }, { "epoch": 1.2453324006878261, "grad_norm": 1.572630524635315, "learning_rate": 5.850743020580737e-07, "loss": 0.1071, "step": 53410 }, { "epoch": 1.245565562064644, "grad_norm": 2.5826804637908936, "learning_rate": 5.84996580240005e-07, "loss": 0.12, "step": 53420 }, { "epoch": 1.245798723441462, "grad_norm": 1.305845856666565, "learning_rate": 5.849188584219362e-07, "loss": 0.1124, "step": 53430 }, { "epoch": 1.2460318848182799, "grad_norm": 2.0145182609558105, "learning_rate": 5.848411366038674e-07, "loss": 0.1033, "step": 53440 }, { "epoch": 1.2462650461950977, "grad_norm": 1.2273813486099243, "learning_rate": 5.847634147857986e-07, "loss": 0.109, "step": 53450 }, { "epoch": 1.2464982075719158, "grad_norm": 1.2839226722717285, "learning_rate": 5.846856929677298e-07, "loss": 0.1018, "step": 53460 }, { "epoch": 1.2467313689487336, "grad_norm": 2.0261833667755127, "learning_rate": 5.846079711496611e-07, "loss": 0.1032, "step": 53470 }, { "epoch": 1.2469645303255517, "grad_norm": 2.091925859451294, "learning_rate": 5.845302493315923e-07, "loss": 0.1055, "step": 53480 }, { "epoch": 1.2471976917023695, "grad_norm": 1.447800874710083, "learning_rate": 5.844525275135236e-07, "loss": 0.1034, "step": 53490 }, { 
"epoch": 1.2474308530791873, "grad_norm": 1.8655128479003906, "learning_rate": 5.843748056954549e-07, "loss": 0.1317, "step": 53500 }, { "epoch": 1.2476640144560054, "grad_norm": 1.3110907077789307, "learning_rate": 5.842970838773861e-07, "loss": 0.1192, "step": 53510 }, { "epoch": 1.2478971758328232, "grad_norm": 2.3674399852752686, "learning_rate": 5.842193620593172e-07, "loss": 0.1138, "step": 53520 }, { "epoch": 1.2481303372096413, "grad_norm": 1.3996385335922241, "learning_rate": 5.841416402412484e-07, "loss": 0.1033, "step": 53530 }, { "epoch": 1.2483634985864591, "grad_norm": 1.4414951801300049, "learning_rate": 5.840639184231797e-07, "loss": 0.1058, "step": 53540 }, { "epoch": 1.2485966599632772, "grad_norm": 1.5532459020614624, "learning_rate": 5.83986196605111e-07, "loss": 0.1071, "step": 53550 }, { "epoch": 1.248829821340095, "grad_norm": 3.090188980102539, "learning_rate": 5.839084747870422e-07, "loss": 0.1121, "step": 53560 }, { "epoch": 1.2490629827169129, "grad_norm": 1.8599116802215576, "learning_rate": 5.838307529689735e-07, "loss": 0.1015, "step": 53570 }, { "epoch": 1.249296144093731, "grad_norm": 1.3666324615478516, "learning_rate": 5.837530311509046e-07, "loss": 0.1122, "step": 53580 }, { "epoch": 1.2495293054705487, "grad_norm": 3.4455370903015137, "learning_rate": 5.836753093328359e-07, "loss": 0.1048, "step": 53590 }, { "epoch": 1.2497624668473668, "grad_norm": 1.9629079103469849, "learning_rate": 5.835975875147671e-07, "loss": 0.1084, "step": 53600 }, { "epoch": 1.2499956282241846, "grad_norm": 1.8019413948059082, "learning_rate": 5.835198656966983e-07, "loss": 0.1139, "step": 53610 }, { "epoch": 1.2502287896010027, "grad_norm": 1.7009104490280151, "learning_rate": 5.834421438786296e-07, "loss": 0.1037, "step": 53620 }, { "epoch": 1.2504619509778205, "grad_norm": 2.18097186088562, "learning_rate": 5.833644220605608e-07, "loss": 0.1174, "step": 53630 }, { "epoch": 1.2506951123546384, "grad_norm": 1.1554208993911743, "learning_rate": 
5.83286700242492e-07, "loss": 0.1145, "step": 53640 }, { "epoch": 1.2509282737314564, "grad_norm": 1.431710124015808, "learning_rate": 5.832089784244233e-07, "loss": 0.1053, "step": 53650 }, { "epoch": 1.2511614351082743, "grad_norm": 1.9198594093322754, "learning_rate": 5.831312566063545e-07, "loss": 0.1109, "step": 53660 }, { "epoch": 1.2513945964850923, "grad_norm": 1.314685583114624, "learning_rate": 5.830535347882858e-07, "loss": 0.1192, "step": 53670 }, { "epoch": 1.2516277578619102, "grad_norm": 1.3060288429260254, "learning_rate": 5.829758129702169e-07, "loss": 0.1103, "step": 53680 }, { "epoch": 1.251860919238728, "grad_norm": 1.250335454940796, "learning_rate": 5.828980911521482e-07, "loss": 0.115, "step": 53690 }, { "epoch": 1.252094080615546, "grad_norm": 1.4283115863800049, "learning_rate": 5.828203693340794e-07, "loss": 0.1123, "step": 53700 }, { "epoch": 1.252327241992364, "grad_norm": 1.6249053478240967, "learning_rate": 5.827426475160106e-07, "loss": 0.1043, "step": 53710 }, { "epoch": 1.252560403369182, "grad_norm": 1.965699553489685, "learning_rate": 5.826649256979419e-07, "loss": 0.1214, "step": 53720 }, { "epoch": 1.2527935647459998, "grad_norm": 2.1024367809295654, "learning_rate": 5.825872038798732e-07, "loss": 0.1096, "step": 53730 }, { "epoch": 1.2530267261228176, "grad_norm": 2.1871323585510254, "learning_rate": 5.825094820618044e-07, "loss": 0.102, "step": 53740 }, { "epoch": 1.2532598874996357, "grad_norm": 2.4706127643585205, "learning_rate": 5.824317602437357e-07, "loss": 0.1087, "step": 53750 }, { "epoch": 1.2534930488764537, "grad_norm": 1.699703574180603, "learning_rate": 5.823540384256667e-07, "loss": 0.1128, "step": 53760 }, { "epoch": 1.2537262102532716, "grad_norm": 1.4271541833877563, "learning_rate": 5.82276316607598e-07, "loss": 0.1172, "step": 53770 }, { "epoch": 1.2539593716300894, "grad_norm": 1.2389616966247559, "learning_rate": 5.821985947895293e-07, "loss": 0.1111, "step": 53780 }, { "epoch": 1.2541925330069075, 
"grad_norm": 2.210116386413574, "learning_rate": 5.821208729714605e-07, "loss": 0.1108, "step": 53790 }, { "epoch": 1.2544256943837253, "grad_norm": 1.6953895092010498, "learning_rate": 5.820431511533918e-07, "loss": 0.1128, "step": 53800 }, { "epoch": 1.2546588557605434, "grad_norm": 1.6891236305236816, "learning_rate": 5.81965429335323e-07, "loss": 0.1104, "step": 53810 }, { "epoch": 1.2548920171373612, "grad_norm": 2.242926836013794, "learning_rate": 5.818877075172542e-07, "loss": 0.1188, "step": 53820 }, { "epoch": 1.255125178514179, "grad_norm": 1.0570604801177979, "learning_rate": 5.818099856991855e-07, "loss": 0.1236, "step": 53830 }, { "epoch": 1.255358339890997, "grad_norm": 1.7940517663955688, "learning_rate": 5.817322638811167e-07, "loss": 0.1145, "step": 53840 }, { "epoch": 1.255591501267815, "grad_norm": 2.1924808025360107, "learning_rate": 5.816545420630479e-07, "loss": 0.1173, "step": 53850 }, { "epoch": 1.255824662644633, "grad_norm": 1.366170048713684, "learning_rate": 5.815768202449791e-07, "loss": 0.1043, "step": 53860 }, { "epoch": 1.2560578240214508, "grad_norm": 2.6007578372955322, "learning_rate": 5.814990984269104e-07, "loss": 0.1115, "step": 53870 }, { "epoch": 1.2562909853982687, "grad_norm": 2.280296802520752, "learning_rate": 5.814213766088416e-07, "loss": 0.1207, "step": 53880 }, { "epoch": 1.2565241467750867, "grad_norm": 1.5028760433197021, "learning_rate": 5.813436547907728e-07, "loss": 0.1116, "step": 53890 }, { "epoch": 1.2567573081519046, "grad_norm": 1.2291126251220703, "learning_rate": 5.812659329727041e-07, "loss": 0.1121, "step": 53900 }, { "epoch": 1.2569904695287226, "grad_norm": 3.2189064025878906, "learning_rate": 5.811882111546353e-07, "loss": 0.1108, "step": 53910 }, { "epoch": 1.2572236309055405, "grad_norm": 2.18379282951355, "learning_rate": 5.811104893365666e-07, "loss": 0.1147, "step": 53920 }, { "epoch": 1.2574567922823583, "grad_norm": 1.653643250465393, "learning_rate": 5.810327675184978e-07, "loss": 0.1114, 
"step": 53930 }, { "epoch": 1.2576899536591764, "grad_norm": 1.525604248046875, "learning_rate": 5.80955045700429e-07, "loss": 0.1191, "step": 53940 }, { "epoch": 1.2579231150359944, "grad_norm": 1.517322063446045, "learning_rate": 5.808773238823602e-07, "loss": 0.1128, "step": 53950 }, { "epoch": 1.2581562764128122, "grad_norm": 2.59907603263855, "learning_rate": 5.807996020642914e-07, "loss": 0.108, "step": 53960 }, { "epoch": 1.25838943778963, "grad_norm": 2.669786214828491, "learning_rate": 5.807218802462227e-07, "loss": 0.1201, "step": 53970 }, { "epoch": 1.2586225991664481, "grad_norm": 2.3909409046173096, "learning_rate": 5.80644158428154e-07, "loss": 0.1162, "step": 53980 }, { "epoch": 1.258855760543266, "grad_norm": 1.2967873811721802, "learning_rate": 5.805664366100852e-07, "loss": 0.1095, "step": 53990 }, { "epoch": 1.259088921920084, "grad_norm": 1.5936940908432007, "learning_rate": 5.804887147920165e-07, "loss": 0.1097, "step": 54000 }, { "epoch": 1.2593220832969019, "grad_norm": 1.8075143098831177, "learning_rate": 5.804109929739475e-07, "loss": 0.1089, "step": 54010 }, { "epoch": 1.2595552446737197, "grad_norm": 1.1757806539535522, "learning_rate": 5.803332711558788e-07, "loss": 0.1127, "step": 54020 }, { "epoch": 1.2597884060505378, "grad_norm": 1.3212789297103882, "learning_rate": 5.802555493378101e-07, "loss": 0.1167, "step": 54030 }, { "epoch": 1.2600215674273556, "grad_norm": 1.552621841430664, "learning_rate": 5.801778275197413e-07, "loss": 0.1115, "step": 54040 }, { "epoch": 1.2602547288041737, "grad_norm": 1.3329888582229614, "learning_rate": 5.801001057016726e-07, "loss": 0.1148, "step": 54050 }, { "epoch": 1.2604878901809915, "grad_norm": 1.2075459957122803, "learning_rate": 5.800223838836038e-07, "loss": 0.1048, "step": 54060 }, { "epoch": 1.2607210515578093, "grad_norm": 2.737973213195801, "learning_rate": 5.79944662065535e-07, "loss": 0.1145, "step": 54070 }, { "epoch": 1.2609542129346274, "grad_norm": 1.4160054922103882, 
"learning_rate": 5.798669402474663e-07, "loss": 0.1115, "step": 54080 }, { "epoch": 1.2611873743114452, "grad_norm": 2.968543767929077, "learning_rate": 5.797892184293974e-07, "loss": 0.1102, "step": 54090 }, { "epoch": 1.2614205356882633, "grad_norm": 2.1514697074890137, "learning_rate": 5.797114966113287e-07, "loss": 0.1059, "step": 54100 }, { "epoch": 1.2616536970650811, "grad_norm": 1.121646523475647, "learning_rate": 5.7963377479326e-07, "loss": 0.111, "step": 54110 }, { "epoch": 1.261886858441899, "grad_norm": 2.1393015384674072, "learning_rate": 5.795560529751912e-07, "loss": 0.1164, "step": 54120 }, { "epoch": 1.262120019818717, "grad_norm": 2.347252607345581, "learning_rate": 5.794783311571224e-07, "loss": 0.1176, "step": 54130 }, { "epoch": 1.262353181195535, "grad_norm": 1.3539434671401978, "learning_rate": 5.794006093390536e-07, "loss": 0.1158, "step": 54140 }, { "epoch": 1.262586342572353, "grad_norm": 1.3505260944366455, "learning_rate": 5.793228875209849e-07, "loss": 0.1129, "step": 54150 }, { "epoch": 1.2628195039491708, "grad_norm": 1.1533857583999634, "learning_rate": 5.792451657029162e-07, "loss": 0.1047, "step": 54160 }, { "epoch": 1.2630526653259888, "grad_norm": 2.943136692047119, "learning_rate": 5.791674438848473e-07, "loss": 0.1125, "step": 54170 }, { "epoch": 1.2632858267028066, "grad_norm": 1.2685717344284058, "learning_rate": 5.790897220667786e-07, "loss": 0.1081, "step": 54180 }, { "epoch": 1.2635189880796247, "grad_norm": 2.226466417312622, "learning_rate": 5.790120002487097e-07, "loss": 0.1109, "step": 54190 }, { "epoch": 1.2637521494564425, "grad_norm": 1.1658834218978882, "learning_rate": 5.78934278430641e-07, "loss": 0.1016, "step": 54200 }, { "epoch": 1.2639853108332604, "grad_norm": 2.1371688842773438, "learning_rate": 5.788565566125723e-07, "loss": 0.1182, "step": 54210 }, { "epoch": 1.2642184722100784, "grad_norm": 1.1587880849838257, "learning_rate": 5.787788347945035e-07, "loss": 0.1148, "step": 54220 }, { "epoch": 
1.2644516335868963, "grad_norm": 1.2607433795928955, "learning_rate": 5.787011129764348e-07, "loss": 0.1016, "step": 54230 }, { "epoch": 1.2646847949637143, "grad_norm": 3.208730936050415, "learning_rate": 5.78623391158366e-07, "loss": 0.1028, "step": 54240 }, { "epoch": 1.2649179563405322, "grad_norm": 1.4916163682937622, "learning_rate": 5.785456693402971e-07, "loss": 0.1114, "step": 54250 }, { "epoch": 1.26515111771735, "grad_norm": 2.1433217525482178, "learning_rate": 5.784679475222284e-07, "loss": 0.1166, "step": 54260 }, { "epoch": 1.265384279094168, "grad_norm": 1.4257786273956299, "learning_rate": 5.783902257041596e-07, "loss": 0.1109, "step": 54270 }, { "epoch": 1.265617440470986, "grad_norm": 1.567623257637024, "learning_rate": 5.783125038860909e-07, "loss": 0.1131, "step": 54280 }, { "epoch": 1.265850601847804, "grad_norm": 3.877162456512451, "learning_rate": 5.782347820680221e-07, "loss": 0.1163, "step": 54290 }, { "epoch": 1.2660837632246218, "grad_norm": 1.2597267627716064, "learning_rate": 5.781570602499534e-07, "loss": 0.1144, "step": 54300 }, { "epoch": 1.2663169246014399, "grad_norm": 1.3115400075912476, "learning_rate": 5.780793384318847e-07, "loss": 0.1144, "step": 54310 }, { "epoch": 1.2665500859782577, "grad_norm": 2.7821123600006104, "learning_rate": 5.780016166138158e-07, "loss": 0.1133, "step": 54320 }, { "epoch": 1.2667832473550757, "grad_norm": 1.5473250150680542, "learning_rate": 5.77923894795747e-07, "loss": 0.0996, "step": 54330 }, { "epoch": 1.2670164087318936, "grad_norm": 1.5333364009857178, "learning_rate": 5.778461729776782e-07, "loss": 0.1166, "step": 54340 }, { "epoch": 1.2672495701087114, "grad_norm": 3.063180446624756, "learning_rate": 5.777684511596095e-07, "loss": 0.1079, "step": 54350 }, { "epoch": 1.2674827314855295, "grad_norm": 1.507815957069397, "learning_rate": 5.776907293415408e-07, "loss": 0.1119, "step": 54360 }, { "epoch": 1.2677158928623473, "grad_norm": 2.4243054389953613, "learning_rate": 5.77613007523472e-07, 
"loss": 0.1212, "step": 54370 }, { "epoch": 1.2679490542391654, "grad_norm": 2.364705801010132, "learning_rate": 5.775352857054032e-07, "loss": 0.1124, "step": 54380 }, { "epoch": 1.2681822156159832, "grad_norm": 1.9646838903427124, "learning_rate": 5.774575638873344e-07, "loss": 0.1195, "step": 54390 }, { "epoch": 1.268415376992801, "grad_norm": 2.8662590980529785, "learning_rate": 5.773798420692657e-07, "loss": 0.1069, "step": 54400 }, { "epoch": 1.268648538369619, "grad_norm": 1.8035238981246948, "learning_rate": 5.773021202511969e-07, "loss": 0.1148, "step": 54410 }, { "epoch": 1.268881699746437, "grad_norm": 3.1251425743103027, "learning_rate": 5.772243984331281e-07, "loss": 0.1069, "step": 54420 }, { "epoch": 1.269114861123255, "grad_norm": 2.0581750869750977, "learning_rate": 5.771466766150594e-07, "loss": 0.1081, "step": 54430 }, { "epoch": 1.2693480225000728, "grad_norm": 2.8952834606170654, "learning_rate": 5.770689547969905e-07, "loss": 0.1118, "step": 54440 }, { "epoch": 1.2695811838768907, "grad_norm": 1.321754813194275, "learning_rate": 5.769912329789218e-07, "loss": 0.1235, "step": 54450 }, { "epoch": 1.2698143452537087, "grad_norm": 1.7300236225128174, "learning_rate": 5.769135111608531e-07, "loss": 0.1157, "step": 54460 }, { "epoch": 1.2700475066305268, "grad_norm": 1.7737301588058472, "learning_rate": 5.768357893427843e-07, "loss": 0.1109, "step": 54470 }, { "epoch": 1.2702806680073446, "grad_norm": 1.7340002059936523, "learning_rate": 5.767580675247156e-07, "loss": 0.1104, "step": 54480 }, { "epoch": 1.2705138293841625, "grad_norm": 1.2032707929611206, "learning_rate": 5.766803457066467e-07, "loss": 0.1148, "step": 54490 }, { "epoch": 1.2707469907609805, "grad_norm": 2.74558162689209, "learning_rate": 5.766026238885779e-07, "loss": 0.1109, "step": 54500 }, { "epoch": 1.2709801521377984, "grad_norm": 1.6553970575332642, "learning_rate": 5.765249020705092e-07, "loss": 0.0982, "step": 54510 }, { "epoch": 1.2712133135146164, "grad_norm": 
1.8852758407592773, "learning_rate": 5.764471802524404e-07, "loss": 0.1182, "step": 54520 }, { "epoch": 1.2714464748914343, "grad_norm": 2.4518635272979736, "learning_rate": 5.763694584343717e-07, "loss": 0.1233, "step": 54530 }, { "epoch": 1.271679636268252, "grad_norm": 1.2229578495025635, "learning_rate": 5.76291736616303e-07, "loss": 0.122, "step": 54540 }, { "epoch": 1.2719127976450701, "grad_norm": 2.031376838684082, "learning_rate": 5.762140147982342e-07, "loss": 0.1176, "step": 54550 }, { "epoch": 1.272145959021888, "grad_norm": 1.3739204406738281, "learning_rate": 5.761362929801654e-07, "loss": 0.1148, "step": 54560 }, { "epoch": 1.272379120398706, "grad_norm": 1.3963686227798462, "learning_rate": 5.760585711620965e-07, "loss": 0.1041, "step": 54570 }, { "epoch": 1.2726122817755239, "grad_norm": 3.688987970352173, "learning_rate": 5.759808493440278e-07, "loss": 0.1249, "step": 54580 }, { "epoch": 1.2728454431523417, "grad_norm": 3.1511545181274414, "learning_rate": 5.75903127525959e-07, "loss": 0.1176, "step": 54590 }, { "epoch": 1.2730786045291598, "grad_norm": 1.241883397102356, "learning_rate": 5.758254057078903e-07, "loss": 0.112, "step": 54600 }, { "epoch": 1.2733117659059776, "grad_norm": 1.863034963607788, "learning_rate": 5.757476838898216e-07, "loss": 0.1077, "step": 54610 }, { "epoch": 1.2735449272827957, "grad_norm": 1.2283319234848022, "learning_rate": 5.756699620717527e-07, "loss": 0.1075, "step": 54620 }, { "epoch": 1.2737780886596135, "grad_norm": 0.972208559513092, "learning_rate": 5.75592240253684e-07, "loss": 0.1023, "step": 54630 }, { "epoch": 1.2740112500364313, "grad_norm": 3.4984872341156006, "learning_rate": 5.755145184356153e-07, "loss": 0.117, "step": 54640 }, { "epoch": 1.2742444114132494, "grad_norm": 1.9127811193466187, "learning_rate": 5.754367966175464e-07, "loss": 0.1051, "step": 54650 }, { "epoch": 1.2744775727900675, "grad_norm": 1.732624888420105, "learning_rate": 5.753590747994777e-07, "loss": 0.1136, "step": 54660 }, { 
"epoch": 1.2747107341668853, "grad_norm": 1.7012007236480713, "learning_rate": 5.752813529814089e-07, "loss": 0.1068, "step": 54670 }, { "epoch": 1.2749438955437031, "grad_norm": 3.2096073627471924, "learning_rate": 5.752036311633401e-07, "loss": 0.1114, "step": 54680 }, { "epoch": 1.2751770569205212, "grad_norm": 1.6285994052886963, "learning_rate": 5.751259093452714e-07, "loss": 0.1319, "step": 54690 }, { "epoch": 1.275410218297339, "grad_norm": 1.395232081413269, "learning_rate": 5.750481875272026e-07, "loss": 0.102, "step": 54700 }, { "epoch": 1.275643379674157, "grad_norm": 2.4777212142944336, "learning_rate": 5.749704657091339e-07, "loss": 0.1148, "step": 54710 }, { "epoch": 1.275876541050975, "grad_norm": 3.310875177383423, "learning_rate": 5.748927438910651e-07, "loss": 0.1077, "step": 54720 }, { "epoch": 1.2761097024277928, "grad_norm": 1.6385371685028076, "learning_rate": 5.748150220729963e-07, "loss": 0.113, "step": 54730 }, { "epoch": 1.2763428638046108, "grad_norm": 1.151158094406128, "learning_rate": 5.747373002549275e-07, "loss": 0.1126, "step": 54740 }, { "epoch": 1.2765760251814287, "grad_norm": 2.2854225635528564, "learning_rate": 5.746595784368587e-07, "loss": 0.1106, "step": 54750 }, { "epoch": 1.2768091865582467, "grad_norm": 2.5952627658843994, "learning_rate": 5.7458185661879e-07, "loss": 0.1124, "step": 54760 }, { "epoch": 1.2770423479350645, "grad_norm": 1.5391079187393188, "learning_rate": 5.745041348007212e-07, "loss": 0.1126, "step": 54770 }, { "epoch": 1.2772755093118824, "grad_norm": 1.6719913482666016, "learning_rate": 5.744264129826525e-07, "loss": 0.1162, "step": 54780 }, { "epoch": 1.2775086706887004, "grad_norm": 1.7580560445785522, "learning_rate": 5.743486911645838e-07, "loss": 0.1058, "step": 54790 }, { "epoch": 1.2777418320655183, "grad_norm": 2.449359178543091, "learning_rate": 5.74270969346515e-07, "loss": 0.1111, "step": 54800 }, { "epoch": 1.2779749934423363, "grad_norm": 1.4926284551620483, "learning_rate": 
5.741932475284461e-07, "loss": 0.1068, "step": 54810 }, { "epoch": 1.2782081548191542, "grad_norm": 2.64162540435791, "learning_rate": 5.741155257103773e-07, "loss": 0.101, "step": 54820 }, { "epoch": 1.278441316195972, "grad_norm": 1.2284226417541504, "learning_rate": 5.740378038923086e-07, "loss": 0.1083, "step": 54830 }, { "epoch": 1.27867447757279, "grad_norm": 2.823638439178467, "learning_rate": 5.739600820742399e-07, "loss": 0.1079, "step": 54840 }, { "epoch": 1.2789076389496081, "grad_norm": 1.2279740571975708, "learning_rate": 5.738823602561711e-07, "loss": 0.1037, "step": 54850 }, { "epoch": 1.279140800326426, "grad_norm": 1.3807382583618164, "learning_rate": 5.738046384381024e-07, "loss": 0.1128, "step": 54860 }, { "epoch": 1.2793739617032438, "grad_norm": 1.2609845399856567, "learning_rate": 5.737269166200335e-07, "loss": 0.1126, "step": 54870 }, { "epoch": 1.2796071230800619, "grad_norm": 1.9365390539169312, "learning_rate": 5.736491948019648e-07, "loss": 0.1087, "step": 54880 }, { "epoch": 1.2798402844568797, "grad_norm": 1.6505322456359863, "learning_rate": 5.73571472983896e-07, "loss": 0.1192, "step": 54890 }, { "epoch": 1.2800734458336978, "grad_norm": 1.7042723894119263, "learning_rate": 5.734937511658272e-07, "loss": 0.1136, "step": 54900 }, { "epoch": 1.2803066072105156, "grad_norm": 1.4317103624343872, "learning_rate": 5.734160293477585e-07, "loss": 0.1072, "step": 54910 }, { "epoch": 1.2805397685873334, "grad_norm": 2.017094612121582, "learning_rate": 5.733383075296897e-07, "loss": 0.1031, "step": 54920 }, { "epoch": 1.2807729299641515, "grad_norm": 1.1243751049041748, "learning_rate": 5.732605857116209e-07, "loss": 0.0984, "step": 54930 }, { "epoch": 1.2810060913409693, "grad_norm": 3.0960841178894043, "learning_rate": 5.731828638935522e-07, "loss": 0.1151, "step": 54940 }, { "epoch": 1.2812392527177874, "grad_norm": 1.5432978868484497, "learning_rate": 5.731051420754834e-07, "loss": 0.1142, "step": 54950 }, { "epoch": 1.2814724140946052, 
"grad_norm": 3.702371120452881, "learning_rate": 5.730274202574147e-07, "loss": 0.1104, "step": 54960 }, { "epoch": 1.281705575471423, "grad_norm": 3.280038595199585, "learning_rate": 5.729496984393458e-07, "loss": 0.1199, "step": 54970 }, { "epoch": 1.2819387368482411, "grad_norm": 1.6687347888946533, "learning_rate": 5.728719766212771e-07, "loss": 0.1086, "step": 54980 }, { "epoch": 1.282171898225059, "grad_norm": 1.410575270652771, "learning_rate": 5.727942548032083e-07, "loss": 0.1108, "step": 54990 }, { "epoch": 1.282405059601877, "grad_norm": 2.8556668758392334, "learning_rate": 5.727165329851395e-07, "loss": 0.112, "step": 55000 }, { "epoch": 1.2826382209786948, "grad_norm": 2.2744345664978027, "learning_rate": 5.726388111670708e-07, "loss": 0.1189, "step": 55010 }, { "epoch": 1.2828713823555127, "grad_norm": 1.8802440166473389, "learning_rate": 5.72561089349002e-07, "loss": 0.1077, "step": 55020 }, { "epoch": 1.2831045437323307, "grad_norm": 1.47622549533844, "learning_rate": 5.724833675309333e-07, "loss": 0.1139, "step": 55030 }, { "epoch": 1.2833377051091488, "grad_norm": 1.620671033859253, "learning_rate": 5.724056457128646e-07, "loss": 0.1237, "step": 55040 }, { "epoch": 1.2835708664859666, "grad_norm": 1.1655076742172241, "learning_rate": 5.723279238947956e-07, "loss": 0.1051, "step": 55050 }, { "epoch": 1.2838040278627845, "grad_norm": 2.720712184906006, "learning_rate": 5.722502020767269e-07, "loss": 0.1055, "step": 55060 }, { "epoch": 1.2840371892396025, "grad_norm": 1.6638636589050293, "learning_rate": 5.721724802586582e-07, "loss": 0.1062, "step": 55070 }, { "epoch": 1.2842703506164204, "grad_norm": 1.5271964073181152, "learning_rate": 5.720947584405894e-07, "loss": 0.1105, "step": 55080 }, { "epoch": 1.2845035119932384, "grad_norm": 2.7493555545806885, "learning_rate": 5.720170366225207e-07, "loss": 0.1116, "step": 55090 }, { "epoch": 1.2847366733700563, "grad_norm": 1.005234956741333, "learning_rate": 5.719393148044519e-07, "loss": 0.1055, 
"step": 55100 }, { "epoch": 1.284969834746874, "grad_norm": 1.4825360774993896, "learning_rate": 5.718615929863831e-07, "loss": 0.1176, "step": 55110 }, { "epoch": 1.2852029961236922, "grad_norm": 2.5219674110412598, "learning_rate": 5.717838711683144e-07, "loss": 0.11, "step": 55120 }, { "epoch": 1.28543615750051, "grad_norm": 1.1680666208267212, "learning_rate": 5.717061493502455e-07, "loss": 0.1093, "step": 55130 }, { "epoch": 1.285669318877328, "grad_norm": 1.700873851776123, "learning_rate": 5.716284275321768e-07, "loss": 0.1131, "step": 55140 }, { "epoch": 1.2859024802541459, "grad_norm": 1.5769859552383423, "learning_rate": 5.71550705714108e-07, "loss": 0.1099, "step": 55150 }, { "epoch": 1.2861356416309637, "grad_norm": 1.4592961072921753, "learning_rate": 5.714729838960393e-07, "loss": 0.1146, "step": 55160 }, { "epoch": 1.2863688030077818, "grad_norm": 1.3310896158218384, "learning_rate": 5.713952620779706e-07, "loss": 0.1004, "step": 55170 }, { "epoch": 1.2866019643845996, "grad_norm": 1.9879876375198364, "learning_rate": 5.713175402599017e-07, "loss": 0.1133, "step": 55180 }, { "epoch": 1.2868351257614177, "grad_norm": 1.84815514087677, "learning_rate": 5.71239818441833e-07, "loss": 0.1027, "step": 55190 }, { "epoch": 1.2870682871382355, "grad_norm": 1.1859108209609985, "learning_rate": 5.711620966237642e-07, "loss": 0.0989, "step": 55200 }, { "epoch": 1.2873014485150533, "grad_norm": 1.8004003763198853, "learning_rate": 5.710843748056954e-07, "loss": 0.1094, "step": 55210 }, { "epoch": 1.2875346098918714, "grad_norm": 1.509967565536499, "learning_rate": 5.710066529876267e-07, "loss": 0.1099, "step": 55220 }, { "epoch": 1.2877677712686895, "grad_norm": 1.6736100912094116, "learning_rate": 5.709289311695579e-07, "loss": 0.1101, "step": 55230 }, { "epoch": 1.2880009326455073, "grad_norm": 1.747218132019043, "learning_rate": 5.708512093514891e-07, "loss": 0.1013, "step": 55240 }, { "epoch": 1.2882340940223251, "grad_norm": 0.9206863045692444, 
"learning_rate": 5.707734875334203e-07, "loss": 0.1042, "step": 55250 }, { "epoch": 1.2884672553991432, "grad_norm": 2.6082205772399902, "learning_rate": 5.706957657153516e-07, "loss": 0.1077, "step": 55260 }, { "epoch": 1.288700416775961, "grad_norm": 1.3662811517715454, "learning_rate": 5.706180438972829e-07, "loss": 0.0988, "step": 55270 }, { "epoch": 1.288933578152779, "grad_norm": 2.629847288131714, "learning_rate": 5.705403220792141e-07, "loss": 0.1057, "step": 55280 }, { "epoch": 1.289166739529597, "grad_norm": 1.2561516761779785, "learning_rate": 5.704626002611453e-07, "loss": 0.1068, "step": 55290 }, { "epoch": 1.2893999009064148, "grad_norm": 1.3638817071914673, "learning_rate": 5.703848784430764e-07, "loss": 0.107, "step": 55300 }, { "epoch": 1.2896330622832328, "grad_norm": 1.336321234703064, "learning_rate": 5.703071566250077e-07, "loss": 0.0994, "step": 55310 }, { "epoch": 1.2898662236600507, "grad_norm": 1.5634093284606934, "learning_rate": 5.70229434806939e-07, "loss": 0.101, "step": 55320 }, { "epoch": 1.2900993850368687, "grad_norm": 1.7219393253326416, "learning_rate": 5.701517129888702e-07, "loss": 0.1064, "step": 55330 }, { "epoch": 1.2903325464136866, "grad_norm": 1.6620540618896484, "learning_rate": 5.700739911708015e-07, "loss": 0.121, "step": 55340 }, { "epoch": 1.2905657077905044, "grad_norm": 1.1433016061782837, "learning_rate": 5.699962693527327e-07, "loss": 0.1043, "step": 55350 }, { "epoch": 1.2907988691673224, "grad_norm": 1.357252597808838, "learning_rate": 5.699185475346639e-07, "loss": 0.1149, "step": 55360 }, { "epoch": 1.2910320305441403, "grad_norm": 2.114982843399048, "learning_rate": 5.698408257165951e-07, "loss": 0.1025, "step": 55370 }, { "epoch": 1.2912651919209583, "grad_norm": 3.5709757804870605, "learning_rate": 5.697631038985263e-07, "loss": 0.1112, "step": 55380 }, { "epoch": 1.2914983532977762, "grad_norm": 1.3517975807189941, "learning_rate": 5.696853820804576e-07, "loss": 0.1086, "step": 55390 }, { "epoch": 
1.291731514674594, "grad_norm": 3.344698667526245, "learning_rate": 5.696076602623888e-07, "loss": 0.1106, "step": 55400 }, { "epoch": 1.291964676051412, "grad_norm": 1.370832085609436, "learning_rate": 5.695299384443201e-07, "loss": 0.0967, "step": 55410 }, { "epoch": 1.2921978374282301, "grad_norm": 1.3822953701019287, "learning_rate": 5.694522166262513e-07, "loss": 0.1043, "step": 55420 }, { "epoch": 1.292430998805048, "grad_norm": 1.8405025005340576, "learning_rate": 5.693744948081825e-07, "loss": 0.1173, "step": 55430 }, { "epoch": 1.2926641601818658, "grad_norm": 1.602081060409546, "learning_rate": 5.692967729901138e-07, "loss": 0.1023, "step": 55440 }, { "epoch": 1.2928973215586839, "grad_norm": 2.2013750076293945, "learning_rate": 5.69219051172045e-07, "loss": 0.1016, "step": 55450 }, { "epoch": 1.2931304829355017, "grad_norm": 1.6479543447494507, "learning_rate": 5.691413293539762e-07, "loss": 0.1105, "step": 55460 }, { "epoch": 1.2933636443123198, "grad_norm": 2.177273750305176, "learning_rate": 5.690636075359075e-07, "loss": 0.1116, "step": 55470 }, { "epoch": 1.2935968056891376, "grad_norm": 3.183589220046997, "learning_rate": 5.689936578996456e-07, "loss": 0.1088, "step": 55480 }, { "epoch": 1.2938299670659554, "grad_norm": 1.9664616584777832, "learning_rate": 5.689159360815769e-07, "loss": 0.1231, "step": 55490 }, { "epoch": 1.2940631284427735, "grad_norm": 1.8519917726516724, "learning_rate": 5.68838214263508e-07, "loss": 0.1047, "step": 55500 }, { "epoch": 1.2942962898195913, "grad_norm": 2.0677759647369385, "learning_rate": 5.687604924454392e-07, "loss": 0.1094, "step": 55510 }, { "epoch": 1.2945294511964094, "grad_norm": 1.223796010017395, "learning_rate": 5.686827706273704e-07, "loss": 0.1012, "step": 55520 }, { "epoch": 1.2947626125732272, "grad_norm": 2.0249388217926025, "learning_rate": 5.686050488093017e-07, "loss": 0.1059, "step": 55530 }, { "epoch": 1.294995773950045, "grad_norm": 2.61891770362854, "learning_rate": 5.68527326991233e-07, 
"loss": 0.1145, "step": 55540 }, { "epoch": 1.2952289353268631, "grad_norm": 2.5531463623046875, "learning_rate": 5.684496051731642e-07, "loss": 0.1196, "step": 55550 }, { "epoch": 1.295462096703681, "grad_norm": 2.6335034370422363, "learning_rate": 5.683718833550955e-07, "loss": 0.1088, "step": 55560 }, { "epoch": 1.295695258080499, "grad_norm": 3.0161609649658203, "learning_rate": 5.682941615370267e-07, "loss": 0.1071, "step": 55570 }, { "epoch": 1.2959284194573168, "grad_norm": 1.4410125017166138, "learning_rate": 5.682164397189578e-07, "loss": 0.1056, "step": 55580 }, { "epoch": 1.296161580834135, "grad_norm": 1.5687010288238525, "learning_rate": 5.681387179008891e-07, "loss": 0.1038, "step": 55590 }, { "epoch": 1.2963947422109527, "grad_norm": 1.6143771409988403, "learning_rate": 5.680609960828203e-07, "loss": 0.1073, "step": 55600 }, { "epoch": 1.2966279035877708, "grad_norm": 1.906398057937622, "learning_rate": 5.679832742647516e-07, "loss": 0.1035, "step": 55610 }, { "epoch": 1.2968610649645886, "grad_norm": 2.041571855545044, "learning_rate": 5.679055524466828e-07, "loss": 0.1093, "step": 55620 }, { "epoch": 1.2970942263414065, "grad_norm": 2.8281054496765137, "learning_rate": 5.678278306286141e-07, "loss": 0.1111, "step": 55630 }, { "epoch": 1.2973273877182245, "grad_norm": 1.5935364961624146, "learning_rate": 5.677501088105453e-07, "loss": 0.1012, "step": 55640 }, { "epoch": 1.2975605490950424, "grad_norm": 1.462843656539917, "learning_rate": 5.676723869924765e-07, "loss": 0.1102, "step": 55650 }, { "epoch": 1.2977937104718604, "grad_norm": 1.354611873626709, "learning_rate": 5.675946651744077e-07, "loss": 0.1061, "step": 55660 }, { "epoch": 1.2980268718486783, "grad_norm": 1.6519173383712769, "learning_rate": 5.675169433563389e-07, "loss": 0.1051, "step": 55670 }, { "epoch": 1.298260033225496, "grad_norm": 1.3127163648605347, "learning_rate": 5.674392215382702e-07, "loss": 0.1143, "step": 55680 }, { "epoch": 1.2984931946023142, "grad_norm": 
1.777182936668396, "learning_rate": 5.673614997202015e-07, "loss": 0.1135, "step": 55690 }, { "epoch": 1.298726355979132, "grad_norm": 2.580634355545044, "learning_rate": 5.672837779021326e-07, "loss": 0.1128, "step": 55700 }, { "epoch": 1.29895951735595, "grad_norm": 1.2606784105300903, "learning_rate": 5.672060560840639e-07, "loss": 0.111, "step": 55710 }, { "epoch": 1.299192678732768, "grad_norm": 2.4399328231811523, "learning_rate": 5.671283342659951e-07, "loss": 0.1137, "step": 55720 }, { "epoch": 1.2994258401095857, "grad_norm": 1.4854366779327393, "learning_rate": 5.670506124479264e-07, "loss": 0.1079, "step": 55730 }, { "epoch": 1.2996590014864038, "grad_norm": 2.759061098098755, "learning_rate": 5.669728906298576e-07, "loss": 0.1091, "step": 55740 }, { "epoch": 1.2998921628632218, "grad_norm": 2.381392478942871, "learning_rate": 5.668951688117888e-07, "loss": 0.112, "step": 55750 }, { "epoch": 1.3001253242400397, "grad_norm": 1.8923015594482422, "learning_rate": 5.6681744699372e-07, "loss": 0.1034, "step": 55760 }, { "epoch": 1.3003584856168575, "grad_norm": 1.882035493850708, "learning_rate": 5.667397251756512e-07, "loss": 0.1119, "step": 55770 }, { "epoch": 1.3005916469936756, "grad_norm": 1.7451143264770508, "learning_rate": 5.666620033575825e-07, "loss": 0.1043, "step": 55780 }, { "epoch": 1.3008248083704934, "grad_norm": 2.0562663078308105, "learning_rate": 5.665842815395138e-07, "loss": 0.1074, "step": 55790 }, { "epoch": 1.3010579697473115, "grad_norm": 1.3806205987930298, "learning_rate": 5.66506559721445e-07, "loss": 0.111, "step": 55800 }, { "epoch": 1.3012911311241293, "grad_norm": 1.3021228313446045, "learning_rate": 5.664288379033763e-07, "loss": 0.1117, "step": 55810 }, { "epoch": 1.3015242925009471, "grad_norm": 2.0878725051879883, "learning_rate": 5.663511160853073e-07, "loss": 0.1157, "step": 55820 }, { "epoch": 1.3017574538777652, "grad_norm": 1.1544668674468994, "learning_rate": 5.662733942672386e-07, "loss": 0.1042, "step": 55830 }, { 
"epoch": 1.301990615254583, "grad_norm": 1.8589831590652466, "learning_rate": 5.661956724491699e-07, "loss": 0.1043, "step": 55840 }, { "epoch": 1.302223776631401, "grad_norm": 1.1413992643356323, "learning_rate": 5.661179506311011e-07, "loss": 0.1095, "step": 55850 }, { "epoch": 1.302456938008219, "grad_norm": 1.8587771654129028, "learning_rate": 5.660402288130324e-07, "loss": 0.1002, "step": 55860 }, { "epoch": 1.3026900993850368, "grad_norm": 1.0797604322433472, "learning_rate": 5.659625069949637e-07, "loss": 0.102, "step": 55870 }, { "epoch": 1.3029232607618548, "grad_norm": 1.2471182346343994, "learning_rate": 5.658847851768948e-07, "loss": 0.109, "step": 55880 }, { "epoch": 1.3031564221386727, "grad_norm": 1.538725733757019, "learning_rate": 5.658070633588261e-07, "loss": 0.1118, "step": 55890 }, { "epoch": 1.3033895835154907, "grad_norm": 1.4045621156692505, "learning_rate": 5.657293415407572e-07, "loss": 0.1091, "step": 55900 }, { "epoch": 1.3036227448923086, "grad_norm": 2.3213207721710205, "learning_rate": 5.656516197226885e-07, "loss": 0.1069, "step": 55910 }, { "epoch": 1.3038559062691264, "grad_norm": 1.2518746852874756, "learning_rate": 5.655738979046198e-07, "loss": 0.1114, "step": 55920 }, { "epoch": 1.3040890676459445, "grad_norm": 1.8745924234390259, "learning_rate": 5.65496176086551e-07, "loss": 0.0929, "step": 55930 }, { "epoch": 1.3043222290227625, "grad_norm": 1.8611338138580322, "learning_rate": 5.654184542684822e-07, "loss": 0.0965, "step": 55940 }, { "epoch": 1.3045553903995804, "grad_norm": 1.7346307039260864, "learning_rate": 5.653407324504134e-07, "loss": 0.1038, "step": 55950 }, { "epoch": 1.3047885517763982, "grad_norm": 2.0572144985198975, "learning_rate": 5.652630106323447e-07, "loss": 0.1107, "step": 55960 }, { "epoch": 1.3050217131532162, "grad_norm": 3.841176748275757, "learning_rate": 5.65185288814276e-07, "loss": 0.1098, "step": 55970 }, { "epoch": 1.305254874530034, "grad_norm": 1.8320242166519165, "learning_rate": 
5.651075669962071e-07, "loss": 0.111, "step": 55980 }, { "epoch": 1.3054880359068521, "grad_norm": 1.9907888174057007, "learning_rate": 5.650298451781384e-07, "loss": 0.1122, "step": 55990 }, { "epoch": 1.30572119728367, "grad_norm": 3.3420767784118652, "learning_rate": 5.649521233600695e-07, "loss": 0.1098, "step": 56000 }, { "epoch": 1.3059543586604878, "grad_norm": 1.9453542232513428, "learning_rate": 5.648744015420008e-07, "loss": 0.109, "step": 56010 }, { "epoch": 1.3061875200373059, "grad_norm": 1.7438350915908813, "learning_rate": 5.647966797239321e-07, "loss": 0.1102, "step": 56020 }, { "epoch": 1.3064206814141237, "grad_norm": 1.871178150177002, "learning_rate": 5.647189579058633e-07, "loss": 0.1125, "step": 56030 }, { "epoch": 1.3066538427909418, "grad_norm": 1.4874364137649536, "learning_rate": 5.646412360877946e-07, "loss": 0.1202, "step": 56040 }, { "epoch": 1.3068870041677596, "grad_norm": 1.3816068172454834, "learning_rate": 5.645635142697258e-07, "loss": 0.1167, "step": 56050 }, { "epoch": 1.3071201655445774, "grad_norm": 1.4477803707122803, "learning_rate": 5.64485792451657e-07, "loss": 0.1213, "step": 56060 }, { "epoch": 1.3073533269213955, "grad_norm": 1.3034095764160156, "learning_rate": 5.644080706335882e-07, "loss": 0.1139, "step": 56070 }, { "epoch": 1.3075864882982133, "grad_norm": 1.3596071004867554, "learning_rate": 5.643303488155194e-07, "loss": 0.1114, "step": 56080 }, { "epoch": 1.3078196496750314, "grad_norm": 1.665572166442871, "learning_rate": 5.642526269974507e-07, "loss": 0.1003, "step": 56090 }, { "epoch": 1.3080528110518492, "grad_norm": 2.498850107192993, "learning_rate": 5.641749051793819e-07, "loss": 0.1063, "step": 56100 }, { "epoch": 1.308285972428667, "grad_norm": 3.421926736831665, "learning_rate": 5.640971833613132e-07, "loss": 0.1072, "step": 56110 }, { "epoch": 1.3085191338054851, "grad_norm": 1.4620336294174194, "learning_rate": 5.640194615432445e-07, "loss": 0.1081, "step": 56120 }, { "epoch": 1.3087522951823032, 
"grad_norm": 2.002582550048828, "learning_rate": 5.639417397251756e-07, "loss": 0.0955, "step": 56130 }, { "epoch": 1.308985456559121, "grad_norm": 1.6419614553451538, "learning_rate": 5.638640179071068e-07, "loss": 0.1041, "step": 56140 }, { "epoch": 1.3092186179359389, "grad_norm": 1.554440975189209, "learning_rate": 5.63786296089038e-07, "loss": 0.1025, "step": 56150 }, { "epoch": 1.309451779312757, "grad_norm": 7.303431987762451, "learning_rate": 5.637085742709693e-07, "loss": 0.1071, "step": 56160 }, { "epoch": 1.3096849406895748, "grad_norm": 1.1598385572433472, "learning_rate": 5.636308524529006e-07, "loss": 0.1017, "step": 56170 }, { "epoch": 1.3099181020663928, "grad_norm": 2.3992066383361816, "learning_rate": 5.635531306348318e-07, "loss": 0.1089, "step": 56180 }, { "epoch": 1.3101512634432106, "grad_norm": 1.4864863157272339, "learning_rate": 5.63475408816763e-07, "loss": 0.1048, "step": 56190 }, { "epoch": 1.3103844248200285, "grad_norm": 1.2370315790176392, "learning_rate": 5.633976869986942e-07, "loss": 0.1102, "step": 56200 }, { "epoch": 1.3106175861968465, "grad_norm": 1.7546238899230957, "learning_rate": 5.633199651806255e-07, "loss": 0.113, "step": 56210 }, { "epoch": 1.3108507475736644, "grad_norm": 2.7139787673950195, "learning_rate": 5.632422433625567e-07, "loss": 0.1133, "step": 56220 }, { "epoch": 1.3110839089504824, "grad_norm": 1.0213392972946167, "learning_rate": 5.631645215444879e-07, "loss": 0.1164, "step": 56230 }, { "epoch": 1.3113170703273003, "grad_norm": 1.835268497467041, "learning_rate": 5.630867997264192e-07, "loss": 0.1108, "step": 56240 }, { "epoch": 1.311550231704118, "grad_norm": 1.735947847366333, "learning_rate": 5.630090779083503e-07, "loss": 0.1223, "step": 56250 }, { "epoch": 1.3117833930809362, "grad_norm": 2.013819932937622, "learning_rate": 5.629313560902816e-07, "loss": 0.116, "step": 56260 }, { "epoch": 1.312016554457754, "grad_norm": 1.6628472805023193, "learning_rate": 5.628536342722129e-07, "loss": 0.1141, 
"step": 56270 }, { "epoch": 1.312249715834572, "grad_norm": 1.3745520114898682, "learning_rate": 5.627759124541441e-07, "loss": 0.1014, "step": 56280 }, { "epoch": 1.31248287721139, "grad_norm": 1.2189887762069702, "learning_rate": 5.626981906360754e-07, "loss": 0.1112, "step": 56290 }, { "epoch": 1.3127160385882077, "grad_norm": 1.691633701324463, "learning_rate": 5.626204688180066e-07, "loss": 0.1265, "step": 56300 }, { "epoch": 1.3129491999650258, "grad_norm": 2.0274498462677, "learning_rate": 5.625427469999377e-07, "loss": 0.1044, "step": 56310 }, { "epoch": 1.3131823613418439, "grad_norm": 1.3518000841140747, "learning_rate": 5.62465025181869e-07, "loss": 0.1164, "step": 56320 }, { "epoch": 1.3134155227186617, "grad_norm": 1.4770598411560059, "learning_rate": 5.623873033638002e-07, "loss": 0.1061, "step": 56330 }, { "epoch": 1.3136486840954795, "grad_norm": 1.5092780590057373, "learning_rate": 5.623095815457315e-07, "loss": 0.108, "step": 56340 }, { "epoch": 1.3138818454722976, "grad_norm": 1.298223853111267, "learning_rate": 5.622318597276628e-07, "loss": 0.1246, "step": 56350 }, { "epoch": 1.3141150068491154, "grad_norm": 3.1010868549346924, "learning_rate": 5.62154137909594e-07, "loss": 0.1067, "step": 56360 }, { "epoch": 1.3143481682259335, "grad_norm": 3.512404680252075, "learning_rate": 5.620764160915253e-07, "loss": 0.1238, "step": 56370 }, { "epoch": 1.3145813296027513, "grad_norm": 1.3558948040008545, "learning_rate": 5.619986942734563e-07, "loss": 0.1039, "step": 56380 }, { "epoch": 1.3148144909795692, "grad_norm": 1.2549159526824951, "learning_rate": 5.619209724553876e-07, "loss": 0.1114, "step": 56390 }, { "epoch": 1.3150476523563872, "grad_norm": 1.8185533285140991, "learning_rate": 5.618432506373189e-07, "loss": 0.1146, "step": 56400 }, { "epoch": 1.315280813733205, "grad_norm": 1.3571897745132446, "learning_rate": 5.617655288192501e-07, "loss": 0.1035, "step": 56410 }, { "epoch": 1.315513975110023, "grad_norm": 2.942807197570801, 
"learning_rate": 5.616878070011814e-07, "loss": 0.1174, "step": 56420 }, { "epoch": 1.315747136486841, "grad_norm": 2.19146728515625, "learning_rate": 5.616100851831126e-07, "loss": 0.1049, "step": 56430 }, { "epoch": 1.3159802978636588, "grad_norm": 1.5248535871505737, "learning_rate": 5.615323633650438e-07, "loss": 0.1068, "step": 56440 }, { "epoch": 1.3162134592404768, "grad_norm": 2.040308952331543, "learning_rate": 5.614546415469751e-07, "loss": 0.0994, "step": 56450 }, { "epoch": 1.3164466206172947, "grad_norm": 1.8355132341384888, "learning_rate": 5.613769197289063e-07, "loss": 0.1194, "step": 56460 }, { "epoch": 1.3166797819941127, "grad_norm": 1.4482269287109375, "learning_rate": 5.612991979108375e-07, "loss": 0.1196, "step": 56470 }, { "epoch": 1.3169129433709306, "grad_norm": 2.7572929859161377, "learning_rate": 5.612214760927687e-07, "loss": 0.1282, "step": 56480 }, { "epoch": 1.3171461047477484, "grad_norm": 1.568135142326355, "learning_rate": 5.611437542747e-07, "loss": 0.1134, "step": 56490 }, { "epoch": 1.3173792661245665, "grad_norm": 1.3413355350494385, "learning_rate": 5.610660324566312e-07, "loss": 0.1176, "step": 56500 }, { "epoch": 1.3176124275013845, "grad_norm": 1.7137985229492188, "learning_rate": 5.609883106385624e-07, "loss": 0.1153, "step": 56510 }, { "epoch": 1.3178455888782024, "grad_norm": 1.111842393875122, "learning_rate": 5.609105888204937e-07, "loss": 0.1003, "step": 56520 }, { "epoch": 1.3180787502550202, "grad_norm": 1.311193823814392, "learning_rate": 5.608328670024249e-07, "loss": 0.1111, "step": 56530 }, { "epoch": 1.3183119116318383, "grad_norm": 1.9925382137298584, "learning_rate": 5.607551451843562e-07, "loss": 0.1154, "step": 56540 }, { "epoch": 1.318545073008656, "grad_norm": 1.8917386531829834, "learning_rate": 5.606774233662874e-07, "loss": 0.1092, "step": 56550 }, { "epoch": 1.3187782343854741, "grad_norm": 1.7865290641784668, "learning_rate": 5.605997015482185e-07, "loss": 0.1201, "step": 56560 }, { "epoch": 
1.319011395762292, "grad_norm": 1.451187014579773, "learning_rate": 5.605219797301498e-07, "loss": 0.0972, "step": 56570 }, { "epoch": 1.3192445571391098, "grad_norm": 1.3357549905776978, "learning_rate": 5.60444257912081e-07, "loss": 0.1035, "step": 56580 }, { "epoch": 1.3194777185159279, "grad_norm": 2.9218878746032715, "learning_rate": 5.603665360940123e-07, "loss": 0.1227, "step": 56590 }, { "epoch": 1.3197108798927457, "grad_norm": 1.944416880607605, "learning_rate": 5.602888142759436e-07, "loss": 0.1082, "step": 56600 }, { "epoch": 1.3199440412695638, "grad_norm": 1.181036353111267, "learning_rate": 5.602110924578748e-07, "loss": 0.1126, "step": 56610 }, { "epoch": 1.3201772026463816, "grad_norm": 2.23905348777771, "learning_rate": 5.60133370639806e-07, "loss": 0.1119, "step": 56620 }, { "epoch": 1.3204103640231994, "grad_norm": 1.4890968799591064, "learning_rate": 5.600556488217371e-07, "loss": 0.1043, "step": 56630 }, { "epoch": 1.3206435254000175, "grad_norm": 1.9164146184921265, "learning_rate": 5.599779270036684e-07, "loss": 0.1131, "step": 56640 }, { "epoch": 1.3208766867768353, "grad_norm": 1.6123727560043335, "learning_rate": 5.599002051855997e-07, "loss": 0.1207, "step": 56650 }, { "epoch": 1.3211098481536534, "grad_norm": 1.3121978044509888, "learning_rate": 5.598224833675309e-07, "loss": 0.113, "step": 56660 }, { "epoch": 1.3213430095304712, "grad_norm": 3.2001423835754395, "learning_rate": 5.597447615494622e-07, "loss": 0.1114, "step": 56670 }, { "epoch": 1.321576170907289, "grad_norm": 1.2826844453811646, "learning_rate": 5.596670397313933e-07, "loss": 0.097, "step": 56680 }, { "epoch": 1.3218093322841071, "grad_norm": 1.8829094171524048, "learning_rate": 5.595893179133246e-07, "loss": 0.122, "step": 56690 }, { "epoch": 1.3220424936609252, "grad_norm": 1.309313416481018, "learning_rate": 5.595115960952559e-07, "loss": 0.1087, "step": 56700 }, { "epoch": 1.322275655037743, "grad_norm": 1.384460210800171, "learning_rate": 5.59433874277187e-07, 
"loss": 0.1139, "step": 56710 }, { "epoch": 1.3225088164145609, "grad_norm": 1.3553816080093384, "learning_rate": 5.593561524591183e-07, "loss": 0.1106, "step": 56720 }, { "epoch": 1.322741977791379, "grad_norm": 1.4838111400604248, "learning_rate": 5.592784306410496e-07, "loss": 0.108, "step": 56730 }, { "epoch": 1.3229751391681968, "grad_norm": 1.5552432537078857, "learning_rate": 5.592007088229807e-07, "loss": 0.11, "step": 56740 }, { "epoch": 1.3232083005450148, "grad_norm": 1.5432097911834717, "learning_rate": 5.59122987004912e-07, "loss": 0.1142, "step": 56750 }, { "epoch": 1.3234414619218327, "grad_norm": 1.142555594444275, "learning_rate": 5.590452651868432e-07, "loss": 0.1145, "step": 56760 }, { "epoch": 1.3236746232986505, "grad_norm": 1.703552484512329, "learning_rate": 5.589675433687745e-07, "loss": 0.1136, "step": 56770 }, { "epoch": 1.3239077846754685, "grad_norm": 2.808351993560791, "learning_rate": 5.588898215507058e-07, "loss": 0.1182, "step": 56780 }, { "epoch": 1.3241409460522864, "grad_norm": 1.8719432353973389, "learning_rate": 5.588120997326369e-07, "loss": 0.1002, "step": 56790 }, { "epoch": 1.3243741074291044, "grad_norm": 3.1085686683654785, "learning_rate": 5.587343779145681e-07, "loss": 0.1155, "step": 56800 }, { "epoch": 1.3246072688059223, "grad_norm": 1.2768596410751343, "learning_rate": 5.586566560964993e-07, "loss": 0.1223, "step": 56810 }, { "epoch": 1.3248404301827401, "grad_norm": 1.391087293624878, "learning_rate": 5.585789342784306e-07, "loss": 0.1186, "step": 56820 }, { "epoch": 1.3250735915595582, "grad_norm": 1.1543092727661133, "learning_rate": 5.585012124603619e-07, "loss": 0.1145, "step": 56830 }, { "epoch": 1.325306752936376, "grad_norm": 1.1999963521957397, "learning_rate": 5.584234906422931e-07, "loss": 0.1097, "step": 56840 }, { "epoch": 1.325539914313194, "grad_norm": 1.5587507486343384, "learning_rate": 5.583457688242244e-07, "loss": 0.1081, "step": 56850 }, { "epoch": 1.325773075690012, "grad_norm": 
1.3442132472991943, "learning_rate": 5.582680470061556e-07, "loss": 0.1186, "step": 56860 }, { "epoch": 1.32600623706683, "grad_norm": 1.7261768579483032, "learning_rate": 5.581903251880867e-07, "loss": 0.1192, "step": 56870 }, { "epoch": 1.3262393984436478, "grad_norm": 2.3528425693511963, "learning_rate": 5.58112603370018e-07, "loss": 0.1178, "step": 56880 }, { "epoch": 1.3264725598204659, "grad_norm": 2.197495222091675, "learning_rate": 5.580348815519492e-07, "loss": 0.1084, "step": 56890 }, { "epoch": 1.3267057211972837, "grad_norm": 3.0110394954681396, "learning_rate": 5.579571597338805e-07, "loss": 0.0949, "step": 56900 }, { "epoch": 1.3269388825741015, "grad_norm": 1.604404330253601, "learning_rate": 5.578794379158117e-07, "loss": 0.1205, "step": 56910 }, { "epoch": 1.3271720439509196, "grad_norm": 1.2428617477416992, "learning_rate": 5.57801716097743e-07, "loss": 0.1113, "step": 56920 }, { "epoch": 1.3274052053277374, "grad_norm": 2.693373680114746, "learning_rate": 5.577239942796742e-07, "loss": 0.1044, "step": 56930 }, { "epoch": 1.3276383667045555, "grad_norm": 2.5690231323242188, "learning_rate": 5.576462724616054e-07, "loss": 0.1022, "step": 56940 }, { "epoch": 1.3278715280813733, "grad_norm": 3.241459369659424, "learning_rate": 5.575685506435366e-07, "loss": 0.1183, "step": 56950 }, { "epoch": 1.3281046894581912, "grad_norm": 1.0792219638824463, "learning_rate": 5.574908288254678e-07, "loss": 0.1109, "step": 56960 }, { "epoch": 1.3283378508350092, "grad_norm": 1.1782426834106445, "learning_rate": 5.574131070073991e-07, "loss": 0.1043, "step": 56970 }, { "epoch": 1.328571012211827, "grad_norm": 1.5267343521118164, "learning_rate": 5.573353851893304e-07, "loss": 0.1105, "step": 56980 }, { "epoch": 1.328804173588645, "grad_norm": 1.6768888235092163, "learning_rate": 5.572576633712615e-07, "loss": 0.1027, "step": 56990 }, { "epoch": 1.329037334965463, "grad_norm": 2.3193655014038086, "learning_rate": 5.571799415531928e-07, "loss": 0.1088, "step": 57000 }, 
{ "epoch": 1.3292704963422808, "grad_norm": 1.4557896852493286, "learning_rate": 5.57102219735124e-07, "loss": 0.1054, "step": 57010 }, { "epoch": 1.3295036577190988, "grad_norm": 1.406930923461914, "learning_rate": 5.570244979170553e-07, "loss": 0.1126, "step": 57020 }, { "epoch": 1.329736819095917, "grad_norm": 3.0630502700805664, "learning_rate": 5.569467760989865e-07, "loss": 0.1049, "step": 57030 }, { "epoch": 1.3299699804727347, "grad_norm": 1.5877015590667725, "learning_rate": 5.568690542809177e-07, "loss": 0.1167, "step": 57040 }, { "epoch": 1.3302031418495526, "grad_norm": 1.0778223276138306, "learning_rate": 5.567913324628489e-07, "loss": 0.107, "step": 57050 }, { "epoch": 1.3304363032263706, "grad_norm": 1.0566843748092651, "learning_rate": 5.567136106447801e-07, "loss": 0.1037, "step": 57060 }, { "epoch": 1.3306694646031885, "grad_norm": 2.701838731765747, "learning_rate": 5.566358888267114e-07, "loss": 0.1065, "step": 57070 }, { "epoch": 1.3309026259800065, "grad_norm": 1.3038735389709473, "learning_rate": 5.565581670086427e-07, "loss": 0.1112, "step": 57080 }, { "epoch": 1.3311357873568244, "grad_norm": 2.192873239517212, "learning_rate": 5.564804451905739e-07, "loss": 0.1166, "step": 57090 }, { "epoch": 1.3313689487336422, "grad_norm": 1.3142659664154053, "learning_rate": 5.564027233725052e-07, "loss": 0.1052, "step": 57100 }, { "epoch": 1.3316021101104603, "grad_norm": 2.1809120178222656, "learning_rate": 5.563250015544362e-07, "loss": 0.1066, "step": 57110 }, { "epoch": 1.331835271487278, "grad_norm": 1.1814895868301392, "learning_rate": 5.562472797363675e-07, "loss": 0.1114, "step": 57120 }, { "epoch": 1.3320684328640962, "grad_norm": 1.1214594841003418, "learning_rate": 5.561695579182988e-07, "loss": 0.1091, "step": 57130 }, { "epoch": 1.332301594240914, "grad_norm": 1.3440494537353516, "learning_rate": 5.5609183610023e-07, "loss": 0.106, "step": 57140 }, { "epoch": 1.3325347556177318, "grad_norm": 1.9129890203475952, "learning_rate": 
5.560141142821613e-07, "loss": 0.1095, "step": 57150 }, { "epoch": 1.3327679169945499, "grad_norm": 1.4720643758773804, "learning_rate": 5.559363924640926e-07, "loss": 0.1108, "step": 57160 }, { "epoch": 1.3330010783713677, "grad_norm": 1.6941496133804321, "learning_rate": 5.558586706460237e-07, "loss": 0.1108, "step": 57170 }, { "epoch": 1.3332342397481858, "grad_norm": 2.1989502906799316, "learning_rate": 5.55780948827955e-07, "loss": 0.1041, "step": 57180 }, { "epoch": 1.3334674011250036, "grad_norm": 1.525704026222229, "learning_rate": 5.557032270098861e-07, "loss": 0.1015, "step": 57190 }, { "epoch": 1.3337005625018215, "grad_norm": 1.783607006072998, "learning_rate": 5.556255051918174e-07, "loss": 0.1225, "step": 57200 }, { "epoch": 1.3339337238786395, "grad_norm": 1.351188063621521, "learning_rate": 5.555477833737487e-07, "loss": 0.1056, "step": 57210 }, { "epoch": 1.3341668852554576, "grad_norm": 1.854447364807129, "learning_rate": 5.554700615556799e-07, "loss": 0.1038, "step": 57220 }, { "epoch": 1.3344000466322754, "grad_norm": 3.1275665760040283, "learning_rate": 5.553923397376111e-07, "loss": 0.1186, "step": 57230 }, { "epoch": 1.3346332080090932, "grad_norm": 2.5200743675231934, "learning_rate": 5.553146179195423e-07, "loss": 0.1001, "step": 57240 }, { "epoch": 1.3348663693859113, "grad_norm": 1.824141502380371, "learning_rate": 5.552368961014736e-07, "loss": 0.1175, "step": 57250 }, { "epoch": 1.3350995307627291, "grad_norm": 1.5824979543685913, "learning_rate": 5.551591742834049e-07, "loss": 0.1145, "step": 57260 }, { "epoch": 1.3353326921395472, "grad_norm": 1.3175841569900513, "learning_rate": 5.55081452465336e-07, "loss": 0.1096, "step": 57270 }, { "epoch": 1.335565853516365, "grad_norm": 1.8951148986816406, "learning_rate": 5.550037306472673e-07, "loss": 0.1018, "step": 57280 }, { "epoch": 1.3357990148931829, "grad_norm": 1.3085025548934937, "learning_rate": 5.549260088291985e-07, "loss": 0.114, "step": 57290 }, { "epoch": 1.336032176270001, 
"grad_norm": 1.2617965936660767, "learning_rate": 5.548482870111297e-07, "loss": 0.0996, "step": 57300 }, { "epoch": 1.3362653376468188, "grad_norm": 1.249271035194397, "learning_rate": 5.54770565193061e-07, "loss": 0.1059, "step": 57310 }, { "epoch": 1.3364984990236368, "grad_norm": 2.6254403591156006, "learning_rate": 5.546928433749922e-07, "loss": 0.123, "step": 57320 }, { "epoch": 1.3367316604004547, "grad_norm": 3.3189477920532227, "learning_rate": 5.546151215569235e-07, "loss": 0.114, "step": 57330 }, { "epoch": 1.3369648217772725, "grad_norm": 1.5578491687774658, "learning_rate": 5.545373997388547e-07, "loss": 0.1119, "step": 57340 }, { "epoch": 1.3371979831540906, "grad_norm": 2.3693912029266357, "learning_rate": 5.544596779207859e-07, "loss": 0.1014, "step": 57350 }, { "epoch": 1.3374311445309084, "grad_norm": 1.4345605373382568, "learning_rate": 5.543819561027171e-07, "loss": 0.1125, "step": 57360 }, { "epoch": 1.3376643059077264, "grad_norm": 2.162090301513672, "learning_rate": 5.543042342846483e-07, "loss": 0.1121, "step": 57370 }, { "epoch": 1.3378974672845443, "grad_norm": 1.6986687183380127, "learning_rate": 5.542265124665796e-07, "loss": 0.1051, "step": 57380 }, { "epoch": 1.3381306286613621, "grad_norm": 1.5226773023605347, "learning_rate": 5.541487906485108e-07, "loss": 0.1147, "step": 57390 }, { "epoch": 1.3383637900381802, "grad_norm": 1.8145081996917725, "learning_rate": 5.540710688304421e-07, "loss": 0.1087, "step": 57400 }, { "epoch": 1.3385969514149982, "grad_norm": 1.6750638484954834, "learning_rate": 5.539933470123734e-07, "loss": 0.1035, "step": 57410 }, { "epoch": 1.338830112791816, "grad_norm": 1.4753146171569824, "learning_rate": 5.539156251943045e-07, "loss": 0.1191, "step": 57420 }, { "epoch": 1.339063274168634, "grad_norm": 1.9849209785461426, "learning_rate": 5.538379033762357e-07, "loss": 0.1151, "step": 57430 }, { "epoch": 1.339296435545452, "grad_norm": 2.060681104660034, "learning_rate": 5.537601815581669e-07, "loss": 0.1127, 
"step": 57440 }, { "epoch": 1.3395295969222698, "grad_norm": 1.6937578916549683, "learning_rate": 5.536824597400982e-07, "loss": 0.1163, "step": 57450 }, { "epoch": 1.3397627582990879, "grad_norm": 1.230584740638733, "learning_rate": 5.536047379220295e-07, "loss": 0.1029, "step": 57460 }, { "epoch": 1.3399959196759057, "grad_norm": 1.1992590427398682, "learning_rate": 5.535270161039607e-07, "loss": 0.1044, "step": 57470 }, { "epoch": 1.3402290810527235, "grad_norm": 1.3098167181015015, "learning_rate": 5.534492942858919e-07, "loss": 0.1089, "step": 57480 }, { "epoch": 1.3404622424295416, "grad_norm": 1.5042924880981445, "learning_rate": 5.533715724678231e-07, "loss": 0.1099, "step": 57490 }, { "epoch": 1.3406954038063594, "grad_norm": 1.4071158170700073, "learning_rate": 5.532938506497544e-07, "loss": 0.1055, "step": 57500 }, { "epoch": 1.3409285651831775, "grad_norm": 2.1152877807617188, "learning_rate": 5.532161288316856e-07, "loss": 0.1091, "step": 57510 }, { "epoch": 1.3411617265599953, "grad_norm": 1.9709200859069824, "learning_rate": 5.531384070136168e-07, "loss": 0.1189, "step": 57520 }, { "epoch": 1.3413948879368132, "grad_norm": 1.283282995223999, "learning_rate": 5.530606851955481e-07, "loss": 0.1145, "step": 57530 }, { "epoch": 1.3416280493136312, "grad_norm": 1.834809422492981, "learning_rate": 5.529829633774792e-07, "loss": 0.1073, "step": 57540 }, { "epoch": 1.341861210690449, "grad_norm": 1.1457983255386353, "learning_rate": 5.529052415594105e-07, "loss": 0.1051, "step": 57550 }, { "epoch": 1.3420943720672671, "grad_norm": 1.2666575908660889, "learning_rate": 5.528275197413418e-07, "loss": 0.1197, "step": 57560 }, { "epoch": 1.342327533444085, "grad_norm": 2.561948299407959, "learning_rate": 5.52749797923273e-07, "loss": 0.1144, "step": 57570 }, { "epoch": 1.3425606948209028, "grad_norm": 1.9823685884475708, "learning_rate": 5.526720761052043e-07, "loss": 0.1124, "step": 57580 }, { "epoch": 1.3427938561977208, "grad_norm": 3.063386917114258, 
"learning_rate": 5.525943542871355e-07, "loss": 0.1075, "step": 57590 }, { "epoch": 1.343027017574539, "grad_norm": 1.443787932395935, "learning_rate": 5.525166324690666e-07, "loss": 0.1062, "step": 57600 }, { "epoch": 1.3432601789513567, "grad_norm": 1.4937652349472046, "learning_rate": 5.524389106509979e-07, "loss": 0.1174, "step": 57610 }, { "epoch": 1.3434933403281746, "grad_norm": 1.053694486618042, "learning_rate": 5.523611888329291e-07, "loss": 0.1095, "step": 57620 }, { "epoch": 1.3437265017049926, "grad_norm": 1.2644284963607788, "learning_rate": 5.522834670148604e-07, "loss": 0.108, "step": 57630 }, { "epoch": 1.3439596630818105, "grad_norm": 1.7765778303146362, "learning_rate": 5.522057451967917e-07, "loss": 0.1077, "step": 57640 }, { "epoch": 1.3441928244586285, "grad_norm": 1.1877570152282715, "learning_rate": 5.521280233787229e-07, "loss": 0.1067, "step": 57650 }, { "epoch": 1.3444259858354464, "grad_norm": 3.6431007385253906, "learning_rate": 5.520503015606542e-07, "loss": 0.1116, "step": 57660 }, { "epoch": 1.3446591472122642, "grad_norm": 1.5347528457641602, "learning_rate": 5.519725797425852e-07, "loss": 0.1134, "step": 57670 }, { "epoch": 1.3448923085890823, "grad_norm": 1.1719807386398315, "learning_rate": 5.518948579245165e-07, "loss": 0.1051, "step": 57680 }, { "epoch": 1.3451254699659, "grad_norm": 1.2002530097961426, "learning_rate": 5.518171361064478e-07, "loss": 0.1137, "step": 57690 }, { "epoch": 1.3453586313427182, "grad_norm": 2.869129180908203, "learning_rate": 5.51739414288379e-07, "loss": 0.1172, "step": 57700 }, { "epoch": 1.345591792719536, "grad_norm": 3.5607657432556152, "learning_rate": 5.516616924703103e-07, "loss": 0.1114, "step": 57710 }, { "epoch": 1.3458249540963538, "grad_norm": 1.4377515316009521, "learning_rate": 5.515839706522415e-07, "loss": 0.1093, "step": 57720 }, { "epoch": 1.346058115473172, "grad_norm": 2.2886641025543213, "learning_rate": 5.515062488341727e-07, "loss": 0.1167, "step": 57730 }, { "epoch": 
1.3462912768499897, "grad_norm": 1.499837875366211, "learning_rate": 5.51428527016104e-07, "loss": 0.0966, "step": 57740 }, { "epoch": 1.3465244382268078, "grad_norm": 3.121812582015991, "learning_rate": 5.513508051980351e-07, "loss": 0.1167, "step": 57750 }, { "epoch": 1.3467575996036256, "grad_norm": 1.3303335905075073, "learning_rate": 5.512730833799664e-07, "loss": 0.0977, "step": 57760 }, { "epoch": 1.3469907609804435, "grad_norm": 1.9906374216079712, "learning_rate": 5.511953615618976e-07, "loss": 0.1004, "step": 57770 }, { "epoch": 1.3472239223572615, "grad_norm": 2.4892196655273438, "learning_rate": 5.511176397438289e-07, "loss": 0.1137, "step": 57780 }, { "epoch": 1.3474570837340796, "grad_norm": 1.2992225885391235, "learning_rate": 5.510399179257601e-07, "loss": 0.1141, "step": 57790 }, { "epoch": 1.3476902451108974, "grad_norm": 1.8189022541046143, "learning_rate": 5.509621961076913e-07, "loss": 0.1054, "step": 57800 }, { "epoch": 1.3479234064877152, "grad_norm": 4.77476167678833, "learning_rate": 5.508844742896226e-07, "loss": 0.1049, "step": 57810 }, { "epoch": 1.3481565678645333, "grad_norm": 1.833649754524231, "learning_rate": 5.508067524715538e-07, "loss": 0.1134, "step": 57820 }, { "epoch": 1.3483897292413511, "grad_norm": 2.6264448165893555, "learning_rate": 5.50729030653485e-07, "loss": 0.1185, "step": 57830 }, { "epoch": 1.3486228906181692, "grad_norm": 1.3444623947143555, "learning_rate": 5.506513088354163e-07, "loss": 0.0994, "step": 57840 }, { "epoch": 1.348856051994987, "grad_norm": 1.4212111234664917, "learning_rate": 5.505735870173474e-07, "loss": 0.113, "step": 57850 }, { "epoch": 1.3490892133718049, "grad_norm": 3.478355884552002, "learning_rate": 5.504958651992787e-07, "loss": 0.1198, "step": 57860 }, { "epoch": 1.349322374748623, "grad_norm": 2.1738107204437256, "learning_rate": 5.504181433812099e-07, "loss": 0.121, "step": 57870 }, { "epoch": 1.3495555361254408, "grad_norm": 1.4518791437149048, "learning_rate": 5.503404215631412e-07, 
"loss": 0.1159, "step": 57880 }, { "epoch": 1.3497886975022588, "grad_norm": 1.90678071975708, "learning_rate": 5.502626997450725e-07, "loss": 0.1024, "step": 57890 }, { "epoch": 1.3500218588790767, "grad_norm": 4.57755708694458, "learning_rate": 5.501849779270037e-07, "loss": 0.1099, "step": 57900 }, { "epoch": 1.3502550202558945, "grad_norm": 1.0076935291290283, "learning_rate": 5.501072561089348e-07, "loss": 0.1047, "step": 57910 }, { "epoch": 1.3504881816327126, "grad_norm": 1.758228063583374, "learning_rate": 5.50029534290866e-07, "loss": 0.1036, "step": 57920 }, { "epoch": 1.3507213430095304, "grad_norm": 1.4042943716049194, "learning_rate": 5.499518124727973e-07, "loss": 0.1116, "step": 57930 }, { "epoch": 1.3509545043863485, "grad_norm": 1.993415117263794, "learning_rate": 5.498740906547286e-07, "loss": 0.1009, "step": 57940 }, { "epoch": 1.3511876657631663, "grad_norm": 2.034278154373169, "learning_rate": 5.497963688366598e-07, "loss": 0.1102, "step": 57950 }, { "epoch": 1.3514208271399841, "grad_norm": 1.4112135171890259, "learning_rate": 5.497186470185911e-07, "loss": 0.1124, "step": 57960 }, { "epoch": 1.3516539885168022, "grad_norm": 1.7227593660354614, "learning_rate": 5.496409252005222e-07, "loss": 0.1067, "step": 57970 }, { "epoch": 1.3518871498936202, "grad_norm": 2.9268760681152344, "learning_rate": 5.495632033824535e-07, "loss": 0.1127, "step": 57980 }, { "epoch": 1.352120311270438, "grad_norm": 1.2501510381698608, "learning_rate": 5.494854815643847e-07, "loss": 0.1038, "step": 57990 }, { "epoch": 1.352353472647256, "grad_norm": 2.1877238750457764, "learning_rate": 5.494077597463159e-07, "loss": 0.1126, "step": 58000 }, { "epoch": 1.352586634024074, "grad_norm": 1.599561333656311, "learning_rate": 5.493300379282472e-07, "loss": 0.1033, "step": 58010 }, { "epoch": 1.3528197954008918, "grad_norm": 3.249448299407959, "learning_rate": 5.492523161101785e-07, "loss": 0.1091, "step": 58020 }, { "epoch": 1.3530529567777099, "grad_norm": 
1.5046610832214355, "learning_rate": 5.491745942921096e-07, "loss": 0.1059, "step": 58030 }, { "epoch": 1.3532861181545277, "grad_norm": 2.706573486328125, "learning_rate": 5.490968724740409e-07, "loss": 0.1191, "step": 58040 }, { "epoch": 1.3535192795313455, "grad_norm": 1.7770426273345947, "learning_rate": 5.490191506559721e-07, "loss": 0.0952, "step": 58050 }, { "epoch": 1.3537524409081636, "grad_norm": 2.288752317428589, "learning_rate": 5.489414288379034e-07, "loss": 0.1186, "step": 58060 }, { "epoch": 1.3539856022849814, "grad_norm": 1.3554357290267944, "learning_rate": 5.488637070198346e-07, "loss": 0.1028, "step": 58070 }, { "epoch": 1.3542187636617995, "grad_norm": 2.017366886138916, "learning_rate": 5.487859852017658e-07, "loss": 0.1262, "step": 58080 }, { "epoch": 1.3544519250386173, "grad_norm": 1.879276156425476, "learning_rate": 5.48708263383697e-07, "loss": 0.1147, "step": 58090 }, { "epoch": 1.3546850864154352, "grad_norm": 1.4270678758621216, "learning_rate": 5.486305415656282e-07, "loss": 0.1029, "step": 58100 }, { "epoch": 1.3549182477922532, "grad_norm": 1.4375799894332886, "learning_rate": 5.485528197475595e-07, "loss": 0.1139, "step": 58110 }, { "epoch": 1.355151409169071, "grad_norm": 1.9412933588027954, "learning_rate": 5.484750979294908e-07, "loss": 0.1186, "step": 58120 }, { "epoch": 1.3553845705458891, "grad_norm": 1.1195589303970337, "learning_rate": 5.48397376111422e-07, "loss": 0.1119, "step": 58130 }, { "epoch": 1.355617731922707, "grad_norm": 1.378206729888916, "learning_rate": 5.483196542933533e-07, "loss": 0.1041, "step": 58140 }, { "epoch": 1.3558508932995248, "grad_norm": 2.8473968505859375, "learning_rate": 5.482419324752843e-07, "loss": 0.1001, "step": 58150 }, { "epoch": 1.3560840546763429, "grad_norm": 1.4275455474853516, "learning_rate": 5.481642106572156e-07, "loss": 0.1189, "step": 58160 }, { "epoch": 1.356317216053161, "grad_norm": 2.272451400756836, "learning_rate": 5.480864888391469e-07, "loss": 0.1083, "step": 58170 }, 
{ "epoch": 1.3565503774299787, "grad_norm": 1.079770565032959, "learning_rate": 5.480087670210781e-07, "loss": 0.1151, "step": 58180 }, { "epoch": 1.3567835388067966, "grad_norm": 1.8231858015060425, "learning_rate": 5.479310452030094e-07, "loss": 0.1033, "step": 58190 }, { "epoch": 1.3570167001836146, "grad_norm": 1.3988227844238281, "learning_rate": 5.478533233849406e-07, "loss": 0.095, "step": 58200 }, { "epoch": 1.3572498615604325, "grad_norm": 2.048405170440674, "learning_rate": 5.477756015668719e-07, "loss": 0.1001, "step": 58210 }, { "epoch": 1.3574830229372505, "grad_norm": 2.535555839538574, "learning_rate": 5.476978797488031e-07, "loss": 0.1076, "step": 58220 }, { "epoch": 1.3577161843140684, "grad_norm": 1.7061803340911865, "learning_rate": 5.476201579307342e-07, "loss": 0.1054, "step": 58230 }, { "epoch": 1.3579493456908862, "grad_norm": 2.505626916885376, "learning_rate": 5.475424361126655e-07, "loss": 0.1126, "step": 58240 }, { "epoch": 1.3581825070677043, "grad_norm": 1.897695541381836, "learning_rate": 5.474647142945967e-07, "loss": 0.1042, "step": 58250 }, { "epoch": 1.358415668444522, "grad_norm": 1.8125755786895752, "learning_rate": 5.47386992476528e-07, "loss": 0.1152, "step": 58260 }, { "epoch": 1.3586488298213402, "grad_norm": 1.1707748174667358, "learning_rate": 5.473092706584593e-07, "loss": 0.0997, "step": 58270 }, { "epoch": 1.358881991198158, "grad_norm": 1.5508087873458862, "learning_rate": 5.472315488403904e-07, "loss": 0.1052, "step": 58280 }, { "epoch": 1.3591151525749758, "grad_norm": 1.3906346559524536, "learning_rate": 5.471538270223217e-07, "loss": 0.1178, "step": 58290 }, { "epoch": 1.359348313951794, "grad_norm": 1.3585822582244873, "learning_rate": 5.47076105204253e-07, "loss": 0.1139, "step": 58300 }, { "epoch": 1.3595814753286117, "grad_norm": 1.765577793121338, "learning_rate": 5.469983833861841e-07, "loss": 0.1095, "step": 58310 }, { "epoch": 1.3598146367054298, "grad_norm": 1.556654691696167, "learning_rate": 
5.469206615681154e-07, "loss": 0.1029, "step": 58320 }, { "epoch": 1.3600477980822476, "grad_norm": 1.5333061218261719, "learning_rate": 5.468429397500466e-07, "loss": 0.1176, "step": 58330 }, { "epoch": 1.3602809594590657, "grad_norm": 1.618135929107666, "learning_rate": 5.467652179319778e-07, "loss": 0.105, "step": 58340 }, { "epoch": 1.3605141208358835, "grad_norm": 1.8474433422088623, "learning_rate": 5.46687496113909e-07, "loss": 0.1071, "step": 58350 }, { "epoch": 1.3607472822127016, "grad_norm": 1.9759466648101807, "learning_rate": 5.466097742958403e-07, "loss": 0.1262, "step": 58360 }, { "epoch": 1.3609804435895194, "grad_norm": 3.977966070175171, "learning_rate": 5.465320524777716e-07, "loss": 0.1025, "step": 58370 }, { "epoch": 1.3612136049663373, "grad_norm": 1.6166775226593018, "learning_rate": 5.464543306597028e-07, "loss": 0.1065, "step": 58380 }, { "epoch": 1.3614467663431553, "grad_norm": 1.7561993598937988, "learning_rate": 5.46376608841634e-07, "loss": 0.098, "step": 58390 }, { "epoch": 1.3616799277199731, "grad_norm": 1.5532690286636353, "learning_rate": 5.462988870235651e-07, "loss": 0.1078, "step": 58400 }, { "epoch": 1.3619130890967912, "grad_norm": 1.6520359516143799, "learning_rate": 5.462211652054964e-07, "loss": 0.1036, "step": 58410 }, { "epoch": 1.362146250473609, "grad_norm": 1.5263029336929321, "learning_rate": 5.461434433874277e-07, "loss": 0.1184, "step": 58420 }, { "epoch": 1.3623794118504269, "grad_norm": 1.2619800567626953, "learning_rate": 5.460657215693589e-07, "loss": 0.1067, "step": 58430 }, { "epoch": 1.362612573227245, "grad_norm": 2.2358498573303223, "learning_rate": 5.459879997512902e-07, "loss": 0.1225, "step": 58440 }, { "epoch": 1.3628457346040628, "grad_norm": 1.6378053426742554, "learning_rate": 5.459102779332215e-07, "loss": 0.1164, "step": 58450 }, { "epoch": 1.3630788959808808, "grad_norm": 3.451570510864258, "learning_rate": 5.458325561151526e-07, "loss": 0.0985, "step": 58460 }, { "epoch": 1.3633120573576987, 
"grad_norm": 2.08664870262146, "learning_rate": 5.457548342970838e-07, "loss": 0.1091, "step": 58470 }, { "epoch": 1.3635452187345165, "grad_norm": 1.657291054725647, "learning_rate": 5.45677112479015e-07, "loss": 0.1159, "step": 58480 }, { "epoch": 1.3637783801113346, "grad_norm": 1.1419543027877808, "learning_rate": 5.455993906609463e-07, "loss": 0.1032, "step": 58490 }, { "epoch": 1.3640115414881526, "grad_norm": 1.78587806224823, "learning_rate": 5.455216688428776e-07, "loss": 0.1188, "step": 58500 }, { "epoch": 1.3642447028649705, "grad_norm": 1.0849794149398804, "learning_rate": 5.454439470248088e-07, "loss": 0.1023, "step": 58510 }, { "epoch": 1.3644778642417883, "grad_norm": 2.7042012214660645, "learning_rate": 5.453662252067401e-07, "loss": 0.1193, "step": 58520 }, { "epoch": 1.3647110256186064, "grad_norm": 2.0069711208343506, "learning_rate": 5.452885033886712e-07, "loss": 0.1164, "step": 58530 }, { "epoch": 1.3649441869954242, "grad_norm": 1.8646644353866577, "learning_rate": 5.452107815706025e-07, "loss": 0.1119, "step": 58540 }, { "epoch": 1.3651773483722422, "grad_norm": 2.000972270965576, "learning_rate": 5.451330597525337e-07, "loss": 0.1231, "step": 58550 }, { "epoch": 1.36541050974906, "grad_norm": 1.9768877029418945, "learning_rate": 5.450553379344649e-07, "loss": 0.1126, "step": 58560 }, { "epoch": 1.365643671125878, "grad_norm": 2.584784507751465, "learning_rate": 5.449776161163962e-07, "loss": 0.1109, "step": 58570 }, { "epoch": 1.365876832502696, "grad_norm": 1.5274685621261597, "learning_rate": 5.448998942983274e-07, "loss": 0.1046, "step": 58580 }, { "epoch": 1.3661099938795138, "grad_norm": 2.308227777481079, "learning_rate": 5.448221724802586e-07, "loss": 0.1177, "step": 58590 }, { "epoch": 1.3663431552563319, "grad_norm": 2.3711764812469482, "learning_rate": 5.447444506621899e-07, "loss": 0.1143, "step": 58600 }, { "epoch": 1.3665763166331497, "grad_norm": 1.2969191074371338, "learning_rate": 5.446667288441211e-07, "loss": 0.1241, 
"step": 58610 }, { "epoch": 1.3668094780099675, "grad_norm": 1.299564242362976, "learning_rate": 5.445890070260524e-07, "loss": 0.1134, "step": 58620 }, { "epoch": 1.3670426393867856, "grad_norm": 1.3250682353973389, "learning_rate": 5.445112852079835e-07, "loss": 0.1155, "step": 58630 }, { "epoch": 1.3672758007636034, "grad_norm": 1.4188176393508911, "learning_rate": 5.444335633899148e-07, "loss": 0.1172, "step": 58640 }, { "epoch": 1.3675089621404215, "grad_norm": 2.572361946105957, "learning_rate": 5.44355841571846e-07, "loss": 0.1208, "step": 58650 }, { "epoch": 1.3677421235172393, "grad_norm": 1.1664270162582397, "learning_rate": 5.442781197537772e-07, "loss": 0.1041, "step": 58660 }, { "epoch": 1.3679752848940572, "grad_norm": 1.2473357915878296, "learning_rate": 5.442003979357085e-07, "loss": 0.0994, "step": 58670 }, { "epoch": 1.3682084462708752, "grad_norm": 1.5325182676315308, "learning_rate": 5.441226761176397e-07, "loss": 0.1115, "step": 58680 }, { "epoch": 1.3684416076476933, "grad_norm": 1.1090658903121948, "learning_rate": 5.44044954299571e-07, "loss": 0.0996, "step": 58690 }, { "epoch": 1.3686747690245111, "grad_norm": 2.448276996612549, "learning_rate": 5.439672324815023e-07, "loss": 0.1124, "step": 58700 }, { "epoch": 1.368907930401329, "grad_norm": 1.215887188911438, "learning_rate": 5.438895106634333e-07, "loss": 0.1106, "step": 58710 }, { "epoch": 1.369141091778147, "grad_norm": 1.6086680889129639, "learning_rate": 5.438117888453646e-07, "loss": 0.1138, "step": 58720 }, { "epoch": 1.3693742531549649, "grad_norm": 1.2040596008300781, "learning_rate": 5.437340670272958e-07, "loss": 0.0969, "step": 58730 }, { "epoch": 1.369607414531783, "grad_norm": 1.1991188526153564, "learning_rate": 5.436563452092271e-07, "loss": 0.1208, "step": 58740 }, { "epoch": 1.3698405759086008, "grad_norm": 1.6937757730484009, "learning_rate": 5.435786233911584e-07, "loss": 0.1128, "step": 58750 }, { "epoch": 1.3700737372854186, "grad_norm": 1.356902837753296, 
"learning_rate": 5.435009015730896e-07, "loss": 0.1023, "step": 58760 }, { "epoch": 1.3703068986622366, "grad_norm": 1.2461390495300293, "learning_rate": 5.434231797550208e-07, "loss": 0.1063, "step": 58770 }, { "epoch": 1.3705400600390545, "grad_norm": 1.4145476818084717, "learning_rate": 5.43345457936952e-07, "loss": 0.1112, "step": 58780 }, { "epoch": 1.3707732214158725, "grad_norm": 1.0115023851394653, "learning_rate": 5.432677361188832e-07, "loss": 0.1062, "step": 58790 }, { "epoch": 1.3710063827926904, "grad_norm": 1.1960662603378296, "learning_rate": 5.431900143008145e-07, "loss": 0.1112, "step": 58800 }, { "epoch": 1.3712395441695082, "grad_norm": 1.541923999786377, "learning_rate": 5.431122924827457e-07, "loss": 0.112, "step": 58810 }, { "epoch": 1.3714727055463263, "grad_norm": 1.1022043228149414, "learning_rate": 5.43034570664677e-07, "loss": 0.1129, "step": 58820 }, { "epoch": 1.3717058669231441, "grad_norm": 1.7136831283569336, "learning_rate": 5.429568488466081e-07, "loss": 0.1241, "step": 58830 }, { "epoch": 1.3719390282999622, "grad_norm": 2.7147321701049805, "learning_rate": 5.428791270285394e-07, "loss": 0.1046, "step": 58840 }, { "epoch": 1.37217218967678, "grad_norm": 1.5252792835235596, "learning_rate": 5.428014052104707e-07, "loss": 0.1089, "step": 58850 }, { "epoch": 1.3724053510535978, "grad_norm": 2.6420602798461914, "learning_rate": 5.427236833924019e-07, "loss": 0.107, "step": 58860 }, { "epoch": 1.372638512430416, "grad_norm": 1.1307694911956787, "learning_rate": 5.426459615743332e-07, "loss": 0.1005, "step": 58870 }, { "epoch": 1.372871673807234, "grad_norm": 3.2603580951690674, "learning_rate": 5.425682397562644e-07, "loss": 0.1106, "step": 58880 }, { "epoch": 1.3731048351840518, "grad_norm": 1.4050484895706177, "learning_rate": 5.424905179381955e-07, "loss": 0.1054, "step": 58890 }, { "epoch": 1.3733379965608696, "grad_norm": 2.230217456817627, "learning_rate": 5.424127961201268e-07, "loss": 0.1129, "step": 58900 }, { "epoch": 
1.3735711579376877, "grad_norm": 2.4465255737304688, "learning_rate": 5.42335074302058e-07, "loss": 0.1134, "step": 58910 }, { "epoch": 1.3738043193145055, "grad_norm": 1.779882550239563, "learning_rate": 5.422573524839893e-07, "loss": 0.1058, "step": 58920 }, { "epoch": 1.3740374806913236, "grad_norm": 2.217855215072632, "learning_rate": 5.421796306659206e-07, "loss": 0.1116, "step": 58930 }, { "epoch": 1.3742706420681414, "grad_norm": 2.2973337173461914, "learning_rate": 5.421019088478518e-07, "loss": 0.1104, "step": 58940 }, { "epoch": 1.3745038034449593, "grad_norm": 1.2544151544570923, "learning_rate": 5.420241870297831e-07, "loss": 0.11, "step": 58950 }, { "epoch": 1.3747369648217773, "grad_norm": 1.674827218055725, "learning_rate": 5.419464652117141e-07, "loss": 0.1021, "step": 58960 }, { "epoch": 1.3749701261985952, "grad_norm": 1.4778938293457031, "learning_rate": 5.418687433936454e-07, "loss": 0.099, "step": 58970 }, { "epoch": 1.3752032875754132, "grad_norm": 1.9089618921279907, "learning_rate": 5.417910215755767e-07, "loss": 0.098, "step": 58980 }, { "epoch": 1.375436448952231, "grad_norm": 1.2988444566726685, "learning_rate": 5.417132997575079e-07, "loss": 0.1062, "step": 58990 }, { "epoch": 1.3756696103290489, "grad_norm": 2.495483875274658, "learning_rate": 5.416355779394392e-07, "loss": 0.0974, "step": 59000 }, { "epoch": 1.375902771705867, "grad_norm": 1.0224488973617554, "learning_rate": 5.415578561213704e-07, "loss": 0.1204, "step": 59010 }, { "epoch": 1.3761359330826848, "grad_norm": 1.593133807182312, "learning_rate": 5.414801343033016e-07, "loss": 0.11, "step": 59020 }, { "epoch": 1.3763690944595028, "grad_norm": 3.0065088272094727, "learning_rate": 5.414024124852329e-07, "loss": 0.1253, "step": 59030 }, { "epoch": 1.3766022558363207, "grad_norm": 1.3688817024230957, "learning_rate": 5.41324690667164e-07, "loss": 0.1058, "step": 59040 }, { "epoch": 1.3768354172131385, "grad_norm": 1.2690985202789307, "learning_rate": 5.412469688490953e-07, 
"loss": 0.1168, "step": 59050 }, { "epoch": 1.3770685785899566, "grad_norm": 1.3008273839950562, "learning_rate": 5.411692470310265e-07, "loss": 0.1061, "step": 59060 }, { "epoch": 1.3773017399667746, "grad_norm": 2.2702136039733887, "learning_rate": 5.410915252129578e-07, "loss": 0.1117, "step": 59070 }, { "epoch": 1.3775349013435925, "grad_norm": 1.9795526266098022, "learning_rate": 5.41013803394889e-07, "loss": 0.1103, "step": 59080 }, { "epoch": 1.3777680627204103, "grad_norm": 1.2053369283676147, "learning_rate": 5.409360815768202e-07, "loss": 0.1066, "step": 59090 }, { "epoch": 1.3780012240972284, "grad_norm": 3.5375888347625732, "learning_rate": 5.408583597587515e-07, "loss": 0.109, "step": 59100 }, { "epoch": 1.3782343854740462, "grad_norm": 3.7483513355255127, "learning_rate": 5.407806379406827e-07, "loss": 0.1029, "step": 59110 }, { "epoch": 1.3784675468508643, "grad_norm": 2.3513360023498535, "learning_rate": 5.407029161226139e-07, "loss": 0.112, "step": 59120 }, { "epoch": 1.378700708227682, "grad_norm": 1.9515597820281982, "learning_rate": 5.406251943045452e-07, "loss": 0.1071, "step": 59130 }, { "epoch": 1.3789338696045, "grad_norm": 1.3361562490463257, "learning_rate": 5.405474724864763e-07, "loss": 0.1174, "step": 59140 }, { "epoch": 1.379167030981318, "grad_norm": 1.3660316467285156, "learning_rate": 5.404697506684076e-07, "loss": 0.0953, "step": 59150 }, { "epoch": 1.3794001923581358, "grad_norm": 3.092233419418335, "learning_rate": 5.403920288503388e-07, "loss": 0.1096, "step": 59160 }, { "epoch": 1.3796333537349539, "grad_norm": 2.4510080814361572, "learning_rate": 5.403143070322701e-07, "loss": 0.1057, "step": 59170 }, { "epoch": 1.3798665151117717, "grad_norm": 1.5768041610717773, "learning_rate": 5.402365852142014e-07, "loss": 0.1092, "step": 59180 }, { "epoch": 1.3800996764885896, "grad_norm": 2.1013758182525635, "learning_rate": 5.401588633961326e-07, "loss": 0.1254, "step": 59190 }, { "epoch": 1.3803328378654076, "grad_norm": 
2.1024179458618164, "learning_rate": 5.400811415780637e-07, "loss": 0.1101, "step": 59200 }, { "epoch": 1.3805659992422255, "grad_norm": 1.4988754987716675, "learning_rate": 5.400034197599949e-07, "loss": 0.1157, "step": 59210 }, { "epoch": 1.3807991606190435, "grad_norm": 2.7379941940307617, "learning_rate": 5.399256979419262e-07, "loss": 0.1104, "step": 59220 }, { "epoch": 1.3810323219958613, "grad_norm": 1.0406132936477661, "learning_rate": 5.398479761238575e-07, "loss": 0.0945, "step": 59230 }, { "epoch": 1.3812654833726792, "grad_norm": 3.3460590839385986, "learning_rate": 5.397702543057887e-07, "loss": 0.1077, "step": 59240 }, { "epoch": 1.3814986447494972, "grad_norm": 1.4050849676132202, "learning_rate": 5.3969253248772e-07, "loss": 0.1105, "step": 59250 }, { "epoch": 1.3817318061263153, "grad_norm": 1.3171778917312622, "learning_rate": 5.396148106696511e-07, "loss": 0.1106, "step": 59260 }, { "epoch": 1.3819649675031331, "grad_norm": 2.63045072555542, "learning_rate": 5.395370888515824e-07, "loss": 0.1251, "step": 59270 }, { "epoch": 1.382198128879951, "grad_norm": 1.213744044303894, "learning_rate": 5.394593670335136e-07, "loss": 0.1095, "step": 59280 }, { "epoch": 1.382431290256769, "grad_norm": 1.2768471240997314, "learning_rate": 5.393816452154448e-07, "loss": 0.1091, "step": 59290 }, { "epoch": 1.3826644516335869, "grad_norm": 1.1115009784698486, "learning_rate": 5.393039233973761e-07, "loss": 0.1172, "step": 59300 }, { "epoch": 1.382897613010405, "grad_norm": 2.154066801071167, "learning_rate": 5.392262015793074e-07, "loss": 0.1127, "step": 59310 }, { "epoch": 1.3831307743872228, "grad_norm": 1.4903621673583984, "learning_rate": 5.391484797612385e-07, "loss": 0.1071, "step": 59320 }, { "epoch": 1.3833639357640406, "grad_norm": 2.4833791255950928, "learning_rate": 5.390707579431698e-07, "loss": 0.1072, "step": 59330 }, { "epoch": 1.3835970971408587, "grad_norm": 2.8979649543762207, "learning_rate": 5.38993036125101e-07, "loss": 0.1075, "step": 59340 
}, { "epoch": 1.3838302585176765, "grad_norm": 1.141993522644043, "learning_rate": 5.389153143070323e-07, "loss": 0.1042, "step": 59350 }, { "epoch": 1.3840634198944946, "grad_norm": 1.249992847442627, "learning_rate": 5.388375924889635e-07, "loss": 0.1078, "step": 59360 }, { "epoch": 1.3842965812713124, "grad_norm": 1.319121241569519, "learning_rate": 5.387598706708947e-07, "loss": 0.1072, "step": 59370 }, { "epoch": 1.3845297426481302, "grad_norm": 3.112213611602783, "learning_rate": 5.386821488528259e-07, "loss": 0.1091, "step": 59380 }, { "epoch": 1.3847629040249483, "grad_norm": 1.6805570125579834, "learning_rate": 5.386044270347571e-07, "loss": 0.1148, "step": 59390 }, { "epoch": 1.3849960654017661, "grad_norm": 1.2788467407226562, "learning_rate": 5.385267052166884e-07, "loss": 0.1027, "step": 59400 }, { "epoch": 1.3852292267785842, "grad_norm": 1.35621178150177, "learning_rate": 5.384489833986197e-07, "loss": 0.1109, "step": 59410 }, { "epoch": 1.385462388155402, "grad_norm": 1.261452555656433, "learning_rate": 5.383712615805509e-07, "loss": 0.0966, "step": 59420 }, { "epoch": 1.3856955495322199, "grad_norm": 1.1773273944854736, "learning_rate": 5.382935397624822e-07, "loss": 0.1023, "step": 59430 }, { "epoch": 1.385928710909038, "grad_norm": 1.4993903636932373, "learning_rate": 5.382158179444133e-07, "loss": 0.1005, "step": 59440 }, { "epoch": 1.386161872285856, "grad_norm": 3.1094186305999756, "learning_rate": 5.381380961263445e-07, "loss": 0.1022, "step": 59450 }, { "epoch": 1.3863950336626738, "grad_norm": 2.035989761352539, "learning_rate": 5.380603743082758e-07, "loss": 0.11, "step": 59460 }, { "epoch": 1.3866281950394916, "grad_norm": 1.0580177307128906, "learning_rate": 5.37982652490207e-07, "loss": 0.1022, "step": 59470 }, { "epoch": 1.3868613564163097, "grad_norm": 1.881394386291504, "learning_rate": 5.379127028539451e-07, "loss": 0.1183, "step": 59480 }, { "epoch": 1.3870945177931275, "grad_norm": 1.4929429292678833, "learning_rate": 
5.378349810358763e-07, "loss": 0.1105, "step": 59490 }, { "epoch": 1.3873276791699456, "grad_norm": 2.0052480697631836, "learning_rate": 5.377572592178076e-07, "loss": 0.116, "step": 59500 }, { "epoch": 1.3875608405467634, "grad_norm": 5.155190467834473, "learning_rate": 5.376795373997388e-07, "loss": 0.1279, "step": 59510 }, { "epoch": 1.3877940019235813, "grad_norm": 1.1117562055587769, "learning_rate": 5.376018155816701e-07, "loss": 0.1058, "step": 59520 }, { "epoch": 1.3880271633003993, "grad_norm": 2.101874589920044, "learning_rate": 5.375240937636013e-07, "loss": 0.108, "step": 59530 }, { "epoch": 1.3882603246772172, "grad_norm": 1.5976413488388062, "learning_rate": 5.374463719455325e-07, "loss": 0.103, "step": 59540 }, { "epoch": 1.3884934860540352, "grad_norm": 1.3892810344696045, "learning_rate": 5.373686501274638e-07, "loss": 0.1167, "step": 59550 }, { "epoch": 1.388726647430853, "grad_norm": 1.7454543113708496, "learning_rate": 5.37290928309395e-07, "loss": 0.1233, "step": 59560 }, { "epoch": 1.388959808807671, "grad_norm": 3.049494743347168, "learning_rate": 5.372132064913262e-07, "loss": 0.1093, "step": 59570 }, { "epoch": 1.389192970184489, "grad_norm": 1.4224573373794556, "learning_rate": 5.371354846732574e-07, "loss": 0.1159, "step": 59580 }, { "epoch": 1.3894261315613068, "grad_norm": 1.6494413614273071, "learning_rate": 5.370577628551887e-07, "loss": 0.1088, "step": 59590 }, { "epoch": 1.3896592929381248, "grad_norm": 3.3012630939483643, "learning_rate": 5.369800410371199e-07, "loss": 0.1208, "step": 59600 }, { "epoch": 1.3898924543149427, "grad_norm": 1.7474236488342285, "learning_rate": 5.369023192190511e-07, "loss": 0.1089, "step": 59610 }, { "epoch": 1.3901256156917607, "grad_norm": 0.9257320165634155, "learning_rate": 5.368245974009824e-07, "loss": 0.0997, "step": 59620 }, { "epoch": 1.3903587770685786, "grad_norm": 1.5610520839691162, "learning_rate": 5.367468755829137e-07, "loss": 0.1178, "step": 59630 }, { "epoch": 1.3905919384453966, 
"grad_norm": 1.3282955884933472, "learning_rate": 5.366691537648449e-07, "loss": 0.1002, "step": 59640 }, { "epoch": 1.3908250998222145, "grad_norm": 2.453206777572632, "learning_rate": 5.365914319467761e-07, "loss": 0.1116, "step": 59650 }, { "epoch": 1.3910582611990323, "grad_norm": 1.4686102867126465, "learning_rate": 5.365137101287072e-07, "loss": 0.1109, "step": 59660 }, { "epoch": 1.3912914225758504, "grad_norm": 1.4278950691223145, "learning_rate": 5.364359883106385e-07, "loss": 0.1132, "step": 59670 }, { "epoch": 1.3915245839526682, "grad_norm": 1.1697392463684082, "learning_rate": 5.363582664925698e-07, "loss": 0.1072, "step": 59680 }, { "epoch": 1.3917577453294863, "grad_norm": 2.241034984588623, "learning_rate": 5.36280544674501e-07, "loss": 0.1164, "step": 59690 }, { "epoch": 1.391990906706304, "grad_norm": 4.328071117401123, "learning_rate": 5.362028228564323e-07, "loss": 0.1032, "step": 59700 }, { "epoch": 1.392224068083122, "grad_norm": 1.431980848312378, "learning_rate": 5.361251010383635e-07, "loss": 0.1025, "step": 59710 }, { "epoch": 1.39245722945994, "grad_norm": 1.6051039695739746, "learning_rate": 5.360473792202948e-07, "loss": 0.1069, "step": 59720 }, { "epoch": 1.3926903908367578, "grad_norm": 2.8886239528656006, "learning_rate": 5.359696574022259e-07, "loss": 0.109, "step": 59730 }, { "epoch": 1.3929235522135759, "grad_norm": 1.7003982067108154, "learning_rate": 5.358919355841571e-07, "loss": 0.1186, "step": 59740 }, { "epoch": 1.3931567135903937, "grad_norm": 1.4411211013793945, "learning_rate": 5.358142137660884e-07, "loss": 0.1049, "step": 59750 }, { "epoch": 1.3933898749672116, "grad_norm": 2.402921438217163, "learning_rate": 5.357364919480196e-07, "loss": 0.1103, "step": 59760 }, { "epoch": 1.3936230363440296, "grad_norm": 1.523386836051941, "learning_rate": 5.356587701299509e-07, "loss": 0.1069, "step": 59770 }, { "epoch": 1.3938561977208477, "grad_norm": 2.062612771987915, "learning_rate": 5.355810483118822e-07, "loss": 0.121, 
"step": 59780 }, { "epoch": 1.3940893590976655, "grad_norm": 1.8613392114639282, "learning_rate": 5.355033264938133e-07, "loss": 0.1084, "step": 59790 }, { "epoch": 1.3943225204744834, "grad_norm": 2.326942205429077, "learning_rate": 5.354256046757446e-07, "loss": 0.0993, "step": 59800 }, { "epoch": 1.3945556818513014, "grad_norm": 1.0198875665664673, "learning_rate": 5.353478828576757e-07, "loss": 0.1046, "step": 59810 }, { "epoch": 1.3947888432281192, "grad_norm": 1.0735511779785156, "learning_rate": 5.35270161039607e-07, "loss": 0.11, "step": 59820 }, { "epoch": 1.3950220046049373, "grad_norm": 2.8276078701019287, "learning_rate": 5.351924392215383e-07, "loss": 0.1177, "step": 59830 }, { "epoch": 1.3952551659817551, "grad_norm": 1.7457003593444824, "learning_rate": 5.351147174034695e-07, "loss": 0.1067, "step": 59840 }, { "epoch": 1.395488327358573, "grad_norm": 1.4734565019607544, "learning_rate": 5.350369955854007e-07, "loss": 0.1175, "step": 59850 }, { "epoch": 1.395721488735391, "grad_norm": 1.4947391748428345, "learning_rate": 5.349592737673319e-07, "loss": 0.1067, "step": 59860 }, { "epoch": 1.3959546501122089, "grad_norm": 2.1363816261291504, "learning_rate": 5.348815519492632e-07, "loss": 0.1237, "step": 59870 }, { "epoch": 1.396187811489027, "grad_norm": 1.3749630451202393, "learning_rate": 5.348038301311945e-07, "loss": 0.1145, "step": 59880 }, { "epoch": 1.3964209728658448, "grad_norm": 1.399862289428711, "learning_rate": 5.347261083131256e-07, "loss": 0.1218, "step": 59890 }, { "epoch": 1.3966541342426626, "grad_norm": 1.7686879634857178, "learning_rate": 5.346483864950569e-07, "loss": 0.0965, "step": 59900 }, { "epoch": 1.3968872956194807, "grad_norm": 2.1713805198669434, "learning_rate": 5.34570664676988e-07, "loss": 0.1034, "step": 59910 }, { "epoch": 1.3971204569962985, "grad_norm": 1.8445348739624023, "learning_rate": 5.344929428589193e-07, "loss": 0.1003, "step": 59920 }, { "epoch": 1.3973536183731166, "grad_norm": 3.303630828857422, 
"learning_rate": 5.344152210408506e-07, "loss": 0.108, "step": 59930 }, { "epoch": 1.3975867797499344, "grad_norm": 1.5490329265594482, "learning_rate": 5.343374992227818e-07, "loss": 0.108, "step": 59940 }, { "epoch": 1.3978199411267522, "grad_norm": 1.3931859731674194, "learning_rate": 5.342597774047131e-07, "loss": 0.0988, "step": 59950 }, { "epoch": 1.3980531025035703, "grad_norm": 1.3713328838348389, "learning_rate": 5.341820555866444e-07, "loss": 0.0982, "step": 59960 }, { "epoch": 1.3982862638803883, "grad_norm": 1.2141164541244507, "learning_rate": 5.341043337685754e-07, "loss": 0.1049, "step": 59970 }, { "epoch": 1.3985194252572062, "grad_norm": 3.78898286819458, "learning_rate": 5.340266119505067e-07, "loss": 0.1086, "step": 59980 }, { "epoch": 1.398752586634024, "grad_norm": 2.838322162628174, "learning_rate": 5.339488901324379e-07, "loss": 0.11, "step": 59990 }, { "epoch": 1.398985748010842, "grad_norm": 3.0814616680145264, "learning_rate": 5.338711683143692e-07, "loss": 0.1112, "step": 60000 }, { "epoch": 1.398985748010842, "eval_accuracy": 0.9433094816270455, "eval_f1": 0.9593532504587202, "eval_loss": 0.1488361358642578, "eval_runtime": 3908.3012, "eval_samples_per_second": 468.213, "eval_steps_per_second": 58.527, "step": 60000 }, { "epoch": 1.39921890938766, "grad_norm": 1.763010859489441, "learning_rate": 5.337934464963005e-07, "loss": 0.1074, "step": 60010 }, { "epoch": 1.399452070764478, "grad_norm": 1.2935421466827393, "learning_rate": 5.337157246782317e-07, "loss": 0.1139, "step": 60020 }, { "epoch": 1.3996852321412958, "grad_norm": 1.5564297437667847, "learning_rate": 5.336380028601629e-07, "loss": 0.1097, "step": 60030 }, { "epoch": 1.3999183935181136, "grad_norm": 1.8338526487350464, "learning_rate": 5.335602810420941e-07, "loss": 0.1121, "step": 60040 }, { "epoch": 1.4001515548949317, "grad_norm": 1.5610913038253784, "learning_rate": 5.334825592240253e-07, "loss": 0.1171, "step": 60050 }, { "epoch": 1.4003847162717495, "grad_norm": 
1.677230954170227, "learning_rate": 5.334048374059565e-07, "loss": 0.1095, "step": 60060 }, { "epoch": 1.4006178776485676, "grad_norm": 2.1203272342681885, "learning_rate": 5.333271155878878e-07, "loss": 0.117, "step": 60070 }, { "epoch": 1.4008510390253854, "grad_norm": 1.3371771574020386, "learning_rate": 5.332493937698191e-07, "loss": 0.1205, "step": 60080 }, { "epoch": 1.4010842004022033, "grad_norm": 1.5596462488174438, "learning_rate": 5.331716719517502e-07, "loss": 0.1087, "step": 60090 }, { "epoch": 1.4013173617790213, "grad_norm": 1.2856147289276123, "learning_rate": 5.330939501336815e-07, "loss": 0.1149, "step": 60100 }, { "epoch": 1.4015505231558392, "grad_norm": 1.7748806476593018, "learning_rate": 5.330162283156128e-07, "loss": 0.1042, "step": 60110 }, { "epoch": 1.4017836845326572, "grad_norm": 1.648330807685852, "learning_rate": 5.32938506497544e-07, "loss": 0.1003, "step": 60120 }, { "epoch": 1.402016845909475, "grad_norm": 1.2791436910629272, "learning_rate": 5.328607846794752e-07, "loss": 0.1055, "step": 60130 }, { "epoch": 1.402250007286293, "grad_norm": 2.3317031860351562, "learning_rate": 5.327830628614064e-07, "loss": 0.1076, "step": 60140 }, { "epoch": 1.402483168663111, "grad_norm": 1.28895902633667, "learning_rate": 5.327053410433376e-07, "loss": 0.1033, "step": 60150 }, { "epoch": 1.402716330039929, "grad_norm": 1.4163389205932617, "learning_rate": 5.326276192252689e-07, "loss": 0.1072, "step": 60160 }, { "epoch": 1.4029494914167469, "grad_norm": 1.4645006656646729, "learning_rate": 5.325498974072001e-07, "loss": 0.118, "step": 60170 }, { "epoch": 1.4031826527935647, "grad_norm": 1.3937288522720337, "learning_rate": 5.324721755891314e-07, "loss": 0.1036, "step": 60180 }, { "epoch": 1.4034158141703827, "grad_norm": 4.533760070800781, "learning_rate": 5.323944537710626e-07, "loss": 0.1199, "step": 60190 }, { "epoch": 1.4036489755472006, "grad_norm": 1.2106859683990479, "learning_rate": 5.323167319529939e-07, "loss": 0.1046, "step": 60200 }, 
{ "epoch": 1.4038821369240186, "grad_norm": 1.3378171920776367, "learning_rate": 5.32239010134925e-07, "loss": 0.1045, "step": 60210 }, { "epoch": 1.4041152983008365, "grad_norm": 1.8086631298065186, "learning_rate": 5.321612883168562e-07, "loss": 0.1217, "step": 60220 }, { "epoch": 1.4043484596776543, "grad_norm": 2.306100845336914, "learning_rate": 5.320835664987875e-07, "loss": 0.1112, "step": 60230 }, { "epoch": 1.4045816210544724, "grad_norm": 1.3608007431030273, "learning_rate": 5.320058446807187e-07, "loss": 0.1091, "step": 60240 }, { "epoch": 1.4048147824312902, "grad_norm": 1.1957718133926392, "learning_rate": 5.3192812286265e-07, "loss": 0.1058, "step": 60250 }, { "epoch": 1.4050479438081083, "grad_norm": 2.218325614929199, "learning_rate": 5.318504010445813e-07, "loss": 0.1297, "step": 60260 }, { "epoch": 1.405281105184926, "grad_norm": 1.7022382020950317, "learning_rate": 5.317726792265125e-07, "loss": 0.1202, "step": 60270 }, { "epoch": 1.405514266561744, "grad_norm": 1.9279651641845703, "learning_rate": 5.316949574084437e-07, "loss": 0.1074, "step": 60280 }, { "epoch": 1.405747427938562, "grad_norm": 1.892501950263977, "learning_rate": 5.316172355903748e-07, "loss": 0.0969, "step": 60290 }, { "epoch": 1.4059805893153798, "grad_norm": 1.735141634941101, "learning_rate": 5.315395137723061e-07, "loss": 0.1097, "step": 60300 }, { "epoch": 1.406213750692198, "grad_norm": 2.2464590072631836, "learning_rate": 5.314617919542374e-07, "loss": 0.1126, "step": 60310 }, { "epoch": 1.4064469120690157, "grad_norm": 1.4192895889282227, "learning_rate": 5.313840701361686e-07, "loss": 0.1114, "step": 60320 }, { "epoch": 1.4066800734458336, "grad_norm": 1.315644383430481, "learning_rate": 5.313063483180999e-07, "loss": 0.1094, "step": 60330 }, { "epoch": 1.4069132348226516, "grad_norm": 2.260509490966797, "learning_rate": 5.31228626500031e-07, "loss": 0.1002, "step": 60340 }, { "epoch": 1.4071463961994697, "grad_norm": 1.3635828495025635, "learning_rate": 
5.311509046819623e-07, "loss": 0.1048, "step": 60350 }, { "epoch": 1.4073795575762875, "grad_norm": 1.602742075920105, "learning_rate": 5.310731828638936e-07, "loss": 0.111, "step": 60360 }, { "epoch": 1.4076127189531054, "grad_norm": 3.1080234050750732, "learning_rate": 5.309954610458247e-07, "loss": 0.1178, "step": 60370 }, { "epoch": 1.4078458803299234, "grad_norm": 1.9282664060592651, "learning_rate": 5.30917739227756e-07, "loss": 0.1105, "step": 60380 }, { "epoch": 1.4080790417067413, "grad_norm": 3.0189433097839355, "learning_rate": 5.308400174096872e-07, "loss": 0.1036, "step": 60390 }, { "epoch": 1.4083122030835593, "grad_norm": 1.5868159532546997, "learning_rate": 5.307622955916184e-07, "loss": 0.1112, "step": 60400 }, { "epoch": 1.4085453644603771, "grad_norm": 1.5925012826919556, "learning_rate": 5.306845737735497e-07, "loss": 0.0954, "step": 60410 }, { "epoch": 1.408778525837195, "grad_norm": 1.2700262069702148, "learning_rate": 5.306068519554809e-07, "loss": 0.1124, "step": 60420 }, { "epoch": 1.409011687214013, "grad_norm": 1.4452567100524902, "learning_rate": 5.305291301374122e-07, "loss": 0.1215, "step": 60430 }, { "epoch": 1.4092448485908309, "grad_norm": 1.229537010192871, "learning_rate": 5.304514083193435e-07, "loss": 0.1098, "step": 60440 }, { "epoch": 1.409478009967649, "grad_norm": 2.419161796569824, "learning_rate": 5.303736865012746e-07, "loss": 0.1063, "step": 60450 }, { "epoch": 1.4097111713444668, "grad_norm": 1.6608878374099731, "learning_rate": 5.302959646832058e-07, "loss": 0.1067, "step": 60460 }, { "epoch": 1.4099443327212846, "grad_norm": 1.5661125183105469, "learning_rate": 5.30218242865137e-07, "loss": 0.1157, "step": 60470 }, { "epoch": 1.4101774940981027, "grad_norm": 1.8595054149627686, "learning_rate": 5.301405210470683e-07, "loss": 0.1038, "step": 60480 }, { "epoch": 1.4104106554749205, "grad_norm": 1.5926315784454346, "learning_rate": 5.300627992289996e-07, "loss": 0.1114, "step": 60490 }, { "epoch": 1.4106438168517386, 
"grad_norm": 1.5384503602981567, "learning_rate": 5.299850774109308e-07, "loss": 0.1121, "step": 60500 }, { "epoch": 1.4108769782285564, "grad_norm": 1.292381763458252, "learning_rate": 5.299073555928621e-07, "loss": 0.1102, "step": 60510 }, { "epoch": 1.4111101396053742, "grad_norm": 1.8005038499832153, "learning_rate": 5.298296337747932e-07, "loss": 0.1133, "step": 60520 }, { "epoch": 1.4113433009821923, "grad_norm": 1.0806275606155396, "learning_rate": 5.297519119567244e-07, "loss": 0.1125, "step": 60530 }, { "epoch": 1.4115764623590104, "grad_norm": 1.2645788192749023, "learning_rate": 5.296741901386557e-07, "loss": 0.1082, "step": 60540 }, { "epoch": 1.4118096237358282, "grad_norm": 1.5274500846862793, "learning_rate": 5.295964683205869e-07, "loss": 0.1095, "step": 60550 }, { "epoch": 1.412042785112646, "grad_norm": 1.2875885963439941, "learning_rate": 5.295187465025182e-07, "loss": 0.1192, "step": 60560 }, { "epoch": 1.412275946489464, "grad_norm": 1.855231761932373, "learning_rate": 5.294410246844494e-07, "loss": 0.1138, "step": 60570 }, { "epoch": 1.412509107866282, "grad_norm": 1.0619192123413086, "learning_rate": 5.293633028663806e-07, "loss": 0.1017, "step": 60580 }, { "epoch": 1.4127422692431, "grad_norm": 1.835404872894287, "learning_rate": 5.292855810483119e-07, "loss": 0.111, "step": 60590 }, { "epoch": 1.4129754306199178, "grad_norm": 1.3843648433685303, "learning_rate": 5.292078592302431e-07, "loss": 0.1199, "step": 60600 }, { "epoch": 1.4132085919967357, "grad_norm": 2.9697365760803223, "learning_rate": 5.291301374121743e-07, "loss": 0.112, "step": 60610 }, { "epoch": 1.4134417533735537, "grad_norm": 2.645960807800293, "learning_rate": 5.290524155941055e-07, "loss": 0.1226, "step": 60620 }, { "epoch": 1.4136749147503715, "grad_norm": 1.4692163467407227, "learning_rate": 5.289746937760368e-07, "loss": 0.1062, "step": 60630 }, { "epoch": 1.4139080761271896, "grad_norm": 1.3680611848831177, "learning_rate": 5.288969719579681e-07, "loss": 0.1127, 
"step": 60640 }, { "epoch": 1.4141412375040074, "grad_norm": 1.3989022970199585, "learning_rate": 5.288192501398992e-07, "loss": 0.1073, "step": 60650 }, { "epoch": 1.4143743988808253, "grad_norm": 1.5863975286483765, "learning_rate": 5.287415283218305e-07, "loss": 0.1019, "step": 60660 }, { "epoch": 1.4146075602576433, "grad_norm": 1.3296263217926025, "learning_rate": 5.286638065037617e-07, "loss": 0.1087, "step": 60670 }, { "epoch": 1.4148407216344612, "grad_norm": 1.6572022438049316, "learning_rate": 5.28586084685693e-07, "loss": 0.1268, "step": 60680 }, { "epoch": 1.4150738830112792, "grad_norm": 3.00911021232605, "learning_rate": 5.285083628676242e-07, "loss": 0.1044, "step": 60690 }, { "epoch": 1.415307044388097, "grad_norm": 1.3683301210403442, "learning_rate": 5.284306410495554e-07, "loss": 0.1191, "step": 60700 }, { "epoch": 1.415540205764915, "grad_norm": 1.725950002670288, "learning_rate": 5.283529192314866e-07, "loss": 0.1129, "step": 60710 }, { "epoch": 1.415773367141733, "grad_norm": 1.0903651714324951, "learning_rate": 5.282751974134178e-07, "loss": 0.107, "step": 60720 }, { "epoch": 1.416006528518551, "grad_norm": 2.161115884780884, "learning_rate": 5.281974755953491e-07, "loss": 0.11, "step": 60730 }, { "epoch": 1.4162396898953689, "grad_norm": 1.294264316558838, "learning_rate": 5.281197537772804e-07, "loss": 0.102, "step": 60740 }, { "epoch": 1.4164728512721867, "grad_norm": 1.324004888534546, "learning_rate": 5.280420319592116e-07, "loss": 0.1025, "step": 60750 }, { "epoch": 1.4167060126490048, "grad_norm": 1.23979914188385, "learning_rate": 5.279643101411429e-07, "loss": 0.1109, "step": 60760 }, { "epoch": 1.4169391740258226, "grad_norm": 3.0998497009277344, "learning_rate": 5.278865883230739e-07, "loss": 0.1145, "step": 60770 }, { "epoch": 1.4171723354026406, "grad_norm": 1.4504395723342896, "learning_rate": 5.278088665050052e-07, "loss": 0.0955, "step": 60780 }, { "epoch": 1.4174054967794585, "grad_norm": 1.2363202571868896, "learning_rate": 
5.277311446869365e-07, "loss": 0.1135, "step": 60790 }, { "epoch": 1.4176386581562763, "grad_norm": 1.8200372457504272, "learning_rate": 5.276534228688677e-07, "loss": 0.101, "step": 60800 }, { "epoch": 1.4178718195330944, "grad_norm": 1.702074408531189, "learning_rate": 5.27575701050799e-07, "loss": 0.1147, "step": 60810 }, { "epoch": 1.4181049809099122, "grad_norm": 1.826875925064087, "learning_rate": 5.274979792327302e-07, "loss": 0.1045, "step": 60820 }, { "epoch": 1.4183381422867303, "grad_norm": 2.24568772315979, "learning_rate": 5.274202574146614e-07, "loss": 0.1107, "step": 60830 }, { "epoch": 1.418571303663548, "grad_norm": 1.5731252431869507, "learning_rate": 5.273425355965927e-07, "loss": 0.1092, "step": 60840 }, { "epoch": 1.418804465040366, "grad_norm": 1.4682420492172241, "learning_rate": 5.272648137785238e-07, "loss": 0.108, "step": 60850 }, { "epoch": 1.419037626417184, "grad_norm": 2.4800803661346436, "learning_rate": 5.271870919604551e-07, "loss": 0.1096, "step": 60860 }, { "epoch": 1.4192707877940018, "grad_norm": 3.372600793838501, "learning_rate": 5.271093701423863e-07, "loss": 0.1127, "step": 60870 }, { "epoch": 1.41950394917082, "grad_norm": 1.1977570056915283, "learning_rate": 5.270316483243176e-07, "loss": 0.1169, "step": 60880 }, { "epoch": 1.4197371105476377, "grad_norm": 2.8066580295562744, "learning_rate": 5.269539265062488e-07, "loss": 0.1179, "step": 60890 }, { "epoch": 1.4199702719244558, "grad_norm": 1.345266580581665, "learning_rate": 5.2687620468818e-07, "loss": 0.0988, "step": 60900 }, { "epoch": 1.4202034333012736, "grad_norm": 1.251085638999939, "learning_rate": 5.267984828701113e-07, "loss": 0.1144, "step": 60910 }, { "epoch": 1.4204365946780917, "grad_norm": 1.4296696186065674, "learning_rate": 5.267207610520426e-07, "loss": 0.0972, "step": 60920 }, { "epoch": 1.4206697560549095, "grad_norm": 1.441293716430664, "learning_rate": 5.266430392339737e-07, "loss": 0.1128, "step": 60930 }, { "epoch": 1.4209029174317274, "grad_norm": 
2.0931968688964844, "learning_rate": 5.26565317415905e-07, "loss": 0.1223, "step": 60940 }, { "epoch": 1.4211360788085454, "grad_norm": 2.307725667953491, "learning_rate": 5.264875955978361e-07, "loss": 0.1022, "step": 60950 }, { "epoch": 1.4213692401853633, "grad_norm": 1.082290768623352, "learning_rate": 5.264098737797674e-07, "loss": 0.1083, "step": 60960 }, { "epoch": 1.4216024015621813, "grad_norm": 1.4822487831115723, "learning_rate": 5.263321519616987e-07, "loss": 0.1018, "step": 60970 }, { "epoch": 1.4218355629389992, "grad_norm": 1.2261545658111572, "learning_rate": 5.262544301436299e-07, "loss": 0.1137, "step": 60980 }, { "epoch": 1.422068724315817, "grad_norm": 1.4464000463485718, "learning_rate": 5.261767083255612e-07, "loss": 0.1053, "step": 60990 }, { "epoch": 1.422301885692635, "grad_norm": 2.402799606323242, "learning_rate": 5.260989865074924e-07, "loss": 0.1189, "step": 61000 }, { "epoch": 1.4225350470694529, "grad_norm": 2.8567399978637695, "learning_rate": 5.260212646894235e-07, "loss": 0.1029, "step": 61010 }, { "epoch": 1.422768208446271, "grad_norm": 4.22568941116333, "learning_rate": 5.259435428713548e-07, "loss": 0.1125, "step": 61020 }, { "epoch": 1.4230013698230888, "grad_norm": 1.2038136720657349, "learning_rate": 5.25865821053286e-07, "loss": 0.1152, "step": 61030 }, { "epoch": 1.4232345311999066, "grad_norm": 2.1353659629821777, "learning_rate": 5.257880992352173e-07, "loss": 0.1149, "step": 61040 }, { "epoch": 1.4234676925767247, "grad_norm": 1.5781803131103516, "learning_rate": 5.257103774171485e-07, "loss": 0.1067, "step": 61050 }, { "epoch": 1.4237008539535427, "grad_norm": 2.29020357131958, "learning_rate": 5.256326555990798e-07, "loss": 0.1094, "step": 61060 }, { "epoch": 1.4239340153303606, "grad_norm": 2.0109856128692627, "learning_rate": 5.255549337810111e-07, "loss": 0.0979, "step": 61070 }, { "epoch": 1.4241671767071784, "grad_norm": 2.237948179244995, "learning_rate": 5.254772119629422e-07, "loss": 0.1094, "step": 61080 }, { 
"epoch": 1.4244003380839965, "grad_norm": 1.2947618961334229, "learning_rate": 5.253994901448734e-07, "loss": 0.1036, "step": 61090 }, { "epoch": 1.4246334994608143, "grad_norm": 3.469068765640259, "learning_rate": 5.253217683268046e-07, "loss": 0.1155, "step": 61100 }, { "epoch": 1.4248666608376324, "grad_norm": 1.709426999092102, "learning_rate": 5.252440465087359e-07, "loss": 0.1118, "step": 61110 }, { "epoch": 1.4250998222144502, "grad_norm": 1.0014539957046509, "learning_rate": 5.251663246906672e-07, "loss": 0.103, "step": 61120 }, { "epoch": 1.425332983591268, "grad_norm": 2.3221702575683594, "learning_rate": 5.250886028725984e-07, "loss": 0.1048, "step": 61130 }, { "epoch": 1.425566144968086, "grad_norm": 1.2310501337051392, "learning_rate": 5.250108810545296e-07, "loss": 0.1101, "step": 61140 }, { "epoch": 1.425799306344904, "grad_norm": 2.1137845516204834, "learning_rate": 5.249331592364608e-07, "loss": 0.1081, "step": 61150 }, { "epoch": 1.426032467721722, "grad_norm": 2.356130361557007, "learning_rate": 5.248554374183921e-07, "loss": 0.1101, "step": 61160 }, { "epoch": 1.4262656290985398, "grad_norm": 1.3510078191757202, "learning_rate": 5.247777156003233e-07, "loss": 0.1164, "step": 61170 }, { "epoch": 1.4264987904753577, "grad_norm": 1.3740253448486328, "learning_rate": 5.246999937822545e-07, "loss": 0.1064, "step": 61180 }, { "epoch": 1.4267319518521757, "grad_norm": 1.687147617340088, "learning_rate": 5.246222719641858e-07, "loss": 0.1034, "step": 61190 }, { "epoch": 1.4269651132289936, "grad_norm": 1.7765899896621704, "learning_rate": 5.245445501461169e-07, "loss": 0.1174, "step": 61200 }, { "epoch": 1.4271982746058116, "grad_norm": 1.6655596494674683, "learning_rate": 5.244668283280482e-07, "loss": 0.1126, "step": 61210 }, { "epoch": 1.4274314359826294, "grad_norm": 1.5651012659072876, "learning_rate": 5.243891065099795e-07, "loss": 0.1079, "step": 61220 }, { "epoch": 1.4276645973594473, "grad_norm": 2.2672128677368164, "learning_rate": 
5.243113846919107e-07, "loss": 0.103, "step": 61230 }, { "epoch": 1.4278977587362653, "grad_norm": 3.763139247894287, "learning_rate": 5.24233662873842e-07, "loss": 0.1026, "step": 61240 }, { "epoch": 1.4281309201130834, "grad_norm": 1.71270751953125, "learning_rate": 5.241559410557731e-07, "loss": 0.1083, "step": 61250 }, { "epoch": 1.4283640814899012, "grad_norm": 2.1717629432678223, "learning_rate": 5.240782192377043e-07, "loss": 0.1147, "step": 61260 }, { "epoch": 1.428597242866719, "grad_norm": 1.7698251008987427, "learning_rate": 5.240004974196356e-07, "loss": 0.1198, "step": 61270 }, { "epoch": 1.4288304042435371, "grad_norm": 1.6541739702224731, "learning_rate": 5.239227756015668e-07, "loss": 0.1116, "step": 61280 }, { "epoch": 1.429063565620355, "grad_norm": 1.0379300117492676, "learning_rate": 5.238450537834981e-07, "loss": 0.103, "step": 61290 }, { "epoch": 1.429296726997173, "grad_norm": 1.4176088571548462, "learning_rate": 5.237673319654294e-07, "loss": 0.1125, "step": 61300 }, { "epoch": 1.4295298883739909, "grad_norm": 3.1490554809570312, "learning_rate": 5.236896101473606e-07, "loss": 0.1183, "step": 61310 }, { "epoch": 1.4297630497508087, "grad_norm": 1.287609338760376, "learning_rate": 5.236118883292918e-07, "loss": 0.1035, "step": 61320 }, { "epoch": 1.4299962111276268, "grad_norm": 1.4875695705413818, "learning_rate": 5.235341665112229e-07, "loss": 0.1088, "step": 61330 }, { "epoch": 1.4302293725044446, "grad_norm": 1.6908442974090576, "learning_rate": 5.234564446931542e-07, "loss": 0.1094, "step": 61340 }, { "epoch": 1.4304625338812627, "grad_norm": 1.2835147380828857, "learning_rate": 5.233787228750854e-07, "loss": 0.1095, "step": 61350 }, { "epoch": 1.4306956952580805, "grad_norm": 1.1644660234451294, "learning_rate": 5.233010010570167e-07, "loss": 0.0974, "step": 61360 }, { "epoch": 1.4309288566348983, "grad_norm": 1.5765050649642944, "learning_rate": 5.23223279238948e-07, "loss": 0.1207, "step": 61370 }, { "epoch": 1.4311620180117164, 
"grad_norm": 1.6334742307662964, "learning_rate": 5.231455574208791e-07, "loss": 0.0972, "step": 61380 }, { "epoch": 1.4313951793885342, "grad_norm": 1.0233689546585083, "learning_rate": 5.230678356028104e-07, "loss": 0.1108, "step": 61390 }, { "epoch": 1.4316283407653523, "grad_norm": 3.1962151527404785, "learning_rate": 5.229901137847417e-07, "loss": 0.1078, "step": 61400 }, { "epoch": 1.4318615021421701, "grad_norm": 1.735973834991455, "learning_rate": 5.229123919666728e-07, "loss": 0.118, "step": 61410 }, { "epoch": 1.432094663518988, "grad_norm": 1.4001964330673218, "learning_rate": 5.228346701486041e-07, "loss": 0.1091, "step": 61420 }, { "epoch": 1.432327824895806, "grad_norm": 1.1225779056549072, "learning_rate": 5.227569483305353e-07, "loss": 0.1133, "step": 61430 }, { "epoch": 1.432560986272624, "grad_norm": 2.7467117309570312, "learning_rate": 5.226792265124665e-07, "loss": 0.1019, "step": 61440 }, { "epoch": 1.432794147649442, "grad_norm": 2.7745912075042725, "learning_rate": 5.226015046943978e-07, "loss": 0.0996, "step": 61450 }, { "epoch": 1.4330273090262597, "grad_norm": 1.7831156253814697, "learning_rate": 5.22523782876329e-07, "loss": 0.1051, "step": 61460 }, { "epoch": 1.4332604704030778, "grad_norm": 1.31305730342865, "learning_rate": 5.224460610582603e-07, "loss": 0.1152, "step": 61470 }, { "epoch": 1.4334936317798956, "grad_norm": null, "learning_rate": 5.223761114219983e-07, "loss": 0.1053, "step": 61480 }, { "epoch": 1.4337267931567137, "grad_norm": 1.4663364887237549, "learning_rate": 5.222983896039296e-07, "loss": 0.111, "step": 61490 }, { "epoch": 1.4339599545335315, "grad_norm": 3.9705233573913574, "learning_rate": 5.222206677858608e-07, "loss": 0.1215, "step": 61500 }, { "epoch": 1.4341931159103494, "grad_norm": 3.4747934341430664, "learning_rate": 5.221429459677921e-07, "loss": 0.1094, "step": 61510 }, { "epoch": 1.4344262772871674, "grad_norm": 1.905181884765625, "learning_rate": 5.220652241497233e-07, "loss": 0.1161, "step": 61520 
}, { "epoch": 1.4346594386639853, "grad_norm": 1.2314388751983643, "learning_rate": 5.219875023316546e-07, "loss": 0.0937, "step": 61530 }, { "epoch": 1.4348926000408033, "grad_norm": 1.0878297090530396, "learning_rate": 5.219097805135857e-07, "loss": 0.0911, "step": 61540 }, { "epoch": 1.4351257614176212, "grad_norm": 1.8635632991790771, "learning_rate": 5.218320586955169e-07, "loss": 0.1037, "step": 61550 }, { "epoch": 1.435358922794439, "grad_norm": 1.8240199089050293, "learning_rate": 5.217543368774482e-07, "loss": 0.1077, "step": 61560 }, { "epoch": 1.435592084171257, "grad_norm": 1.1603927612304688, "learning_rate": 5.216766150593794e-07, "loss": 0.1034, "step": 61570 }, { "epoch": 1.435825245548075, "grad_norm": 1.5660676956176758, "learning_rate": 5.215988932413107e-07, "loss": 0.0965, "step": 61580 }, { "epoch": 1.436058406924893, "grad_norm": 1.9403679370880127, "learning_rate": 5.21521171423242e-07, "loss": 0.1208, "step": 61590 }, { "epoch": 1.4362915683017108, "grad_norm": 3.0284664630889893, "learning_rate": 5.2145122178698e-07, "loss": 0.1145, "step": 61600 }, { "epoch": 1.4365247296785286, "grad_norm": 2.211289644241333, "learning_rate": 5.213734999689113e-07, "loss": 0.1195, "step": 61610 }, { "epoch": 1.4367578910553467, "grad_norm": 1.4732023477554321, "learning_rate": 5.212957781508425e-07, "loss": 0.1109, "step": 61620 }, { "epoch": 1.4369910524321647, "grad_norm": 1.8039799928665161, "learning_rate": 5.212180563327737e-07, "loss": 0.1066, "step": 61630 }, { "epoch": 1.4372242138089826, "grad_norm": 1.5516927242279053, "learning_rate": 5.211403345147049e-07, "loss": 0.1029, "step": 61640 }, { "epoch": 1.4374573751858004, "grad_norm": 1.6059949398040771, "learning_rate": 5.210626126966362e-07, "loss": 0.1169, "step": 61650 }, { "epoch": 1.4376905365626185, "grad_norm": 2.2971601486206055, "learning_rate": 5.209848908785675e-07, "loss": 0.1188, "step": 61660 }, { "epoch": 1.4379236979394363, "grad_norm": 1.8538018465042114, "learning_rate": 
5.209071690604986e-07, "loss": 0.1165, "step": 61670 }, { "epoch": 1.4381568593162544, "grad_norm": 1.522165060043335, "learning_rate": 5.208294472424299e-07, "loss": 0.1206, "step": 61680 }, { "epoch": 1.4383900206930722, "grad_norm": 1.413742184638977, "learning_rate": 5.20751725424361e-07, "loss": 0.1051, "step": 61690 }, { "epoch": 1.43862318206989, "grad_norm": 2.2782373428344727, "learning_rate": 5.206740036062923e-07, "loss": 0.1131, "step": 61700 }, { "epoch": 1.438856343446708, "grad_norm": 1.6149879693984985, "learning_rate": 5.205962817882236e-07, "loss": 0.1129, "step": 61710 }, { "epoch": 1.439089504823526, "grad_norm": 1.8572182655334473, "learning_rate": 5.205185599701548e-07, "loss": 0.1111, "step": 61720 }, { "epoch": 1.439322666200344, "grad_norm": 1.784849762916565, "learning_rate": 5.204408381520861e-07, "loss": 0.1158, "step": 61730 }, { "epoch": 1.4395558275771618, "grad_norm": 1.2880500555038452, "learning_rate": 5.203631163340173e-07, "loss": 0.1159, "step": 61740 }, { "epoch": 1.4397889889539797, "grad_norm": 1.4428573846817017, "learning_rate": 5.202853945159484e-07, "loss": 0.1019, "step": 61750 }, { "epoch": 1.4400221503307977, "grad_norm": 1.9063414335250854, "learning_rate": 5.202076726978797e-07, "loss": 0.1033, "step": 61760 }, { "epoch": 1.4402553117076156, "grad_norm": 2.7599942684173584, "learning_rate": 5.201299508798109e-07, "loss": 0.1048, "step": 61770 }, { "epoch": 1.4404884730844336, "grad_norm": 1.7333201169967651, "learning_rate": 5.200522290617422e-07, "loss": 0.112, "step": 61780 }, { "epoch": 1.4407216344612515, "grad_norm": 1.1905814409255981, "learning_rate": 5.199745072436734e-07, "loss": 0.0934, "step": 61790 }, { "epoch": 1.4409547958380693, "grad_norm": 1.5416107177734375, "learning_rate": 5.198967854256047e-07, "loss": 0.113, "step": 61800 }, { "epoch": 1.4411879572148873, "grad_norm": 1.301985263824463, "learning_rate": 5.19819063607536e-07, "loss": 0.11, "step": 61810 }, { "epoch": 1.4414211185917054, 
"grad_norm": 2.1710286140441895, "learning_rate": 5.197413417894671e-07, "loss": 0.12, "step": 61820 }, { "epoch": 1.4416542799685232, "grad_norm": 1.8766244649887085, "learning_rate": 5.196636199713983e-07, "loss": 0.0995, "step": 61830 }, { "epoch": 1.441887441345341, "grad_norm": 3.1614725589752197, "learning_rate": 5.195858981533295e-07, "loss": 0.1139, "step": 61840 }, { "epoch": 1.4421206027221591, "grad_norm": 1.1533641815185547, "learning_rate": 5.195081763352608e-07, "loss": 0.0973, "step": 61850 }, { "epoch": 1.442353764098977, "grad_norm": 1.6163685321807861, "learning_rate": 5.194304545171921e-07, "loss": 0.1125, "step": 61860 }, { "epoch": 1.442586925475795, "grad_norm": 2.327188014984131, "learning_rate": 5.193527326991233e-07, "loss": 0.1108, "step": 61870 }, { "epoch": 1.4428200868526129, "grad_norm": 1.9778754711151123, "learning_rate": 5.192750108810545e-07, "loss": 0.1027, "step": 61880 }, { "epoch": 1.4430532482294307, "grad_norm": 1.6901018619537354, "learning_rate": 5.191972890629857e-07, "loss": 0.1047, "step": 61890 }, { "epoch": 1.4432864096062488, "grad_norm": 1.375587821006775, "learning_rate": 5.19119567244917e-07, "loss": 0.0949, "step": 61900 }, { "epoch": 1.4435195709830666, "grad_norm": 1.2673951387405396, "learning_rate": 5.190418454268482e-07, "loss": 0.1201, "step": 61910 }, { "epoch": 1.4437527323598847, "grad_norm": 1.431261420249939, "learning_rate": 5.189641236087794e-07, "loss": 0.107, "step": 61920 }, { "epoch": 1.4439858937367025, "grad_norm": 1.955349087715149, "learning_rate": 5.188864017907107e-07, "loss": 0.1069, "step": 61930 }, { "epoch": 1.4442190551135203, "grad_norm": 1.8729703426361084, "learning_rate": 5.188086799726418e-07, "loss": 0.1027, "step": 61940 }, { "epoch": 1.4444522164903384, "grad_norm": 2.3654351234436035, "learning_rate": 5.187309581545731e-07, "loss": 0.1146, "step": 61950 }, { "epoch": 1.4446853778671562, "grad_norm": 1.8914251327514648, "learning_rate": 5.186532363365044e-07, "loss": 0.1105, 
"step": 61960 }, { "epoch": 1.4449185392439743, "grad_norm": 1.9982370138168335, "learning_rate": 5.185755145184356e-07, "loss": 0.1052, "step": 61970 }, { "epoch": 1.4451517006207921, "grad_norm": 2.289797067642212, "learning_rate": 5.184977927003669e-07, "loss": 0.114, "step": 61980 }, { "epoch": 1.44538486199761, "grad_norm": 3.1654551029205322, "learning_rate": 5.184200708822982e-07, "loss": 0.1005, "step": 61990 }, { "epoch": 1.445618023374428, "grad_norm": 1.0441455841064453, "learning_rate": 5.183423490642292e-07, "loss": 0.098, "step": 62000 }, { "epoch": 1.445851184751246, "grad_norm": 1.7616956233978271, "learning_rate": 5.182646272461605e-07, "loss": 0.106, "step": 62010 }, { "epoch": 1.446084346128064, "grad_norm": 2.533994436264038, "learning_rate": 5.181869054280917e-07, "loss": 0.107, "step": 62020 }, { "epoch": 1.4463175075048817, "grad_norm": 1.2865071296691895, "learning_rate": 5.18109183610023e-07, "loss": 0.1006, "step": 62030 }, { "epoch": 1.4465506688816998, "grad_norm": 1.533934235572815, "learning_rate": 5.180314617919543e-07, "loss": 0.1199, "step": 62040 }, { "epoch": 1.4467838302585176, "grad_norm": 2.844728708267212, "learning_rate": 5.179537399738855e-07, "loss": 0.1129, "step": 62050 }, { "epoch": 1.4470169916353357, "grad_norm": 1.4482632875442505, "learning_rate": 5.178760181558167e-07, "loss": 0.1161, "step": 62060 }, { "epoch": 1.4472501530121535, "grad_norm": 1.3925096988677979, "learning_rate": 5.177982963377479e-07, "loss": 0.0923, "step": 62070 }, { "epoch": 1.4474833143889714, "grad_norm": 1.702351689338684, "learning_rate": 5.177205745196791e-07, "loss": 0.0982, "step": 62080 }, { "epoch": 1.4477164757657894, "grad_norm": 1.334070086479187, "learning_rate": 5.176428527016104e-07, "loss": 0.1094, "step": 62090 }, { "epoch": 1.4479496371426073, "grad_norm": 1.8177142143249512, "learning_rate": 5.175651308835416e-07, "loss": 0.1147, "step": 62100 }, { "epoch": 1.4481827985194253, "grad_norm": 1.3187559843063354, "learning_rate": 
5.174874090654729e-07, "loss": 0.1069, "step": 62110 }, { "epoch": 1.4484159598962432, "grad_norm": 1.166792869567871, "learning_rate": 5.17409687247404e-07, "loss": 0.1059, "step": 62120 }, { "epoch": 1.448649121273061, "grad_norm": 1.7178179025650024, "learning_rate": 5.173319654293353e-07, "loss": 0.1116, "step": 62130 }, { "epoch": 1.448882282649879, "grad_norm": 2.6199264526367188, "learning_rate": 5.172542436112666e-07, "loss": 0.1087, "step": 62140 }, { "epoch": 1.449115444026697, "grad_norm": 1.0697544813156128, "learning_rate": 5.171765217931978e-07, "loss": 0.1056, "step": 62150 }, { "epoch": 1.449348605403515, "grad_norm": 1.8557595014572144, "learning_rate": 5.17098799975129e-07, "loss": 0.1092, "step": 62160 }, { "epoch": 1.4495817667803328, "grad_norm": 2.1588239669799805, "learning_rate": 5.170210781570602e-07, "loss": 0.1038, "step": 62170 }, { "epoch": 1.4498149281571509, "grad_norm": 1.1595646142959595, "learning_rate": 5.169433563389914e-07, "loss": 0.1079, "step": 62180 }, { "epoch": 1.4500480895339687, "grad_norm": 2.1387951374053955, "learning_rate": 5.168656345209227e-07, "loss": 0.1045, "step": 62190 }, { "epoch": 1.4502812509107867, "grad_norm": 1.546189308166504, "learning_rate": 5.167879127028539e-07, "loss": 0.1097, "step": 62200 }, { "epoch": 1.4505144122876046, "grad_norm": 1.1981154680252075, "learning_rate": 5.167101908847852e-07, "loss": 0.1107, "step": 62210 }, { "epoch": 1.4507475736644224, "grad_norm": 2.32736873626709, "learning_rate": 5.166324690667164e-07, "loss": 0.1049, "step": 62220 }, { "epoch": 1.4509807350412405, "grad_norm": 2.853085994720459, "learning_rate": 5.165547472486477e-07, "loss": 0.1022, "step": 62230 }, { "epoch": 1.4512138964180583, "grad_norm": 1.4409767389297485, "learning_rate": 5.164770254305788e-07, "loss": 0.1034, "step": 62240 }, { "epoch": 1.4514470577948764, "grad_norm": 3.5658535957336426, "learning_rate": 5.1639930361251e-07, "loss": 0.0983, "step": 62250 }, { "epoch": 1.4516802191716942, 
"grad_norm": 1.7020469903945923, "learning_rate": 5.163215817944413e-07, "loss": 0.1002, "step": 62260 }, { "epoch": 1.451913380548512, "grad_norm": 1.3564900159835815, "learning_rate": 5.162438599763725e-07, "loss": 0.1266, "step": 62270 }, { "epoch": 1.45214654192533, "grad_norm": 1.7387462854385376, "learning_rate": 5.161661381583038e-07, "loss": 0.1058, "step": 62280 }, { "epoch": 1.452379703302148, "grad_norm": 1.46476149559021, "learning_rate": 5.160884163402351e-07, "loss": 0.1236, "step": 62290 }, { "epoch": 1.452612864678966, "grad_norm": 1.6917123794555664, "learning_rate": 5.160106945221663e-07, "loss": 0.1134, "step": 62300 }, { "epoch": 1.4528460260557838, "grad_norm": 1.2582297325134277, "learning_rate": 5.159329727040975e-07, "loss": 0.1087, "step": 62310 }, { "epoch": 1.4530791874326017, "grad_norm": 2.345045328140259, "learning_rate": 5.158552508860286e-07, "loss": 0.1116, "step": 62320 }, { "epoch": 1.4533123488094197, "grad_norm": 2.1542887687683105, "learning_rate": 5.157775290679599e-07, "loss": 0.1039, "step": 62330 }, { "epoch": 1.4535455101862378, "grad_norm": 2.1807565689086914, "learning_rate": 5.156998072498912e-07, "loss": 0.1164, "step": 62340 }, { "epoch": 1.4537786715630556, "grad_norm": 1.0915954113006592, "learning_rate": 5.156220854318224e-07, "loss": 0.1086, "step": 62350 }, { "epoch": 1.4540118329398735, "grad_norm": 1.767733097076416, "learning_rate": 5.155443636137537e-07, "loss": 0.1102, "step": 62360 }, { "epoch": 1.4542449943166915, "grad_norm": 2.0558021068573, "learning_rate": 5.154666417956848e-07, "loss": 0.1006, "step": 62370 }, { "epoch": 1.4544781556935094, "grad_norm": 1.3651421070098877, "learning_rate": 5.153889199776161e-07, "loss": 0.1161, "step": 62380 }, { "epoch": 1.4547113170703274, "grad_norm": 1.6265254020690918, "learning_rate": 5.153111981595474e-07, "loss": 0.1119, "step": 62390 }, { "epoch": 1.4549444784471453, "grad_norm": 1.2723162174224854, "learning_rate": 5.152334763414785e-07, "loss": 0.1056, 
"step": 62400 }, { "epoch": 1.455177639823963, "grad_norm": 1.3749732971191406, "learning_rate": 5.151557545234098e-07, "loss": 0.113, "step": 62410 }, { "epoch": 1.4554108012007811, "grad_norm": 1.2301274538040161, "learning_rate": 5.15078032705341e-07, "loss": 0.102, "step": 62420 }, { "epoch": 1.455643962577599, "grad_norm": 4.905620574951172, "learning_rate": 5.150003108872722e-07, "loss": 0.1124, "step": 62430 }, { "epoch": 1.455877123954417, "grad_norm": 2.2891788482666016, "learning_rate": 5.149225890692035e-07, "loss": 0.1094, "step": 62440 }, { "epoch": 1.4561102853312349, "grad_norm": 2.8140995502471924, "learning_rate": 5.148448672511347e-07, "loss": 0.1104, "step": 62450 }, { "epoch": 1.4563434467080527, "grad_norm": 2.3474650382995605, "learning_rate": 5.14767145433066e-07, "loss": 0.1123, "step": 62460 }, { "epoch": 1.4565766080848708, "grad_norm": 3.448734998703003, "learning_rate": 5.146894236149973e-07, "loss": 0.1117, "step": 62470 }, { "epoch": 1.4568097694616886, "grad_norm": 2.648515462875366, "learning_rate": 5.146117017969284e-07, "loss": 0.0989, "step": 62480 }, { "epoch": 1.4570429308385067, "grad_norm": 1.0137470960617065, "learning_rate": 5.145339799788596e-07, "loss": 0.1059, "step": 62490 }, { "epoch": 1.4572760922153245, "grad_norm": 2.379002094268799, "learning_rate": 5.144562581607908e-07, "loss": 0.1166, "step": 62500 }, { "epoch": 1.4575092535921423, "grad_norm": 1.8289759159088135, "learning_rate": 5.143785363427221e-07, "loss": 0.0974, "step": 62510 }, { "epoch": 1.4577424149689604, "grad_norm": 2.794877529144287, "learning_rate": 5.143008145246534e-07, "loss": 0.1001, "step": 62520 }, { "epoch": 1.4579755763457785, "grad_norm": 1.3289645910263062, "learning_rate": 5.142230927065846e-07, "loss": 0.098, "step": 62530 }, { "epoch": 1.4582087377225963, "grad_norm": 1.2426869869232178, "learning_rate": 5.141453708885159e-07, "loss": 0.1006, "step": 62540 }, { "epoch": 1.4584418990994141, "grad_norm": 2.16935658454895, 
"learning_rate": 5.14067649070447e-07, "loss": 0.1044, "step": 62550 }, { "epoch": 1.4586750604762322, "grad_norm": 3.255861759185791, "learning_rate": 5.139899272523782e-07, "loss": 0.098, "step": 62560 }, { "epoch": 1.45890822185305, "grad_norm": 1.3942228555679321, "learning_rate": 5.139122054343095e-07, "loss": 0.1059, "step": 62570 }, { "epoch": 1.459141383229868, "grad_norm": 1.5698102712631226, "learning_rate": 5.138344836162407e-07, "loss": 0.1023, "step": 62580 }, { "epoch": 1.459374544606686, "grad_norm": 1.3420194387435913, "learning_rate": 5.13756761798172e-07, "loss": 0.1106, "step": 62590 }, { "epoch": 1.4596077059835038, "grad_norm": 1.3383049964904785, "learning_rate": 5.136790399801032e-07, "loss": 0.1105, "step": 62600 }, { "epoch": 1.4598408673603218, "grad_norm": 1.7079545259475708, "learning_rate": 5.136013181620344e-07, "loss": 0.1023, "step": 62610 }, { "epoch": 1.4600740287371397, "grad_norm": 2.0731430053710938, "learning_rate": 5.135235963439657e-07, "loss": 0.1153, "step": 62620 }, { "epoch": 1.4603071901139577, "grad_norm": 2.316680908203125, "learning_rate": 5.134458745258969e-07, "loss": 0.1051, "step": 62630 }, { "epoch": 1.4605403514907755, "grad_norm": 1.9521905183792114, "learning_rate": 5.133681527078281e-07, "loss": 0.1078, "step": 62640 }, { "epoch": 1.4607735128675934, "grad_norm": 1.9559357166290283, "learning_rate": 5.132904308897593e-07, "loss": 0.1027, "step": 62650 }, { "epoch": 1.4610066742444114, "grad_norm": 1.6062253713607788, "learning_rate": 5.132127090716906e-07, "loss": 0.1027, "step": 62660 }, { "epoch": 1.4612398356212293, "grad_norm": 3.0408525466918945, "learning_rate": 5.131349872536219e-07, "loss": 0.1163, "step": 62670 }, { "epoch": 1.4614729969980473, "grad_norm": 1.106744647026062, "learning_rate": 5.13057265435553e-07, "loss": 0.1051, "step": 62680 }, { "epoch": 1.4617061583748652, "grad_norm": 2.275766134262085, "learning_rate": 5.129795436174843e-07, "loss": 0.1002, "step": 62690 }, { "epoch": 
1.461939319751683, "grad_norm": 1.3273687362670898, "learning_rate": 5.129018217994155e-07, "loss": 0.1047, "step": 62700 }, { "epoch": 1.462172481128501, "grad_norm": 2.088824510574341, "learning_rate": 5.128240999813468e-07, "loss": 0.1005, "step": 62710 }, { "epoch": 1.4624056425053191, "grad_norm": 1.5929174423217773, "learning_rate": 5.12746378163278e-07, "loss": 0.1221, "step": 62720 }, { "epoch": 1.462638803882137, "grad_norm": 1.3838961124420166, "learning_rate": 5.126686563452092e-07, "loss": 0.1102, "step": 62730 }, { "epoch": 1.4628719652589548, "grad_norm": 1.2245246171951294, "learning_rate": 5.125909345271404e-07, "loss": 0.1071, "step": 62740 }, { "epoch": 1.4631051266357729, "grad_norm": 1.7493553161621094, "learning_rate": 5.125132127090716e-07, "loss": 0.0937, "step": 62750 }, { "epoch": 1.4633382880125907, "grad_norm": 1.8108025789260864, "learning_rate": 5.124354908910029e-07, "loss": 0.1122, "step": 62760 }, { "epoch": 1.4635714493894088, "grad_norm": 1.8687057495117188, "learning_rate": 5.123577690729342e-07, "loss": 0.1042, "step": 62770 }, { "epoch": 1.4638046107662266, "grad_norm": 2.4500107765197754, "learning_rate": 5.122800472548654e-07, "loss": 0.1139, "step": 62780 }, { "epoch": 1.4640377721430444, "grad_norm": 2.5812244415283203, "learning_rate": 5.122023254367967e-07, "loss": 0.1024, "step": 62790 }, { "epoch": 1.4642709335198625, "grad_norm": 2.0082666873931885, "learning_rate": 5.121246036187277e-07, "loss": 0.1081, "step": 62800 }, { "epoch": 1.4645040948966803, "grad_norm": 2.3052704334259033, "learning_rate": 5.12046881800659e-07, "loss": 0.0968, "step": 62810 }, { "epoch": 1.4647372562734984, "grad_norm": 1.8051146268844604, "learning_rate": 5.119691599825903e-07, "loss": 0.1111, "step": 62820 }, { "epoch": 1.4649704176503162, "grad_norm": 1.3534075021743774, "learning_rate": 5.118914381645215e-07, "loss": 0.1093, "step": 62830 }, { "epoch": 1.465203579027134, "grad_norm": 1.330283761024475, "learning_rate": 
5.118137163464528e-07, "loss": 0.1098, "step": 62840 }, { "epoch": 1.465436740403952, "grad_norm": 1.9660191535949707, "learning_rate": 5.11735994528384e-07, "loss": 0.1001, "step": 62850 }, { "epoch": 1.46566990178077, "grad_norm": 1.8782225847244263, "learning_rate": 5.116582727103152e-07, "loss": 0.1055, "step": 62860 }, { "epoch": 1.465903063157588, "grad_norm": 1.9736086130142212, "learning_rate": 5.115805508922465e-07, "loss": 0.104, "step": 62870 }, { "epoch": 1.4661362245344058, "grad_norm": 1.1577767133712769, "learning_rate": 5.115028290741776e-07, "loss": 0.0953, "step": 62880 }, { "epoch": 1.4663693859112237, "grad_norm": 1.5440654754638672, "learning_rate": 5.114251072561089e-07, "loss": 0.1052, "step": 62890 }, { "epoch": 1.4666025472880417, "grad_norm": 1.4134883880615234, "learning_rate": 5.113473854380402e-07, "loss": 0.1086, "step": 62900 }, { "epoch": 1.4668357086648598, "grad_norm": 1.185468316078186, "learning_rate": 5.112696636199714e-07, "loss": 0.1079, "step": 62910 }, { "epoch": 1.4670688700416776, "grad_norm": 1.4446622133255005, "learning_rate": 5.111919418019026e-07, "loss": 0.1074, "step": 62920 }, { "epoch": 1.4673020314184955, "grad_norm": 1.9152603149414062, "learning_rate": 5.111142199838338e-07, "loss": 0.1142, "step": 62930 }, { "epoch": 1.4675351927953135, "grad_norm": 2.0219802856445312, "learning_rate": 5.110364981657651e-07, "loss": 0.1152, "step": 62940 }, { "epoch": 1.4677683541721314, "grad_norm": 1.8896987438201904, "learning_rate": 5.109587763476964e-07, "loss": 0.1059, "step": 62950 }, { "epoch": 1.4680015155489494, "grad_norm": 1.8248227834701538, "learning_rate": 5.108810545296275e-07, "loss": 0.1106, "step": 62960 }, { "epoch": 1.4682346769257673, "grad_norm": 1.3650951385498047, "learning_rate": 5.108033327115588e-07, "loss": 0.1026, "step": 62970 }, { "epoch": 1.468467838302585, "grad_norm": 1.5283904075622559, "learning_rate": 5.107256108934899e-07, "loss": 0.1024, "step": 62980 }, { "epoch": 1.4687009996794032, 
"grad_norm": 1.2486727237701416, "learning_rate": 5.106478890754212e-07, "loss": 0.1055, "step": 62990 }, { "epoch": 1.468934161056221, "grad_norm": 2.3880693912506104, "learning_rate": 5.105701672573525e-07, "loss": 0.1033, "step": 63000 }, { "epoch": 1.469167322433039, "grad_norm": 1.279646635055542, "learning_rate": 5.104924454392837e-07, "loss": 0.0968, "step": 63010 }, { "epoch": 1.4694004838098569, "grad_norm": 1.3940452337265015, "learning_rate": 5.10414723621215e-07, "loss": 0.122, "step": 63020 }, { "epoch": 1.4696336451866747, "grad_norm": 1.7227990627288818, "learning_rate": 5.103370018031462e-07, "loss": 0.1184, "step": 63030 }, { "epoch": 1.4698668065634928, "grad_norm": 2.7549920082092285, "learning_rate": 5.102592799850773e-07, "loss": 0.0989, "step": 63040 }, { "epoch": 1.4700999679403106, "grad_norm": 2.837466239929199, "learning_rate": 5.101815581670086e-07, "loss": 0.1081, "step": 63050 }, { "epoch": 1.4703331293171287, "grad_norm": 1.2817481756210327, "learning_rate": 5.101038363489398e-07, "loss": 0.1045, "step": 63060 }, { "epoch": 1.4705662906939465, "grad_norm": 1.226894497871399, "learning_rate": 5.100261145308711e-07, "loss": 0.1078, "step": 63070 }, { "epoch": 1.4707994520707643, "grad_norm": 1.4272054433822632, "learning_rate": 5.099483927128023e-07, "loss": 0.0958, "step": 63080 }, { "epoch": 1.4710326134475824, "grad_norm": 1.7297295331954956, "learning_rate": 5.098706708947336e-07, "loss": 0.1177, "step": 63090 }, { "epoch": 1.4712657748244005, "grad_norm": 2.643423080444336, "learning_rate": 5.097929490766649e-07, "loss": 0.1083, "step": 63100 }, { "epoch": 1.4714989362012183, "grad_norm": 2.231189250946045, "learning_rate": 5.09715227258596e-07, "loss": 0.1084, "step": 63110 }, { "epoch": 1.4717320975780361, "grad_norm": 1.3611557483673096, "learning_rate": 5.096375054405272e-07, "loss": 0.1031, "step": 63120 }, { "epoch": 1.4719652589548542, "grad_norm": 2.3515050411224365, "learning_rate": 5.095597836224584e-07, "loss": 0.1164, 
"step": 63130 }, { "epoch": 1.472198420331672, "grad_norm": 1.770559310913086, "learning_rate": 5.094820618043897e-07, "loss": 0.1219, "step": 63140 }, { "epoch": 1.47243158170849, "grad_norm": 1.345253586769104, "learning_rate": 5.09404339986321e-07, "loss": 0.109, "step": 63150 }, { "epoch": 1.472664743085308, "grad_norm": 1.6892890930175781, "learning_rate": 5.093266181682522e-07, "loss": 0.1158, "step": 63160 }, { "epoch": 1.4728979044621258, "grad_norm": 1.2133983373641968, "learning_rate": 5.092488963501834e-07, "loss": 0.1045, "step": 63170 }, { "epoch": 1.4731310658389438, "grad_norm": 1.5966980457305908, "learning_rate": 5.091711745321146e-07, "loss": 0.1112, "step": 63180 }, { "epoch": 1.4733642272157617, "grad_norm": 1.9651715755462646, "learning_rate": 5.090934527140459e-07, "loss": 0.1094, "step": 63190 }, { "epoch": 1.4735973885925797, "grad_norm": 2.8736066818237305, "learning_rate": 5.090157308959771e-07, "loss": 0.1064, "step": 63200 }, { "epoch": 1.4738305499693976, "grad_norm": 2.426327705383301, "learning_rate": 5.089380090779083e-07, "loss": 0.1117, "step": 63210 }, { "epoch": 1.4740637113462154, "grad_norm": 1.6298398971557617, "learning_rate": 5.088602872598396e-07, "loss": 0.1143, "step": 63220 }, { "epoch": 1.4742968727230334, "grad_norm": 1.2349038124084473, "learning_rate": 5.087825654417707e-07, "loss": 0.1056, "step": 63230 }, { "epoch": 1.4745300340998513, "grad_norm": 1.6821022033691406, "learning_rate": 5.08704843623702e-07, "loss": 0.1036, "step": 63240 }, { "epoch": 1.4747631954766693, "grad_norm": 1.4826772212982178, "learning_rate": 5.086271218056333e-07, "loss": 0.0983, "step": 63250 }, { "epoch": 1.4749963568534872, "grad_norm": 1.4528076648712158, "learning_rate": 5.085493999875645e-07, "loss": 0.1236, "step": 63260 }, { "epoch": 1.475229518230305, "grad_norm": 1.2604013681411743, "learning_rate": 5.084716781694958e-07, "loss": 0.113, "step": 63270 }, { "epoch": 1.475462679607123, "grad_norm": 2.146472930908203, 
"learning_rate": 5.08393956351427e-07, "loss": 0.1085, "step": 63280 }, { "epoch": 1.4756958409839411, "grad_norm": 2.8945021629333496, "learning_rate": 5.083162345333581e-07, "loss": 0.1162, "step": 63290 }, { "epoch": 1.475929002360759, "grad_norm": 1.2204234600067139, "learning_rate": 5.082385127152894e-07, "loss": 0.108, "step": 63300 }, { "epoch": 1.4761621637375768, "grad_norm": 1.0126183032989502, "learning_rate": 5.081607908972206e-07, "loss": 0.1063, "step": 63310 }, { "epoch": 1.4763953251143949, "grad_norm": 1.1797765493392944, "learning_rate": 5.080830690791519e-07, "loss": 0.1024, "step": 63320 }, { "epoch": 1.4766284864912127, "grad_norm": 1.3259143829345703, "learning_rate": 5.080053472610832e-07, "loss": 0.0951, "step": 63330 }, { "epoch": 1.4768616478680308, "grad_norm": 1.1731618642807007, "learning_rate": 5.079276254430144e-07, "loss": 0.1035, "step": 63340 }, { "epoch": 1.4770948092448486, "grad_norm": 1.6055927276611328, "learning_rate": 5.078499036249456e-07, "loss": 0.1034, "step": 63350 }, { "epoch": 1.4773279706216664, "grad_norm": 1.43122136592865, "learning_rate": 5.077721818068767e-07, "loss": 0.1073, "step": 63360 }, { "epoch": 1.4775611319984845, "grad_norm": 1.3033651113510132, "learning_rate": 5.07694459988808e-07, "loss": 0.0999, "step": 63370 }, { "epoch": 1.4777942933753023, "grad_norm": 1.0760811567306519, "learning_rate": 5.076167381707393e-07, "loss": 0.1013, "step": 63380 }, { "epoch": 1.4780274547521204, "grad_norm": 1.4998162984848022, "learning_rate": 5.075390163526705e-07, "loss": 0.1083, "step": 63390 }, { "epoch": 1.4782606161289382, "grad_norm": 1.3513725996017456, "learning_rate": 5.074612945346018e-07, "loss": 0.0931, "step": 63400 }, { "epoch": 1.478493777505756, "grad_norm": 1.966417670249939, "learning_rate": 5.073835727165329e-07, "loss": 0.1173, "step": 63410 }, { "epoch": 1.4787269388825741, "grad_norm": 1.3677290678024292, "learning_rate": 5.073058508984642e-07, "loss": 0.1118, "step": 63420 }, { "epoch": 
1.478960100259392, "grad_norm": 1.6918344497680664, "learning_rate": 5.072281290803955e-07, "loss": 0.1172, "step": 63430 }, { "epoch": 1.47919326163621, "grad_norm": 1.9343291521072388, "learning_rate": 5.071504072623266e-07, "loss": 0.1174, "step": 63440 }, { "epoch": 1.4794264230130278, "grad_norm": 2.570331335067749, "learning_rate": 5.070726854442579e-07, "loss": 0.1015, "step": 63450 }, { "epoch": 1.4796595843898457, "grad_norm": 2.8527944087982178, "learning_rate": 5.069949636261891e-07, "loss": 0.114, "step": 63460 }, { "epoch": 1.4798927457666637, "grad_norm": 1.2679582834243774, "learning_rate": 5.069172418081203e-07, "loss": 0.1108, "step": 63470 }, { "epoch": 1.4801259071434818, "grad_norm": 2.6892173290252686, "learning_rate": 5.068395199900516e-07, "loss": 0.0972, "step": 63480 }, { "epoch": 1.4803590685202996, "grad_norm": 1.2220635414123535, "learning_rate": 5.067617981719828e-07, "loss": 0.1052, "step": 63490 }, { "epoch": 1.4805922298971175, "grad_norm": 2.1471498012542725, "learning_rate": 5.066840763539141e-07, "loss": 0.1143, "step": 63500 }, { "epoch": 1.4808253912739355, "grad_norm": 2.1729447841644287, "learning_rate": 5.066063545358453e-07, "loss": 0.1096, "step": 63510 }, { "epoch": 1.4810585526507534, "grad_norm": 2.154050350189209, "learning_rate": 5.065286327177765e-07, "loss": 0.1048, "step": 63520 }, { "epoch": 1.4812917140275714, "grad_norm": 1.281006097793579, "learning_rate": 5.064509108997077e-07, "loss": 0.0929, "step": 63530 }, { "epoch": 1.4815248754043893, "grad_norm": 1.3988088369369507, "learning_rate": 5.063731890816389e-07, "loss": 0.1143, "step": 63540 }, { "epoch": 1.481758036781207, "grad_norm": 1.2154819965362549, "learning_rate": 5.062954672635702e-07, "loss": 0.1141, "step": 63550 }, { "epoch": 1.4819911981580252, "grad_norm": 1.4470434188842773, "learning_rate": 5.062177454455014e-07, "loss": 0.101, "step": 63560 }, { "epoch": 1.482224359534843, "grad_norm": 2.20082426071167, "learning_rate": 5.061400236274327e-07, 
"loss": 0.1061, "step": 63570 }, { "epoch": 1.482457520911661, "grad_norm": 1.363709807395935, "learning_rate": 5.06062301809364e-07, "loss": 0.115, "step": 63580 }, { "epoch": 1.482690682288479, "grad_norm": 4.054806232452393, "learning_rate": 5.059845799912952e-07, "loss": 0.0986, "step": 63590 }, { "epoch": 1.4829238436652967, "grad_norm": 1.226441740989685, "learning_rate": 5.059068581732263e-07, "loss": 0.1064, "step": 63600 }, { "epoch": 1.4831570050421148, "grad_norm": 2.8493218421936035, "learning_rate": 5.058291363551575e-07, "loss": 0.1091, "step": 63610 }, { "epoch": 1.4833901664189326, "grad_norm": 1.3239809274673462, "learning_rate": 5.057514145370888e-07, "loss": 0.112, "step": 63620 }, { "epoch": 1.4836233277957507, "grad_norm": 1.6557352542877197, "learning_rate": 5.056736927190201e-07, "loss": 0.1019, "step": 63630 }, { "epoch": 1.4838564891725685, "grad_norm": 1.4757550954818726, "learning_rate": 5.055959709009513e-07, "loss": 0.1094, "step": 63640 }, { "epoch": 1.4840896505493866, "grad_norm": 2.7209534645080566, "learning_rate": 5.055182490828826e-07, "loss": 0.1226, "step": 63650 }, { "epoch": 1.4843228119262044, "grad_norm": 1.936465859413147, "learning_rate": 5.054405272648137e-07, "loss": 0.1087, "step": 63660 }, { "epoch": 1.4845559733030225, "grad_norm": 1.3615939617156982, "learning_rate": 5.05362805446745e-07, "loss": 0.1057, "step": 63670 }, { "epoch": 1.4847891346798403, "grad_norm": 1.5619910955429077, "learning_rate": 5.052850836286762e-07, "loss": 0.1063, "step": 63680 }, { "epoch": 1.4850222960566581, "grad_norm": 1.6990866661071777, "learning_rate": 5.052073618106074e-07, "loss": 0.1096, "step": 63690 }, { "epoch": 1.4852554574334762, "grad_norm": 2.2025649547576904, "learning_rate": 5.051296399925387e-07, "loss": 0.1052, "step": 63700 }, { "epoch": 1.485488618810294, "grad_norm": 1.4365208148956299, "learning_rate": 5.0505191817447e-07, "loss": 0.1054, "step": 63710 }, { "epoch": 1.485721780187112, "grad_norm": 
1.3274511098861694, "learning_rate": 5.049741963564011e-07, "loss": 0.1226, "step": 63720 }, { "epoch": 1.48595494156393, "grad_norm": 1.4897384643554688, "learning_rate": 5.048964745383324e-07, "loss": 0.1091, "step": 63730 }, { "epoch": 1.4861881029407478, "grad_norm": 1.3331247568130493, "learning_rate": 5.048187527202636e-07, "loss": 0.1032, "step": 63740 }, { "epoch": 1.4864212643175658, "grad_norm": 5.233120918273926, "learning_rate": 5.047410309021949e-07, "loss": 0.1091, "step": 63750 }, { "epoch": 1.4866544256943837, "grad_norm": 1.4986354112625122, "learning_rate": 5.04663309084126e-07, "loss": 0.1097, "step": 63760 }, { "epoch": 1.4868875870712017, "grad_norm": 1.230137825012207, "learning_rate": 5.045855872660573e-07, "loss": 0.116, "step": 63770 }, { "epoch": 1.4871207484480196, "grad_norm": 1.6808851957321167, "learning_rate": 5.045078654479885e-07, "loss": 0.1067, "step": 63780 }, { "epoch": 1.4873539098248374, "grad_norm": 1.9676432609558105, "learning_rate": 5.044301436299197e-07, "loss": 0.1139, "step": 63790 }, { "epoch": 1.4875870712016555, "grad_norm": 1.5584888458251953, "learning_rate": 5.04352421811851e-07, "loss": 0.1077, "step": 63800 }, { "epoch": 1.4878202325784735, "grad_norm": 1.2317097187042236, "learning_rate": 5.042746999937823e-07, "loss": 0.1068, "step": 63810 }, { "epoch": 1.4880533939552913, "grad_norm": 2.3638947010040283, "learning_rate": 5.041969781757135e-07, "loss": 0.1049, "step": 63820 }, { "epoch": 1.4882865553321092, "grad_norm": 3.1013827323913574, "learning_rate": 5.041192563576448e-07, "loss": 0.1017, "step": 63830 }, { "epoch": 1.4885197167089272, "grad_norm": 1.7247141599655151, "learning_rate": 5.040415345395758e-07, "loss": 0.1185, "step": 63840 }, { "epoch": 1.488752878085745, "grad_norm": 1.1755602359771729, "learning_rate": 5.039638127215071e-07, "loss": 0.1042, "step": 63850 }, { "epoch": 1.4889860394625631, "grad_norm": 1.721143364906311, "learning_rate": 5.038860909034384e-07, "loss": 0.1097, "step": 63860 
}, { "epoch": 1.489219200839381, "grad_norm": 3.1222894191741943, "learning_rate": 5.038083690853696e-07, "loss": 0.1128, "step": 63870 }, { "epoch": 1.4894523622161988, "grad_norm": 1.848918080329895, "learning_rate": 5.037306472673009e-07, "loss": 0.1169, "step": 63880 }, { "epoch": 1.4896855235930169, "grad_norm": 1.3468163013458252, "learning_rate": 5.036529254492321e-07, "loss": 0.1151, "step": 63890 }, { "epoch": 1.4899186849698347, "grad_norm": 3.0470263957977295, "learning_rate": 5.035752036311633e-07, "loss": 0.1003, "step": 63900 }, { "epoch": 1.4901518463466528, "grad_norm": 2.1156997680664062, "learning_rate": 5.034974818130946e-07, "loss": 0.1039, "step": 63910 }, { "epoch": 1.4903850077234706, "grad_norm": 1.7090930938720703, "learning_rate": 5.034197599950257e-07, "loss": 0.115, "step": 63920 }, { "epoch": 1.4906181691002884, "grad_norm": 2.7594244480133057, "learning_rate": 5.03342038176957e-07, "loss": 0.1018, "step": 63930 }, { "epoch": 1.4908513304771065, "grad_norm": 1.5735472440719604, "learning_rate": 5.032643163588882e-07, "loss": 0.0983, "step": 63940 }, { "epoch": 1.4910844918539243, "grad_norm": 1.8249845504760742, "learning_rate": 5.031865945408195e-07, "loss": 0.1144, "step": 63950 }, { "epoch": 1.4913176532307424, "grad_norm": 2.257359266281128, "learning_rate": 5.031088727227508e-07, "loss": 0.1054, "step": 63960 }, { "epoch": 1.4915508146075602, "grad_norm": 1.2754954099655151, "learning_rate": 5.030311509046819e-07, "loss": 0.1121, "step": 63970 }, { "epoch": 1.491783975984378, "grad_norm": 1.5341757535934448, "learning_rate": 5.029534290866132e-07, "loss": 0.1255, "step": 63980 }, { "epoch": 1.4920171373611961, "grad_norm": 1.6519237756729126, "learning_rate": 5.028757072685444e-07, "loss": 0.1043, "step": 63990 }, { "epoch": 1.4922502987380142, "grad_norm": 3.297121524810791, "learning_rate": 5.027979854504756e-07, "loss": 0.1137, "step": 64000 }, { "epoch": 1.492483460114832, "grad_norm": 1.0027825832366943, "learning_rate": 
5.027202636324069e-07, "loss": 0.0996, "step": 64010 }, { "epoch": 1.4927166214916499, "grad_norm": 1.3287371397018433, "learning_rate": 5.026425418143381e-07, "loss": 0.1209, "step": 64020 }, { "epoch": 1.492949782868468, "grad_norm": 2.4331696033477783, "learning_rate": 5.025648199962693e-07, "loss": 0.109, "step": 64030 }, { "epoch": 1.4931829442452857, "grad_norm": 1.272165298461914, "learning_rate": 5.024870981782005e-07, "loss": 0.1068, "step": 64040 }, { "epoch": 1.4934161056221038, "grad_norm": 1.0965193510055542, "learning_rate": 5.024093763601318e-07, "loss": 0.1012, "step": 64050 }, { "epoch": 1.4936492669989216, "grad_norm": 1.3797097206115723, "learning_rate": 5.023316545420631e-07, "loss": 0.1054, "step": 64060 }, { "epoch": 1.4938824283757395, "grad_norm": 2.143611431121826, "learning_rate": 5.022539327239943e-07, "loss": 0.1098, "step": 64070 }, { "epoch": 1.4941155897525575, "grad_norm": 1.24626886844635, "learning_rate": 5.021762109059255e-07, "loss": 0.1152, "step": 64080 }, { "epoch": 1.4943487511293754, "grad_norm": 1.7219884395599365, "learning_rate": 5.020984890878566e-07, "loss": 0.1126, "step": 64090 }, { "epoch": 1.4945819125061934, "grad_norm": 1.6216565370559692, "learning_rate": 5.020207672697879e-07, "loss": 0.1115, "step": 64100 }, { "epoch": 1.4948150738830113, "grad_norm": 2.4032795429229736, "learning_rate": 5.019430454517192e-07, "loss": 0.0996, "step": 64110 }, { "epoch": 1.495048235259829, "grad_norm": 1.4247300624847412, "learning_rate": 5.018653236336504e-07, "loss": 0.109, "step": 64120 }, { "epoch": 1.4952813966366472, "grad_norm": 1.483505368232727, "learning_rate": 5.017876018155817e-07, "loss": 0.1096, "step": 64130 }, { "epoch": 1.495514558013465, "grad_norm": 1.343104362487793, "learning_rate": 5.01709879997513e-07, "loss": 0.1123, "step": 64140 }, { "epoch": 1.495747719390283, "grad_norm": 1.4179506301879883, "learning_rate": 5.016321581794441e-07, "loss": 0.1048, "step": 64150 }, { "epoch": 1.495980880767101, 
"grad_norm": 1.4855067729949951, "learning_rate": 5.015544363613753e-07, "loss": 0.1079, "step": 64160 }, { "epoch": 1.4962140421439187, "grad_norm": 2.6450536251068115, "learning_rate": 5.014767145433065e-07, "loss": 0.0972, "step": 64170 }, { "epoch": 1.4964472035207368, "grad_norm": 1.6304303407669067, "learning_rate": 5.013989927252378e-07, "loss": 0.11, "step": 64180 }, { "epoch": 1.4966803648975548, "grad_norm": 2.0475335121154785, "learning_rate": 5.01321270907169e-07, "loss": 0.1181, "step": 64190 }, { "epoch": 1.4969135262743727, "grad_norm": 1.3360880613327026, "learning_rate": 5.012435490891003e-07, "loss": 0.1078, "step": 64200 }, { "epoch": 1.4971466876511905, "grad_norm": 1.0576157569885254, "learning_rate": 5.011658272710315e-07, "loss": 0.1078, "step": 64210 }, { "epoch": 1.4973798490280086, "grad_norm": 1.852280855178833, "learning_rate": 5.010881054529627e-07, "loss": 0.1168, "step": 64220 }, { "epoch": 1.4976130104048264, "grad_norm": 1.2254455089569092, "learning_rate": 5.01010383634894e-07, "loss": 0.1083, "step": 64230 }, { "epoch": 1.4978461717816445, "grad_norm": 1.7122008800506592, "learning_rate": 5.009326618168251e-07, "loss": 0.1091, "step": 64240 }, { "epoch": 1.4980793331584623, "grad_norm": 2.130373239517212, "learning_rate": 5.008549399987564e-07, "loss": 0.1071, "step": 64250 }, { "epoch": 1.4983124945352801, "grad_norm": 1.5222433805465698, "learning_rate": 5.007772181806877e-07, "loss": 0.1068, "step": 64260 }, { "epoch": 1.4985456559120982, "grad_norm": 1.4346880912780762, "learning_rate": 5.006994963626188e-07, "loss": 0.1158, "step": 64270 }, { "epoch": 1.498778817288916, "grad_norm": 2.9166085720062256, "learning_rate": 5.006217745445501e-07, "loss": 0.0985, "step": 64280 }, { "epoch": 1.499011978665734, "grad_norm": 2.0690598487854004, "learning_rate": 5.005440527264814e-07, "loss": 0.1124, "step": 64290 }, { "epoch": 1.499245140042552, "grad_norm": 1.3615410327911377, "learning_rate": 5.004663309084126e-07, "loss": 0.1031, 
"step": 64300 }, { "epoch": 1.4994783014193698, "grad_norm": 1.1722882986068726, "learning_rate": 5.003886090903439e-07, "loss": 0.1126, "step": 64310 }, { "epoch": 1.4997114627961878, "grad_norm": 2.90535569190979, "learning_rate": 5.00310887272275e-07, "loss": 0.0915, "step": 64320 }, { "epoch": 1.4999446241730057, "grad_norm": 2.177422285079956, "learning_rate": 5.002331654542062e-07, "loss": 0.1045, "step": 64330 }, { "epoch": 1.5001777855498237, "grad_norm": 1.3043063879013062, "learning_rate": 5.001554436361375e-07, "loss": 0.111, "step": 64340 }, { "epoch": 1.5004109469266416, "grad_norm": 1.760979175567627, "learning_rate": 5.000777218180687e-07, "loss": 0.12, "step": 64350 }, { "epoch": 1.5006441083034594, "grad_norm": 1.356188178062439, "learning_rate": 5e-07, "loss": 0.1087, "step": 64360 }, { "epoch": 1.5008772696802775, "grad_norm": 1.4664485454559326, "learning_rate": 4.999222781819312e-07, "loss": 0.1161, "step": 64370 }, { "epoch": 1.5011104310570955, "grad_norm": 1.818361759185791, "learning_rate": 4.998445563638625e-07, "loss": 0.1022, "step": 64380 }, { "epoch": 1.5013435924339134, "grad_norm": 1.2997559309005737, "learning_rate": 4.997668345457937e-07, "loss": 0.1004, "step": 64390 }, { "epoch": 1.5015767538107312, "grad_norm": 3.800960063934326, "learning_rate": 4.996891127277249e-07, "loss": 0.1162, "step": 64400 }, { "epoch": 1.501809915187549, "grad_norm": 1.9124637842178345, "learning_rate": 4.996113909096562e-07, "loss": 0.1132, "step": 64410 }, { "epoch": 1.502043076564367, "grad_norm": 1.3343380689620972, "learning_rate": 4.995336690915874e-07, "loss": 0.1056, "step": 64420 }, { "epoch": 1.5022762379411851, "grad_norm": 1.3818451166152954, "learning_rate": 4.994559472735186e-07, "loss": 0.1117, "step": 64430 }, { "epoch": 1.502509399318003, "grad_norm": 1.3016576766967773, "learning_rate": 4.993782254554499e-07, "loss": 0.1101, "step": 64440 }, { "epoch": 1.5027425606948208, "grad_norm": 2.385747194290161, "learning_rate": 
4.993005036373811e-07, "loss": 0.1142, "step": 64450 }, { "epoch": 1.5029757220716389, "grad_norm": 1.7590478658676147, "learning_rate": 4.992227818193123e-07, "loss": 0.1064, "step": 64460 }, { "epoch": 1.5032088834484567, "grad_norm": 1.703138828277588, "learning_rate": 4.991450600012435e-07, "loss": 0.1042, "step": 64470 }, { "epoch": 1.5034420448252748, "grad_norm": 3.0211801528930664, "learning_rate": 4.990673381831748e-07, "loss": 0.1114, "step": 64480 }, { "epoch": 1.5036752062020926, "grad_norm": 1.3219152688980103, "learning_rate": 4.98989616365106e-07, "loss": 0.1098, "step": 64490 }, { "epoch": 1.5039083675789104, "grad_norm": 1.972762107849121, "learning_rate": 4.989118945470372e-07, "loss": 0.1018, "step": 64500 }, { "epoch": 1.5041415289557285, "grad_norm": 1.1094145774841309, "learning_rate": 4.988341727289685e-07, "loss": 0.0934, "step": 64510 }, { "epoch": 1.5043746903325466, "grad_norm": 1.5540351867675781, "learning_rate": 4.987564509108996e-07, "loss": 0.1141, "step": 64520 }, { "epoch": 1.5046078517093644, "grad_norm": 2.512673854827881, "learning_rate": 4.986787290928309e-07, "loss": 0.0988, "step": 64530 }, { "epoch": 1.5048410130861822, "grad_norm": 1.6728171110153198, "learning_rate": 4.986010072747622e-07, "loss": 0.1034, "step": 64540 }, { "epoch": 1.505074174463, "grad_norm": 1.9946235418319702, "learning_rate": 4.985232854566933e-07, "loss": 0.1158, "step": 64550 }, { "epoch": 1.5053073358398181, "grad_norm": 1.5981571674346924, "learning_rate": 4.984455636386246e-07, "loss": 0.1016, "step": 64560 }, { "epoch": 1.5055404972166362, "grad_norm": 0.9806923270225525, "learning_rate": 4.983678418205558e-07, "loss": 0.117, "step": 64570 }, { "epoch": 1.505773658593454, "grad_norm": 1.6372056007385254, "learning_rate": 4.982901200024871e-07, "loss": 0.0941, "step": 64580 }, { "epoch": 1.5060068199702719, "grad_norm": 1.6988391876220703, "learning_rate": 4.982123981844183e-07, "loss": 0.1098, "step": 64590 }, { "epoch": 1.5062399813470897, 
"grad_norm": 3.3176109790802, "learning_rate": 4.981346763663495e-07, "loss": 0.0959, "step": 64600 }, { "epoch": 1.5064731427239078, "grad_norm": 2.1550817489624023, "learning_rate": 4.980569545482808e-07, "loss": 0.105, "step": 64610 }, { "epoch": 1.5067063041007258, "grad_norm": 1.6543580293655396, "learning_rate": 4.97979232730212e-07, "loss": 0.1107, "step": 64620 }, { "epoch": 1.5069394654775436, "grad_norm": 2.632709264755249, "learning_rate": 4.979015109121432e-07, "loss": 0.1094, "step": 64630 }, { "epoch": 1.5071726268543615, "grad_norm": 1.163020133972168, "learning_rate": 4.978237890940745e-07, "loss": 0.1044, "step": 64640 }, { "epoch": 1.5074057882311795, "grad_norm": 2.556621551513672, "learning_rate": 4.977460672760057e-07, "loss": 0.1216, "step": 64650 }, { "epoch": 1.5076389496079974, "grad_norm": 1.41133451461792, "learning_rate": 4.97668345457937e-07, "loss": 0.0962, "step": 64660 }, { "epoch": 1.5078721109848154, "grad_norm": 1.4684598445892334, "learning_rate": 4.975906236398682e-07, "loss": 0.1141, "step": 64670 }, { "epoch": 1.5081052723616333, "grad_norm": 1.388434886932373, "learning_rate": 4.975129018217994e-07, "loss": 0.1137, "step": 64680 }, { "epoch": 1.5083384337384511, "grad_norm": 3.021927833557129, "learning_rate": 4.974351800037307e-07, "loss": 0.1075, "step": 64690 }, { "epoch": 1.5085715951152692, "grad_norm": 1.1300586462020874, "learning_rate": 4.973574581856618e-07, "loss": 0.1128, "step": 64700 }, { "epoch": 1.5088047564920872, "grad_norm": 2.9860963821411133, "learning_rate": 4.972797363675931e-07, "loss": 0.1099, "step": 64710 }, { "epoch": 1.509037917868905, "grad_norm": 1.3505405187606812, "learning_rate": 4.972020145495244e-07, "loss": 0.1074, "step": 64720 }, { "epoch": 1.509271079245723, "grad_norm": 1.4656338691711426, "learning_rate": 4.971242927314555e-07, "loss": 0.1076, "step": 64730 }, { "epoch": 1.5095042406225407, "grad_norm": 2.444610595703125, "learning_rate": 4.970465709133868e-07, "loss": 0.1082, "step": 
64740 }, { "epoch": 1.5097374019993588, "grad_norm": 1.30136239528656, "learning_rate": 4.96968849095318e-07, "loss": 0.122, "step": 64750 }, { "epoch": 1.5099705633761769, "grad_norm": 1.4941169023513794, "learning_rate": 4.968911272772492e-07, "loss": 0.1091, "step": 64760 }, { "epoch": 1.5102037247529947, "grad_norm": 2.4517364501953125, "learning_rate": 4.968134054591805e-07, "loss": 0.1074, "step": 64770 }, { "epoch": 1.5104368861298125, "grad_norm": 1.5121018886566162, "learning_rate": 4.967356836411117e-07, "loss": 0.104, "step": 64780 }, { "epoch": 1.5106700475066304, "grad_norm": 1.409812092781067, "learning_rate": 4.966579618230429e-07, "loss": 0.1162, "step": 64790 }, { "epoch": 1.5109032088834484, "grad_norm": 2.652698516845703, "learning_rate": 4.965802400049741e-07, "loss": 0.124, "step": 64800 }, { "epoch": 1.5111363702602665, "grad_norm": 1.412421703338623, "learning_rate": 4.965025181869054e-07, "loss": 0.1068, "step": 64810 }, { "epoch": 1.5113695316370843, "grad_norm": 1.1395230293273926, "learning_rate": 4.964247963688367e-07, "loss": 0.1055, "step": 64820 }, { "epoch": 1.5116026930139022, "grad_norm": 1.3536796569824219, "learning_rate": 4.963470745507678e-07, "loss": 0.1112, "step": 64830 }, { "epoch": 1.5118358543907202, "grad_norm": 1.5692218542099, "learning_rate": 4.962693527326991e-07, "loss": 0.1121, "step": 64840 }, { "epoch": 1.5120690157675383, "grad_norm": 1.3752654790878296, "learning_rate": 4.961916309146303e-07, "loss": 0.1143, "step": 64850 }, { "epoch": 1.512302177144356, "grad_norm": 1.0586739778518677, "learning_rate": 4.961139090965616e-07, "loss": 0.1275, "step": 64860 }, { "epoch": 1.512535338521174, "grad_norm": 1.4903501272201538, "learning_rate": 4.960361872784928e-07, "loss": 0.1138, "step": 64870 }, { "epoch": 1.5127684998979918, "grad_norm": 1.5530400276184082, "learning_rate": 4.95958465460424e-07, "loss": 0.1136, "step": 64880 }, { "epoch": 1.5130016612748098, "grad_norm": 1.667760968208313, "learning_rate": 
4.958807436423553e-07, "loss": 0.1016, "step": 64890 }, { "epoch": 1.513234822651628, "grad_norm": 1.4813865423202515, "learning_rate": 4.958030218242865e-07, "loss": 0.1072, "step": 64900 }, { "epoch": 1.5134679840284457, "grad_norm": 1.4996191263198853, "learning_rate": 4.957253000062177e-07, "loss": 0.0994, "step": 64910 }, { "epoch": 1.5137011454052636, "grad_norm": 2.281770944595337, "learning_rate": 4.95647578188149e-07, "loss": 0.106, "step": 64920 }, { "epoch": 1.5139343067820814, "grad_norm": 1.329086422920227, "learning_rate": 4.955698563700802e-07, "loss": 0.1224, "step": 64930 }, { "epoch": 1.5141674681588995, "grad_norm": 1.4091076850891113, "learning_rate": 4.954921345520115e-07, "loss": 0.1079, "step": 64940 }, { "epoch": 1.5144006295357175, "grad_norm": 1.4963114261627197, "learning_rate": 4.954144127339426e-07, "loss": 0.107, "step": 64950 }, { "epoch": 1.5146337909125354, "grad_norm": 4.1671247482299805, "learning_rate": 4.953366909158739e-07, "loss": 0.1162, "step": 64960 }, { "epoch": 1.5148669522893532, "grad_norm": 1.188429832458496, "learning_rate": 4.952589690978052e-07, "loss": 0.1157, "step": 64970 }, { "epoch": 1.515100113666171, "grad_norm": 1.3195935487747192, "learning_rate": 4.951812472797363e-07, "loss": 0.1046, "step": 64980 }, { "epoch": 1.515333275042989, "grad_norm": 1.8264813423156738, "learning_rate": 4.951035254616676e-07, "loss": 0.1118, "step": 64990 }, { "epoch": 1.5155664364198072, "grad_norm": 1.9841797351837158, "learning_rate": 4.950258036435988e-07, "loss": 0.1266, "step": 65000 }, { "epoch": 1.515799597796625, "grad_norm": 1.5309590101242065, "learning_rate": 4.9494808182553e-07, "loss": 0.1083, "step": 65010 }, { "epoch": 1.5160327591734428, "grad_norm": 2.965500831604004, "learning_rate": 4.948703600074613e-07, "loss": 0.1049, "step": 65020 }, { "epoch": 1.5162659205502609, "grad_norm": 2.4356842041015625, "learning_rate": 4.947926381893925e-07, "loss": 0.1246, "step": 65030 }, { "epoch": 1.516499081927079, 
"grad_norm": 1.5507696866989136, "learning_rate": 4.947149163713237e-07, "loss": 0.1123, "step": 65040 }, { "epoch": 1.5167322433038968, "grad_norm": 1.8765500783920288, "learning_rate": 4.94637194553255e-07, "loss": 0.1078, "step": 65050 }, { "epoch": 1.5169654046807146, "grad_norm": 1.3602396249771118, "learning_rate": 4.945594727351862e-07, "loss": 0.108, "step": 65060 }, { "epoch": 1.5171985660575324, "grad_norm": 2.7477922439575195, "learning_rate": 4.944817509171174e-07, "loss": 0.1062, "step": 65070 }, { "epoch": 1.5174317274343505, "grad_norm": 1.5354682207107544, "learning_rate": 4.944040290990486e-07, "loss": 0.1006, "step": 65080 }, { "epoch": 1.5176648888111686, "grad_norm": 1.5925713777542114, "learning_rate": 4.943263072809799e-07, "loss": 0.1052, "step": 65090 }, { "epoch": 1.5178980501879864, "grad_norm": 1.0118296146392822, "learning_rate": 4.942485854629112e-07, "loss": 0.1186, "step": 65100 }, { "epoch": 1.5181312115648042, "grad_norm": 2.4551966190338135, "learning_rate": 4.941708636448423e-07, "loss": 0.11, "step": 65110 }, { "epoch": 1.518364372941622, "grad_norm": 2.027878761291504, "learning_rate": 4.940931418267736e-07, "loss": 0.1072, "step": 65120 }, { "epoch": 1.5185975343184401, "grad_norm": 1.3287447690963745, "learning_rate": 4.940154200087048e-07, "loss": 0.1058, "step": 65130 }, { "epoch": 1.5188306956952582, "grad_norm": 1.4381264448165894, "learning_rate": 4.939376981906361e-07, "loss": 0.1102, "step": 65140 }, { "epoch": 1.519063857072076, "grad_norm": 3.23468279838562, "learning_rate": 4.938599763725673e-07, "loss": 0.1106, "step": 65150 }, { "epoch": 1.5192970184488939, "grad_norm": 4.8955888748168945, "learning_rate": 4.937822545544985e-07, "loss": 0.0994, "step": 65160 }, { "epoch": 1.519530179825712, "grad_norm": 2.792492151260376, "learning_rate": 4.937045327364298e-07, "loss": 0.114, "step": 65170 }, { "epoch": 1.5197633412025298, "grad_norm": 1.5079623460769653, "learning_rate": 4.93626810918361e-07, "loss": 0.1173, 
"step": 65180 }, { "epoch": 1.5199965025793478, "grad_norm": 1.4430456161499023, "learning_rate": 4.935490891002922e-07, "loss": 0.1064, "step": 65190 }, { "epoch": 1.5202296639561657, "grad_norm": 1.3500449657440186, "learning_rate": 4.934713672822235e-07, "loss": 0.1209, "step": 65200 }, { "epoch": 1.5204628253329835, "grad_norm": 2.3630964756011963, "learning_rate": 4.933936454641547e-07, "loss": 0.0987, "step": 65210 }, { "epoch": 1.5206959867098016, "grad_norm": 2.648045063018799, "learning_rate": 4.93315923646086e-07, "loss": 0.1085, "step": 65220 }, { "epoch": 1.5209291480866196, "grad_norm": 1.2467917203903198, "learning_rate": 4.932382018280171e-07, "loss": 0.1054, "step": 65230 }, { "epoch": 1.5211623094634374, "grad_norm": 1.8949023485183716, "learning_rate": 4.931604800099484e-07, "loss": 0.0956, "step": 65240 }, { "epoch": 1.5213954708402553, "grad_norm": 2.6411476135253906, "learning_rate": 4.930827581918797e-07, "loss": 0.1172, "step": 65250 }, { "epoch": 1.5216286322170731, "grad_norm": 1.396999478340149, "learning_rate": 4.930050363738108e-07, "loss": 0.1293, "step": 65260 }, { "epoch": 1.5218617935938912, "grad_norm": 3.316610336303711, "learning_rate": 4.929273145557421e-07, "loss": 0.1112, "step": 65270 }, { "epoch": 1.5220949549707092, "grad_norm": 1.2001060247421265, "learning_rate": 4.928495927376733e-07, "loss": 0.1172, "step": 65280 }, { "epoch": 1.522328116347527, "grad_norm": 1.1601252555847168, "learning_rate": 4.927718709196045e-07, "loss": 0.1031, "step": 65290 }, { "epoch": 1.522561277724345, "grad_norm": 1.333698034286499, "learning_rate": 4.926941491015358e-07, "loss": 0.1057, "step": 65300 }, { "epoch": 1.5227944391011627, "grad_norm": 2.7214832305908203, "learning_rate": 4.92616427283467e-07, "loss": 0.1161, "step": 65310 }, { "epoch": 1.5230276004779808, "grad_norm": 1.3678687810897827, "learning_rate": 4.925387054653982e-07, "loss": 0.1044, "step": 65320 }, { "epoch": 1.5232607618547989, "grad_norm": 1.2517294883728027, 
"learning_rate": 4.924609836473294e-07, "loss": 0.1047, "step": 65330 }, { "epoch": 1.5234939232316167, "grad_norm": 2.9821743965148926, "learning_rate": 4.923832618292607e-07, "loss": 0.1065, "step": 65340 }, { "epoch": 1.5237270846084345, "grad_norm": 1.2474395036697388, "learning_rate": 4.923055400111919e-07, "loss": 0.1118, "step": 65350 }, { "epoch": 1.5239602459852526, "grad_norm": 1.6103321313858032, "learning_rate": 4.922278181931231e-07, "loss": 0.1015, "step": 65360 }, { "epoch": 1.5241934073620704, "grad_norm": 1.5321877002716064, "learning_rate": 4.921500963750544e-07, "loss": 0.1066, "step": 65370 }, { "epoch": 1.5244265687388885, "grad_norm": 1.953239917755127, "learning_rate": 4.920723745569856e-07, "loss": 0.1118, "step": 65380 }, { "epoch": 1.5246597301157063, "grad_norm": 1.9511951208114624, "learning_rate": 4.919946527389168e-07, "loss": 0.1248, "step": 65390 }, { "epoch": 1.5248928914925242, "grad_norm": 3.566131353378296, "learning_rate": 4.919169309208481e-07, "loss": 0.1057, "step": 65400 }, { "epoch": 1.5251260528693422, "grad_norm": 4.007047653198242, "learning_rate": 4.918392091027793e-07, "loss": 0.1142, "step": 65410 }, { "epoch": 1.5253592142461603, "grad_norm": 1.216563105583191, "learning_rate": 4.917614872847106e-07, "loss": 0.1158, "step": 65420 }, { "epoch": 1.5255923756229781, "grad_norm": 1.6077678203582764, "learning_rate": 4.916837654666417e-07, "loss": 0.1132, "step": 65430 }, { "epoch": 1.525825536999796, "grad_norm": 1.96685791015625, "learning_rate": 4.91606043648573e-07, "loss": 0.1088, "step": 65440 }, { "epoch": 1.5260586983766138, "grad_norm": 2.9640445709228516, "learning_rate": 4.915283218305043e-07, "loss": 0.1106, "step": 65450 }, { "epoch": 1.5262918597534318, "grad_norm": 1.0827597379684448, "learning_rate": 4.914506000124355e-07, "loss": 0.1105, "step": 65460 }, { "epoch": 1.52652502113025, "grad_norm": 1.3634483814239502, "learning_rate": 4.913728781943667e-07, "loss": 0.1138, "step": 65470 }, { "epoch": 
1.5267581825070677, "grad_norm": 3.5098776817321777, "learning_rate": 4.91295156376298e-07, "loss": 0.1071, "step": 65480 }, { "epoch": 1.5269913438838856, "grad_norm": 1.341281533241272, "learning_rate": 4.912174345582292e-07, "loss": 0.1114, "step": 65490 }, { "epoch": 1.5272245052607034, "grad_norm": 1.4266352653503418, "learning_rate": 4.911397127401604e-07, "loss": 0.1221, "step": 65500 }, { "epoch": 1.5274576666375215, "grad_norm": 1.4917939901351929, "learning_rate": 4.910619909220916e-07, "loss": 0.1086, "step": 65510 }, { "epoch": 1.5276908280143395, "grad_norm": 1.1962265968322754, "learning_rate": 4.909842691040229e-07, "loss": 0.1149, "step": 65520 }, { "epoch": 1.5279239893911574, "grad_norm": 1.2417579889297485, "learning_rate": 4.90906547285954e-07, "loss": 0.1058, "step": 65530 }, { "epoch": 1.5281571507679752, "grad_norm": 1.9589005708694458, "learning_rate": 4.908288254678853e-07, "loss": 0.1249, "step": 65540 }, { "epoch": 1.5283903121447933, "grad_norm": 1.6138023138046265, "learning_rate": 4.907511036498166e-07, "loss": 0.1039, "step": 65550 }, { "epoch": 1.528623473521611, "grad_norm": 1.1300811767578125, "learning_rate": 4.906733818317477e-07, "loss": 0.1124, "step": 65560 }, { "epoch": 1.5288566348984292, "grad_norm": 1.318924903869629, "learning_rate": 4.90595660013679e-07, "loss": 0.1047, "step": 65570 }, { "epoch": 1.529089796275247, "grad_norm": 3.4989991188049316, "learning_rate": 4.905179381956103e-07, "loss": 0.1105, "step": 65580 }, { "epoch": 1.5293229576520648, "grad_norm": 1.322418451309204, "learning_rate": 4.904402163775414e-07, "loss": 0.1238, "step": 65590 }, { "epoch": 1.5295561190288829, "grad_norm": null, "learning_rate": 4.903702667412795e-07, "loss": 0.103, "step": 65600 }, { "epoch": 1.529789280405701, "grad_norm": 1.9217867851257324, "learning_rate": 4.902925449232108e-07, "loss": 0.108, "step": 65610 }, { "epoch": 1.5300224417825188, "grad_norm": 1.020045518875122, "learning_rate": 4.902148231051421e-07, "loss": 
0.1058, "step": 65620 }, { "epoch": 1.5302556031593366, "grad_norm": 2.3327906131744385, "learning_rate": 4.901371012870733e-07, "loss": 0.1148, "step": 65630 }, { "epoch": 1.5304887645361545, "grad_norm": 2.697277545928955, "learning_rate": 4.900593794690045e-07, "loss": 0.0984, "step": 65640 }, { "epoch": 1.5307219259129725, "grad_norm": 1.830256700515747, "learning_rate": 4.899816576509357e-07, "loss": 0.1109, "step": 65650 }, { "epoch": 1.5309550872897906, "grad_norm": 1.214893102645874, "learning_rate": 4.89903935832867e-07, "loss": 0.1058, "step": 65660 }, { "epoch": 1.5311882486666084, "grad_norm": 1.9385594129562378, "learning_rate": 4.898262140147983e-07, "loss": 0.1087, "step": 65670 }, { "epoch": 1.5314214100434262, "grad_norm": 4.644259452819824, "learning_rate": 4.897484921967294e-07, "loss": 0.1117, "step": 65680 }, { "epoch": 1.531654571420244, "grad_norm": 1.4005309343338013, "learning_rate": 4.896707703786607e-07, "loss": 0.1128, "step": 65690 }, { "epoch": 1.5318877327970621, "grad_norm": 1.0705156326293945, "learning_rate": 4.895930485605919e-07, "loss": 0.1094, "step": 65700 }, { "epoch": 1.5321208941738802, "grad_norm": 1.297295331954956, "learning_rate": 4.895153267425232e-07, "loss": 0.1057, "step": 65710 }, { "epoch": 1.532354055550698, "grad_norm": 1.7028508186340332, "learning_rate": 4.894376049244544e-07, "loss": 0.1101, "step": 65720 }, { "epoch": 1.5325872169275159, "grad_norm": 1.2443760633468628, "learning_rate": 4.893598831063856e-07, "loss": 0.1004, "step": 65730 }, { "epoch": 1.532820378304334, "grad_norm": 3.068061351776123, "learning_rate": 4.892821612883169e-07, "loss": 0.111, "step": 65740 }, { "epoch": 1.5330535396811518, "grad_norm": 2.2441022396087646, "learning_rate": 4.89204439470248e-07, "loss": 0.1171, "step": 65750 }, { "epoch": 1.5332867010579698, "grad_norm": 1.8159384727478027, "learning_rate": 4.891267176521793e-07, "loss": 0.1081, "step": 65760 }, { "epoch": 1.5335198624347877, "grad_norm": 1.6112552881240845, 
"learning_rate": 4.890489958341106e-07, "loss": 0.1267, "step": 65770 }, { "epoch": 1.5337530238116055, "grad_norm": 2.687253952026367, "learning_rate": 4.889712740160417e-07, "loss": 0.0971, "step": 65780 }, { "epoch": 1.5339861851884236, "grad_norm": 1.4844287633895874, "learning_rate": 4.88893552197973e-07, "loss": 0.1016, "step": 65790 }, { "epoch": 1.5342193465652416, "grad_norm": 1.7080540657043457, "learning_rate": 4.888158303799042e-07, "loss": 0.1035, "step": 65800 }, { "epoch": 1.5344525079420595, "grad_norm": 1.707552433013916, "learning_rate": 4.887381085618354e-07, "loss": 0.1121, "step": 65810 }, { "epoch": 1.5346856693188773, "grad_norm": 1.6571040153503418, "learning_rate": 4.886603867437667e-07, "loss": 0.1109, "step": 65820 }, { "epoch": 1.5349188306956951, "grad_norm": 1.3060053586959839, "learning_rate": 4.885826649256979e-07, "loss": 0.1152, "step": 65830 }, { "epoch": 1.5351519920725132, "grad_norm": 3.443014621734619, "learning_rate": 4.885049431076291e-07, "loss": 0.1027, "step": 65840 }, { "epoch": 1.5353851534493312, "grad_norm": 1.5373404026031494, "learning_rate": 4.884272212895603e-07, "loss": 0.1055, "step": 65850 }, { "epoch": 1.535618314826149, "grad_norm": 3.9152374267578125, "learning_rate": 4.883494994714916e-07, "loss": 0.1119, "step": 65860 }, { "epoch": 1.535851476202967, "grad_norm": 1.5682859420776367, "learning_rate": 4.882717776534229e-07, "loss": 0.1051, "step": 65870 }, { "epoch": 1.5360846375797848, "grad_norm": 1.3977845907211304, "learning_rate": 4.88194055835354e-07, "loss": 0.1264, "step": 65880 }, { "epoch": 1.5363177989566028, "grad_norm": 2.0536184310913086, "learning_rate": 4.881163340172853e-07, "loss": 0.1132, "step": 65890 }, { "epoch": 1.5365509603334209, "grad_norm": 2.136014223098755, "learning_rate": 4.880386121992166e-07, "loss": 0.1141, "step": 65900 }, { "epoch": 1.5367841217102387, "grad_norm": 1.9335486888885498, "learning_rate": 4.879608903811478e-07, "loss": 0.1145, "step": 65910 }, { "epoch": 
1.5370172830870565, "grad_norm": 1.6037651300430298, "learning_rate": 4.87883168563079e-07, "loss": 0.1033, "step": 65920 }, { "epoch": 1.5372504444638746, "grad_norm": 1.3674119710922241, "learning_rate": 4.878054467450102e-07, "loss": 0.1055, "step": 65930 }, { "epoch": 1.5374836058406924, "grad_norm": 1.5874524116516113, "learning_rate": 4.877277249269415e-07, "loss": 0.1142, "step": 65940 }, { "epoch": 1.5377167672175105, "grad_norm": 1.4305881261825562, "learning_rate": 4.876500031088728e-07, "loss": 0.1119, "step": 65950 }, { "epoch": 1.5379499285943283, "grad_norm": 3.0098705291748047, "learning_rate": 4.875722812908039e-07, "loss": 0.1132, "step": 65960 }, { "epoch": 1.5381830899711462, "grad_norm": 2.000351905822754, "learning_rate": 4.874945594727352e-07, "loss": 0.1068, "step": 65970 }, { "epoch": 1.5384162513479642, "grad_norm": 2.3560426235198975, "learning_rate": 4.874168376546664e-07, "loss": 0.1029, "step": 65980 }, { "epoch": 1.5386494127247823, "grad_norm": 1.5077922344207764, "learning_rate": 4.873391158365976e-07, "loss": 0.107, "step": 65990 }, { "epoch": 1.5388825741016001, "grad_norm": 1.201866865158081, "learning_rate": 4.872613940185289e-07, "loss": 0.1089, "step": 66000 }, { "epoch": 1.539115735478418, "grad_norm": 1.4615085124969482, "learning_rate": 4.871836722004601e-07, "loss": 0.1126, "step": 66010 }, { "epoch": 1.5393488968552358, "grad_norm": 2.1532249450683594, "learning_rate": 4.871059503823914e-07, "loss": 0.0984, "step": 66020 }, { "epoch": 1.5395820582320539, "grad_norm": 1.735585331916809, "learning_rate": 4.870282285643225e-07, "loss": 0.1053, "step": 66030 }, { "epoch": 1.539815219608872, "grad_norm": 2.1930124759674072, "learning_rate": 4.869505067462538e-07, "loss": 0.1117, "step": 66040 }, { "epoch": 1.5400483809856897, "grad_norm": 4.106577396392822, "learning_rate": 4.868727849281851e-07, "loss": 0.1155, "step": 66050 }, { "epoch": 1.5402815423625076, "grad_norm": 1.3263996839523315, "learning_rate": 
4.867950631101162e-07, "loss": 0.114, "step": 66060 }, { "epoch": 1.5405147037393254, "grad_norm": 0.9262128472328186, "learning_rate": 4.867173412920475e-07, "loss": 0.1023, "step": 66070 }, { "epoch": 1.5407478651161435, "grad_norm": 1.1499289274215698, "learning_rate": 4.866396194739787e-07, "loss": 0.108, "step": 66080 }, { "epoch": 1.5409810264929615, "grad_norm": 1.1579158306121826, "learning_rate": 4.865618976559099e-07, "loss": 0.1141, "step": 66090 }, { "epoch": 1.5412141878697794, "grad_norm": 1.7086281776428223, "learning_rate": 4.864841758378412e-07, "loss": 0.1077, "step": 66100 }, { "epoch": 1.5414473492465972, "grad_norm": 1.0310617685317993, "learning_rate": 4.864064540197724e-07, "loss": 0.1067, "step": 66110 }, { "epoch": 1.5416805106234153, "grad_norm": 2.0639891624450684, "learning_rate": 4.863287322017036e-07, "loss": 0.1089, "step": 66120 }, { "epoch": 1.5419136720002333, "grad_norm": 2.6034703254699707, "learning_rate": 4.862510103836348e-07, "loss": 0.0982, "step": 66130 }, { "epoch": 1.5421468333770512, "grad_norm": 1.613176703453064, "learning_rate": 4.861732885655661e-07, "loss": 0.1035, "step": 66140 }, { "epoch": 1.542379994753869, "grad_norm": 1.3181579113006592, "learning_rate": 4.860955667474974e-07, "loss": 0.105, "step": 66150 }, { "epoch": 1.5426131561306868, "grad_norm": 1.4739004373550415, "learning_rate": 4.860178449294285e-07, "loss": 0.1154, "step": 66160 }, { "epoch": 1.542846317507505, "grad_norm": 1.4711315631866455, "learning_rate": 4.859401231113598e-07, "loss": 0.1033, "step": 66170 }, { "epoch": 1.543079478884323, "grad_norm": 1.1687562465667725, "learning_rate": 4.85862401293291e-07, "loss": 0.1013, "step": 66180 }, { "epoch": 1.5433126402611408, "grad_norm": 1.579971432685852, "learning_rate": 4.857846794752223e-07, "loss": 0.1211, "step": 66190 }, { "epoch": 1.5435458016379586, "grad_norm": 2.8739356994628906, "learning_rate": 4.857069576571535e-07, "loss": 0.1188, "step": 66200 }, { "epoch": 1.5437789630147765, 
"grad_norm": 1.7079346179962158, "learning_rate": 4.856292358390847e-07, "loss": 0.1087, "step": 66210 }, { "epoch": 1.5440121243915945, "grad_norm": 1.8078702688217163, "learning_rate": 4.85551514021016e-07, "loss": 0.1044, "step": 66220 }, { "epoch": 1.5442452857684126, "grad_norm": 1.4032307863235474, "learning_rate": 4.854737922029473e-07, "loss": 0.1037, "step": 66230 }, { "epoch": 1.5444784471452304, "grad_norm": 1.7845381498336792, "learning_rate": 4.853960703848784e-07, "loss": 0.0988, "step": 66240 }, { "epoch": 1.5447116085220483, "grad_norm": 5.051512718200684, "learning_rate": 4.853183485668097e-07, "loss": 0.1063, "step": 66250 }, { "epoch": 1.544944769898866, "grad_norm": 1.1552051305770874, "learning_rate": 4.852406267487409e-07, "loss": 0.0973, "step": 66260 }, { "epoch": 1.5451779312756841, "grad_norm": 1.522037386894226, "learning_rate": 4.851629049306721e-07, "loss": 0.1133, "step": 66270 }, { "epoch": 1.5454110926525022, "grad_norm": 1.9192315340042114, "learning_rate": 4.850851831126034e-07, "loss": 0.1072, "step": 66280 }, { "epoch": 1.54564425402932, "grad_norm": 1.2495183944702148, "learning_rate": 4.850074612945346e-07, "loss": 0.1028, "step": 66290 }, { "epoch": 1.5458774154061379, "grad_norm": 1.4001480340957642, "learning_rate": 4.849297394764658e-07, "loss": 0.1146, "step": 66300 }, { "epoch": 1.546110576782956, "grad_norm": 2.1853084564208984, "learning_rate": 4.84852017658397e-07, "loss": 0.1037, "step": 66310 }, { "epoch": 1.546343738159774, "grad_norm": 1.3618228435516357, "learning_rate": 4.847742958403283e-07, "loss": 0.1127, "step": 66320 }, { "epoch": 1.5465768995365918, "grad_norm": 1.9803727865219116, "learning_rate": 4.846965740222594e-07, "loss": 0.1114, "step": 66330 }, { "epoch": 1.5468100609134097, "grad_norm": 3.803490161895752, "learning_rate": 4.846188522041907e-07, "loss": 0.1187, "step": 66340 }, { "epoch": 1.5470432222902275, "grad_norm": 1.533204197883606, "learning_rate": 4.84541130386122e-07, "loss": 0.1172, 
"step": 66350 }, { "epoch": 1.5472763836670456, "grad_norm": 1.933903694152832, "learning_rate": 4.844634085680531e-07, "loss": 0.1064, "step": 66360 }, { "epoch": 1.5475095450438636, "grad_norm": 1.428910732269287, "learning_rate": 4.843856867499844e-07, "loss": 0.101, "step": 66370 }, { "epoch": 1.5477427064206815, "grad_norm": 1.8688197135925293, "learning_rate": 4.843079649319157e-07, "loss": 0.1085, "step": 66380 }, { "epoch": 1.5479758677974993, "grad_norm": 3.1855876445770264, "learning_rate": 4.842302431138469e-07, "loss": 0.1214, "step": 66390 }, { "epoch": 1.5482090291743171, "grad_norm": 1.6824231147766113, "learning_rate": 4.841525212957781e-07, "loss": 0.1093, "step": 66400 }, { "epoch": 1.5484421905511352, "grad_norm": 1.5172778367996216, "learning_rate": 4.840747994777093e-07, "loss": 0.1046, "step": 66410 }, { "epoch": 1.5486753519279532, "grad_norm": 4.936899185180664, "learning_rate": 4.839970776596406e-07, "loss": 0.114, "step": 66420 }, { "epoch": 1.548908513304771, "grad_norm": 2.4485397338867188, "learning_rate": 4.839193558415719e-07, "loss": 0.1097, "step": 66430 }, { "epoch": 1.549141674681589, "grad_norm": 1.747002363204956, "learning_rate": 4.83841634023503e-07, "loss": 0.1111, "step": 66440 }, { "epoch": 1.549374836058407, "grad_norm": 2.05770206451416, "learning_rate": 4.837639122054343e-07, "loss": 0.1113, "step": 66450 }, { "epoch": 1.5496079974352248, "grad_norm": 1.4731457233428955, "learning_rate": 4.836861903873655e-07, "loss": 0.1114, "step": 66460 }, { "epoch": 1.5498411588120429, "grad_norm": 1.6100928783416748, "learning_rate": 4.836084685692968e-07, "loss": 0.1062, "step": 66470 }, { "epoch": 1.5500743201888607, "grad_norm": 1.520995855331421, "learning_rate": 4.83530746751228e-07, "loss": 0.1095, "step": 66480 }, { "epoch": 1.5503074815656785, "grad_norm": 1.448103904724121, "learning_rate": 4.834530249331592e-07, "loss": 0.112, "step": 66490 }, { "epoch": 1.5505406429424966, "grad_norm": 1.7381221055984497, "learning_rate": 
4.833753031150905e-07, "loss": 0.1153, "step": 66500 }, { "epoch": 1.5507738043193147, "grad_norm": 1.1943470239639282, "learning_rate": 4.832975812970217e-07, "loss": 0.1179, "step": 66510 }, { "epoch": 1.5510069656961325, "grad_norm": 1.0455831289291382, "learning_rate": 4.832198594789529e-07, "loss": 0.1014, "step": 66520 }, { "epoch": 1.5512401270729503, "grad_norm": 1.5327270030975342, "learning_rate": 4.831421376608842e-07, "loss": 0.1109, "step": 66530 }, { "epoch": 1.5514732884497682, "grad_norm": 1.694098949432373, "learning_rate": 4.830644158428154e-07, "loss": 0.1094, "step": 66540 }, { "epoch": 1.5517064498265862, "grad_norm": 1.3939590454101562, "learning_rate": 4.829866940247466e-07, "loss": 0.1072, "step": 66550 }, { "epoch": 1.5519396112034043, "grad_norm": 3.5734715461730957, "learning_rate": 4.829089722066778e-07, "loss": 0.1254, "step": 66560 }, { "epoch": 1.5521727725802221, "grad_norm": 1.1845754384994507, "learning_rate": 4.828312503886091e-07, "loss": 0.1085, "step": 66570 }, { "epoch": 1.55240593395704, "grad_norm": 1.2960994243621826, "learning_rate": 4.827535285705403e-07, "loss": 0.1213, "step": 66580 }, { "epoch": 1.5526390953338578, "grad_norm": 2.0277929306030273, "learning_rate": 4.826758067524715e-07, "loss": 0.1154, "step": 66590 }, { "epoch": 1.5528722567106759, "grad_norm": 2.237910270690918, "learning_rate": 4.825980849344028e-07, "loss": 0.1055, "step": 66600 }, { "epoch": 1.553105418087494, "grad_norm": 1.7156234979629517, "learning_rate": 4.825203631163339e-07, "loss": 0.0992, "step": 66610 }, { "epoch": 1.5533385794643118, "grad_norm": 1.8060063123703003, "learning_rate": 4.824426412982652e-07, "loss": 0.0957, "step": 66620 }, { "epoch": 1.5535717408411296, "grad_norm": 1.8545781373977661, "learning_rate": 4.823649194801965e-07, "loss": 0.1003, "step": 66630 }, { "epoch": 1.5538049022179476, "grad_norm": 1.5918302536010742, "learning_rate": 4.822871976621276e-07, "loss": 0.1166, "step": 66640 }, { "epoch": 1.5540380635947655, 
"grad_norm": 1.4655803442001343, "learning_rate": 4.822094758440589e-07, "loss": 0.1052, "step": 66650 }, { "epoch": 1.5542712249715835, "grad_norm": 2.228459119796753, "learning_rate": 4.821317540259901e-07, "loss": 0.1117, "step": 66660 }, { "epoch": 1.5545043863484014, "grad_norm": 1.3074594736099243, "learning_rate": 4.820540322079214e-07, "loss": 0.1156, "step": 66670 }, { "epoch": 1.5547375477252192, "grad_norm": 2.5138401985168457, "learning_rate": 4.819763103898526e-07, "loss": 0.1182, "step": 66680 }, { "epoch": 1.5549707091020373, "grad_norm": 1.1380141973495483, "learning_rate": 4.818985885717838e-07, "loss": 0.1066, "step": 66690 }, { "epoch": 1.5552038704788553, "grad_norm": 4.124020099639893, "learning_rate": 4.818208667537151e-07, "loss": 0.1006, "step": 66700 }, { "epoch": 1.5554370318556732, "grad_norm": 1.1282342672348022, "learning_rate": 4.817431449356464e-07, "loss": 0.1159, "step": 66710 }, { "epoch": 1.555670193232491, "grad_norm": 1.401216983795166, "learning_rate": 4.816654231175775e-07, "loss": 0.0954, "step": 66720 }, { "epoch": 1.5559033546093088, "grad_norm": 1.3435635566711426, "learning_rate": 4.815877012995088e-07, "loss": 0.1188, "step": 66730 }, { "epoch": 1.556136515986127, "grad_norm": 1.2904821634292603, "learning_rate": 4.8150997948144e-07, "loss": 0.1116, "step": 66740 }, { "epoch": 1.556369677362945, "grad_norm": 1.6365461349487305, "learning_rate": 4.814322576633713e-07, "loss": 0.1034, "step": 66750 }, { "epoch": 1.5566028387397628, "grad_norm": 2.163837432861328, "learning_rate": 4.813545358453025e-07, "loss": 0.1061, "step": 66760 }, { "epoch": 1.5568360001165806, "grad_norm": 2.2743077278137207, "learning_rate": 4.812768140272337e-07, "loss": 0.1122, "step": 66770 }, { "epoch": 1.5570691614933985, "grad_norm": 2.906503677368164, "learning_rate": 4.81199092209165e-07, "loss": 0.1008, "step": 66780 }, { "epoch": 1.5573023228702165, "grad_norm": 1.686531662940979, "learning_rate": 4.811213703910961e-07, "loss": 0.0985, 
"step": 66790 }, { "epoch": 1.5575354842470346, "grad_norm": 1.495732069015503, "learning_rate": 4.810436485730274e-07, "loss": 0.1091, "step": 66800 }, { "epoch": 1.5577686456238524, "grad_norm": 1.3162951469421387, "learning_rate": 4.809659267549587e-07, "loss": 0.1048, "step": 66810 }, { "epoch": 1.5580018070006703, "grad_norm": 1.678838849067688, "learning_rate": 4.808882049368898e-07, "loss": 0.1048, "step": 66820 }, { "epoch": 1.5582349683774883, "grad_norm": 2.1502468585968018, "learning_rate": 4.808104831188211e-07, "loss": 0.1061, "step": 66830 }, { "epoch": 1.5584681297543062, "grad_norm": 3.3500094413757324, "learning_rate": 4.807327613007523e-07, "loss": 0.1078, "step": 66840 }, { "epoch": 1.5587012911311242, "grad_norm": 2.0276825428009033, "learning_rate": 4.806550394826835e-07, "loss": 0.1077, "step": 66850 }, { "epoch": 1.558934452507942, "grad_norm": 1.379044532775879, "learning_rate": 4.805773176646148e-07, "loss": 0.1093, "step": 66860 }, { "epoch": 1.5591676138847599, "grad_norm": 1.8613731861114502, "learning_rate": 4.80499595846546e-07, "loss": 0.1153, "step": 66870 }, { "epoch": 1.559400775261578, "grad_norm": 3.3973987102508545, "learning_rate": 4.804218740284772e-07, "loss": 0.1126, "step": 66880 }, { "epoch": 1.559633936638396, "grad_norm": 2.9745514392852783, "learning_rate": 4.803441522104084e-07, "loss": 0.112, "step": 66890 }, { "epoch": 1.5598670980152138, "grad_norm": 1.0659071207046509, "learning_rate": 4.802664303923397e-07, "loss": 0.1101, "step": 66900 }, { "epoch": 1.5601002593920317, "grad_norm": 1.106929898262024, "learning_rate": 4.80188708574271e-07, "loss": 0.1091, "step": 66910 }, { "epoch": 1.5603334207688495, "grad_norm": 1.119478702545166, "learning_rate": 4.801109867562021e-07, "loss": 0.1074, "step": 66920 }, { "epoch": 1.5605665821456676, "grad_norm": 1.6249045133590698, "learning_rate": 4.800332649381334e-07, "loss": 0.1096, "step": 66930 }, { "epoch": 1.5607997435224856, "grad_norm": 1.457614779472351, 
"learning_rate": 4.799555431200646e-07, "loss": 0.109, "step": 66940 }, { "epoch": 1.5610329048993035, "grad_norm": 1.5855046510696411, "learning_rate": 4.798778213019959e-07, "loss": 0.1048, "step": 66950 }, { "epoch": 1.5612660662761213, "grad_norm": 2.619755268096924, "learning_rate": 4.798000994839271e-07, "loss": 0.0992, "step": 66960 }, { "epoch": 1.5614992276529391, "grad_norm": 1.4403516054153442, "learning_rate": 4.797223776658583e-07, "loss": 0.1201, "step": 66970 }, { "epoch": 1.5617323890297572, "grad_norm": 1.4882580041885376, "learning_rate": 4.796446558477896e-07, "loss": 0.1044, "step": 66980 }, { "epoch": 1.5619655504065753, "grad_norm": 1.906935453414917, "learning_rate": 4.795669340297208e-07, "loss": 0.1177, "step": 66990 }, { "epoch": 1.562198711783393, "grad_norm": 3.216353416442871, "learning_rate": 4.79489212211652e-07, "loss": 0.1126, "step": 67000 }, { "epoch": 1.562431873160211, "grad_norm": 2.871863842010498, "learning_rate": 4.794114903935833e-07, "loss": 0.1075, "step": 67010 }, { "epoch": 1.562665034537029, "grad_norm": 1.8635932207107544, "learning_rate": 4.793337685755145e-07, "loss": 0.1032, "step": 67020 }, { "epoch": 1.5628981959138468, "grad_norm": 1.87334406375885, "learning_rate": 4.792560467574458e-07, "loss": 0.1166, "step": 67030 }, { "epoch": 1.5631313572906649, "grad_norm": 2.1780145168304443, "learning_rate": 4.791783249393769e-07, "loss": 0.1055, "step": 67040 }, { "epoch": 1.5633645186674827, "grad_norm": 1.820684552192688, "learning_rate": 4.791006031213082e-07, "loss": 0.1033, "step": 67050 }, { "epoch": 1.5635976800443006, "grad_norm": 1.4261062145233154, "learning_rate": 4.790228813032395e-07, "loss": 0.1027, "step": 67060 }, { "epoch": 1.5638308414211186, "grad_norm": 1.8028850555419922, "learning_rate": 4.789451594851706e-07, "loss": 0.1161, "step": 67070 }, { "epoch": 1.5640640027979367, "grad_norm": 2.1224582195281982, "learning_rate": 4.788674376671019e-07, "loss": 0.107, "step": 67080 }, { "epoch": 
1.5642971641747545, "grad_norm": 1.162418246269226, "learning_rate": 4.787897158490331e-07, "loss": 0.099, "step": 67090 }, { "epoch": 1.5645303255515723, "grad_norm": 1.7988313436508179, "learning_rate": 4.787119940309643e-07, "loss": 0.1027, "step": 67100 }, { "epoch": 1.5647634869283902, "grad_norm": 1.6678032875061035, "learning_rate": 4.786342722128956e-07, "loss": 0.0968, "step": 67110 }, { "epoch": 1.5649966483052082, "grad_norm": 1.0960841178894043, "learning_rate": 4.785565503948268e-07, "loss": 0.1044, "step": 67120 }, { "epoch": 1.5652298096820263, "grad_norm": 1.1505810022354126, "learning_rate": 4.78478828576758e-07, "loss": 0.1032, "step": 67130 }, { "epoch": 1.5654629710588441, "grad_norm": 3.657463550567627, "learning_rate": 4.784011067586892e-07, "loss": 0.1084, "step": 67140 }, { "epoch": 1.565696132435662, "grad_norm": 1.1861659288406372, "learning_rate": 4.783233849406205e-07, "loss": 0.1117, "step": 67150 }, { "epoch": 1.5659292938124798, "grad_norm": 2.3805418014526367, "learning_rate": 4.782456631225518e-07, "loss": 0.1192, "step": 67160 }, { "epoch": 1.5661624551892979, "grad_norm": 1.8697866201400757, "learning_rate": 4.781679413044829e-07, "loss": 0.1023, "step": 67170 }, { "epoch": 1.566395616566116, "grad_norm": 1.1572610139846802, "learning_rate": 4.780902194864142e-07, "loss": 0.1124, "step": 67180 }, { "epoch": 1.5666287779429338, "grad_norm": 1.362936019897461, "learning_rate": 4.780124976683455e-07, "loss": 0.1096, "step": 67190 }, { "epoch": 1.5668619393197516, "grad_norm": 2.362034559249878, "learning_rate": 4.779347758502767e-07, "loss": 0.1127, "step": 67200 }, { "epoch": 1.5670951006965697, "grad_norm": 1.8089390993118286, "learning_rate": 4.778570540322079e-07, "loss": 0.1125, "step": 67210 }, { "epoch": 1.5673282620733875, "grad_norm": 1.5409882068634033, "learning_rate": 4.777793322141391e-07, "loss": 0.1065, "step": 67220 }, { "epoch": 1.5675614234502055, "grad_norm": 2.1973354816436768, "learning_rate": 
4.777016103960704e-07, "loss": 0.1035, "step": 67230 }, { "epoch": 1.5677945848270234, "grad_norm": 1.1955245733261108, "learning_rate": 4.776238885780017e-07, "loss": 0.1028, "step": 67240 }, { "epoch": 1.5680277462038412, "grad_norm": 1.8643549680709839, "learning_rate": 4.775461667599328e-07, "loss": 0.112, "step": 67250 }, { "epoch": 1.5682609075806593, "grad_norm": 3.3813536167144775, "learning_rate": 4.774684449418641e-07, "loss": 0.1039, "step": 67260 }, { "epoch": 1.5684940689574773, "grad_norm": 1.1844143867492676, "learning_rate": 4.773907231237953e-07, "loss": 0.113, "step": 67270 }, { "epoch": 1.5687272303342952, "grad_norm": 2.009080410003662, "learning_rate": 4.773130013057266e-07, "loss": 0.1032, "step": 67280 }, { "epoch": 1.568960391711113, "grad_norm": 1.249680519104004, "learning_rate": 4.772352794876578e-07, "loss": 0.0987, "step": 67290 }, { "epoch": 1.5691935530879308, "grad_norm": 0.9175291657447815, "learning_rate": 4.77157557669589e-07, "loss": 0.102, "step": 67300 }, { "epoch": 1.569426714464749, "grad_norm": 1.288917064666748, "learning_rate": 4.770798358515203e-07, "loss": 0.1167, "step": 67310 }, { "epoch": 1.569659875841567, "grad_norm": 1.6716902256011963, "learning_rate": 4.770021140334514e-07, "loss": 0.114, "step": 67320 }, { "epoch": 1.5698930372183848, "grad_norm": 1.1896082162857056, "learning_rate": 4.769243922153827e-07, "loss": 0.117, "step": 67330 }, { "epoch": 1.5701261985952026, "grad_norm": 1.8447186946868896, "learning_rate": 4.76846670397314e-07, "loss": 0.104, "step": 67340 }, { "epoch": 1.5703593599720205, "grad_norm": 1.8198976516723633, "learning_rate": 4.7676894857924517e-07, "loss": 0.1076, "step": 67350 }, { "epoch": 1.5705925213488385, "grad_norm": 1.3495118618011475, "learning_rate": 4.766912267611764e-07, "loss": 0.1131, "step": 67360 }, { "epoch": 1.5708256827256566, "grad_norm": 1.0121235847473145, "learning_rate": 4.766135049431076e-07, "loss": 0.114, "step": 67370 }, { "epoch": 1.5710588441024744, 
"grad_norm": 1.4779115915298462, "learning_rate": 4.7653578312503885e-07, "loss": 0.1069, "step": 67380 }, { "epoch": 1.5712920054792923, "grad_norm": 1.5196740627288818, "learning_rate": 4.7645806130697006e-07, "loss": 0.107, "step": 67390 }, { "epoch": 1.5715251668561103, "grad_norm": 2.0428261756896973, "learning_rate": 4.763803394889013e-07, "loss": 0.1153, "step": 67400 }, { "epoch": 1.5717583282329284, "grad_norm": 1.175517201423645, "learning_rate": 4.7630261767083253e-07, "loss": 0.112, "step": 67410 }, { "epoch": 1.5719914896097462, "grad_norm": 1.150946021080017, "learning_rate": 4.7622489585276374e-07, "loss": 0.1116, "step": 67420 }, { "epoch": 1.572224650986564, "grad_norm": 1.247743010520935, "learning_rate": 4.76147174034695e-07, "loss": 0.1075, "step": 67430 }, { "epoch": 1.572457812363382, "grad_norm": 1.3359938859939575, "learning_rate": 4.7606945221662627e-07, "loss": 0.1031, "step": 67440 }, { "epoch": 1.5726909737402, "grad_norm": 1.1922340393066406, "learning_rate": 4.759917303985574e-07, "loss": 0.1175, "step": 67450 }, { "epoch": 1.572924135117018, "grad_norm": 1.7514218091964722, "learning_rate": 4.759140085804887e-07, "loss": 0.1003, "step": 67460 }, { "epoch": 1.5731572964938358, "grad_norm": 2.1795339584350586, "learning_rate": 4.7583628676241995e-07, "loss": 0.1114, "step": 67470 }, { "epoch": 1.5733904578706537, "grad_norm": 1.7968671321868896, "learning_rate": 4.7575856494435116e-07, "loss": 0.1058, "step": 67480 }, { "epoch": 1.5736236192474715, "grad_norm": 2.3842504024505615, "learning_rate": 4.7568084312628236e-07, "loss": 0.1156, "step": 67490 }, { "epoch": 1.5738567806242896, "grad_norm": 1.9194642305374146, "learning_rate": 4.7560312130821363e-07, "loss": 0.1052, "step": 67500 }, { "epoch": 1.5740899420011076, "grad_norm": 1.7219198942184448, "learning_rate": 4.755253994901449e-07, "loss": 0.1057, "step": 67510 }, { "epoch": 1.5743231033779255, "grad_norm": 1.6910974979400635, "learning_rate": 4.754476776720761e-07, "loss": 
0.1058, "step": 67520 }, { "epoch": 1.5745562647547433, "grad_norm": 1.111215353012085, "learning_rate": 4.753699558540073e-07, "loss": 0.0987, "step": 67530 }, { "epoch": 1.5747894261315611, "grad_norm": 1.0830451250076294, "learning_rate": 4.7529223403593857e-07, "loss": 0.1039, "step": 67540 }, { "epoch": 1.5750225875083792, "grad_norm": 2.4551103115081787, "learning_rate": 4.752145122178698e-07, "loss": 0.1149, "step": 67550 }, { "epoch": 1.5752557488851973, "grad_norm": 0.9360976815223694, "learning_rate": 4.7513679039980104e-07, "loss": 0.1021, "step": 67560 }, { "epoch": 1.575488910262015, "grad_norm": 1.5761512517929077, "learning_rate": 4.7505906858173225e-07, "loss": 0.1098, "step": 67570 }, { "epoch": 1.575722071638833, "grad_norm": 3.5059945583343506, "learning_rate": 4.7498134676366346e-07, "loss": 0.0938, "step": 67580 }, { "epoch": 1.575955233015651, "grad_norm": 2.309446096420288, "learning_rate": 4.749036249455947e-07, "loss": 0.1034, "step": 67590 }, { "epoch": 1.576188394392469, "grad_norm": 2.0291686058044434, "learning_rate": 4.74825903127526e-07, "loss": 0.1108, "step": 67600 }, { "epoch": 1.5764215557692869, "grad_norm": 1.183502197265625, "learning_rate": 4.7475595349126406e-07, "loss": 0.1035, "step": 67610 }, { "epoch": 1.5766547171461047, "grad_norm": 1.9660533666610718, "learning_rate": 4.7467823167319527e-07, "loss": 0.11, "step": 67620 }, { "epoch": 1.5768878785229226, "grad_norm": 1.5964157581329346, "learning_rate": 4.7460050985512653e-07, "loss": 0.1007, "step": 67630 }, { "epoch": 1.5771210398997406, "grad_norm": 1.2447930574417114, "learning_rate": 4.7452278803705774e-07, "loss": 0.1016, "step": 67640 }, { "epoch": 1.5773542012765587, "grad_norm": 2.210869073867798, "learning_rate": 4.74445066218989e-07, "loss": 0.111, "step": 67650 }, { "epoch": 1.5775873626533765, "grad_norm": 2.9201645851135254, "learning_rate": 4.743673444009202e-07, "loss": 0.1113, "step": 67660 }, { "epoch": 1.5778205240301943, "grad_norm": 
2.6274304389953613, "learning_rate": 4.742896225828514e-07, "loss": 0.1177, "step": 67670 }, { "epoch": 1.5780536854070122, "grad_norm": 3.531982898712158, "learning_rate": 4.742119007647827e-07, "loss": 0.119, "step": 67680 }, { "epoch": 1.5782868467838302, "grad_norm": 1.4556219577789307, "learning_rate": 4.7413417894671394e-07, "loss": 0.1062, "step": 67690 }, { "epoch": 1.5785200081606483, "grad_norm": 1.526472568511963, "learning_rate": 4.740564571286451e-07, "loss": 0.1236, "step": 67700 }, { "epoch": 1.5787531695374661, "grad_norm": 1.932268500328064, "learning_rate": 4.7397873531057636e-07, "loss": 0.1049, "step": 67710 }, { "epoch": 1.578986330914284, "grad_norm": 1.6782562732696533, "learning_rate": 4.739010134925076e-07, "loss": 0.0942, "step": 67720 }, { "epoch": 1.579219492291102, "grad_norm": 4.489398002624512, "learning_rate": 4.7382329167443883e-07, "loss": 0.1138, "step": 67730 }, { "epoch": 1.5794526536679199, "grad_norm": 1.4359285831451416, "learning_rate": 4.7374556985637004e-07, "loss": 0.1138, "step": 67740 }, { "epoch": 1.579685815044738, "grad_norm": 1.5754692554473877, "learning_rate": 4.736678480383013e-07, "loss": 0.0991, "step": 67750 }, { "epoch": 1.5799189764215558, "grad_norm": 2.37017560005188, "learning_rate": 4.735901262202325e-07, "loss": 0.1039, "step": 67760 }, { "epoch": 1.5801521377983736, "grad_norm": 1.2413848638534546, "learning_rate": 4.735124044021638e-07, "loss": 0.1087, "step": 67770 }, { "epoch": 1.5803852991751917, "grad_norm": 1.6611738204956055, "learning_rate": 4.73434682584095e-07, "loss": 0.1119, "step": 67780 }, { "epoch": 1.5806184605520097, "grad_norm": 1.888283133506775, "learning_rate": 4.733569607660262e-07, "loss": 0.1107, "step": 67790 }, { "epoch": 1.5808516219288276, "grad_norm": 1.2322145700454712, "learning_rate": 4.7327923894795745e-07, "loss": 0.0963, "step": 67800 }, { "epoch": 1.5810847833056454, "grad_norm": 2.4055051803588867, "learning_rate": 4.732015171298887e-07, "loss": 0.1073, "step": 
67810 }, { "epoch": 1.5813179446824632, "grad_norm": 2.2210471630096436, "learning_rate": 4.7312379531181987e-07, "loss": 0.1, "step": 67820 }, { "epoch": 1.5815511060592813, "grad_norm": 1.0303009748458862, "learning_rate": 4.7304607349375113e-07, "loss": 0.1028, "step": 67830 }, { "epoch": 1.5817842674360993, "grad_norm": 2.1494548320770264, "learning_rate": 4.729683516756824e-07, "loss": 0.1019, "step": 67840 }, { "epoch": 1.5820174288129172, "grad_norm": 1.5922942161560059, "learning_rate": 4.728906298576136e-07, "loss": 0.119, "step": 67850 }, { "epoch": 1.582250590189735, "grad_norm": 1.3714606761932373, "learning_rate": 4.728129080395448e-07, "loss": 0.1065, "step": 67860 }, { "epoch": 1.5824837515665529, "grad_norm": 2.907914876937866, "learning_rate": 4.727351862214761e-07, "loss": 0.1159, "step": 67870 }, { "epoch": 1.582716912943371, "grad_norm": 1.3227380514144897, "learning_rate": 4.726574644034073e-07, "loss": 0.1037, "step": 67880 }, { "epoch": 1.582950074320189, "grad_norm": 2.252082586288452, "learning_rate": 4.7257974258533855e-07, "loss": 0.1125, "step": 67890 }, { "epoch": 1.5831832356970068, "grad_norm": 2.343876361846924, "learning_rate": 4.7250202076726976e-07, "loss": 0.1077, "step": 67900 }, { "epoch": 1.5834163970738246, "grad_norm": 1.208563208580017, "learning_rate": 4.72424298949201e-07, "loss": 0.0922, "step": 67910 }, { "epoch": 1.5836495584506427, "grad_norm": 1.1336712837219238, "learning_rate": 4.7234657713113223e-07, "loss": 0.1112, "step": 67920 }, { "epoch": 1.5838827198274605, "grad_norm": 2.411961317062378, "learning_rate": 4.722688553130635e-07, "loss": 0.1242, "step": 67930 }, { "epoch": 1.5841158812042786, "grad_norm": 1.3343485593795776, "learning_rate": 4.721911334949947e-07, "loss": 0.1086, "step": 67940 }, { "epoch": 1.5843490425810964, "grad_norm": 1.328244686126709, "learning_rate": 4.721134116769259e-07, "loss": 0.1126, "step": 67950 }, { "epoch": 1.5845822039579143, "grad_norm": 1.3782886266708374, "learning_rate": 
4.7203568985885717e-07, "loss": 0.1078, "step": 67960 }, { "epoch": 1.5848153653347323, "grad_norm": 1.7094545364379883, "learning_rate": 4.7195796804078843e-07, "loss": 0.0969, "step": 67970 }, { "epoch": 1.5850485267115504, "grad_norm": 1.8228861093521118, "learning_rate": 4.718802462227196e-07, "loss": 0.1159, "step": 67980 }, { "epoch": 1.5852816880883682, "grad_norm": 1.903275489807129, "learning_rate": 4.7180252440465085e-07, "loss": 0.105, "step": 67990 }, { "epoch": 1.585514849465186, "grad_norm": 1.6368656158447266, "learning_rate": 4.717248025865821e-07, "loss": 0.1005, "step": 68000 }, { "epoch": 1.585748010842004, "grad_norm": 4.263312339782715, "learning_rate": 4.716470807685133e-07, "loss": 0.1089, "step": 68010 }, { "epoch": 1.585981172218822, "grad_norm": 1.5183426141738892, "learning_rate": 4.7156935895044453e-07, "loss": 0.113, "step": 68020 }, { "epoch": 1.58621433359564, "grad_norm": 1.058186411857605, "learning_rate": 4.714916371323758e-07, "loss": 0.099, "step": 68030 }, { "epoch": 1.5864474949724579, "grad_norm": 1.8503508567810059, "learning_rate": 4.71413915314307e-07, "loss": 0.0989, "step": 68040 }, { "epoch": 1.5866806563492757, "grad_norm": 1.1743967533111572, "learning_rate": 4.7133619349623827e-07, "loss": 0.101, "step": 68050 }, { "epoch": 1.5869138177260935, "grad_norm": 1.535286545753479, "learning_rate": 4.712584716781695e-07, "loss": 0.1098, "step": 68060 }, { "epoch": 1.5871469791029116, "grad_norm": 1.504652500152588, "learning_rate": 4.711807498601007e-07, "loss": 0.1107, "step": 68070 }, { "epoch": 1.5873801404797296, "grad_norm": 1.9699889421463013, "learning_rate": 4.7110302804203195e-07, "loss": 0.1077, "step": 68080 }, { "epoch": 1.5876133018565475, "grad_norm": 2.4655094146728516, "learning_rate": 4.710253062239632e-07, "loss": 0.1029, "step": 68090 }, { "epoch": 1.5878464632333653, "grad_norm": 1.3922734260559082, "learning_rate": 4.7094758440589437e-07, "loss": 0.1107, "step": 68100 }, { "epoch": 1.5880796246101834, 
"grad_norm": 1.9035321474075317, "learning_rate": 4.7086986258782563e-07, "loss": 0.1077, "step": 68110 }, { "epoch": 1.5883127859870012, "grad_norm": 1.3008514642715454, "learning_rate": 4.707921407697569e-07, "loss": 0.1118, "step": 68120 }, { "epoch": 1.5885459473638193, "grad_norm": 1.6967204809188843, "learning_rate": 4.707144189516881e-07, "loss": 0.11, "step": 68130 }, { "epoch": 1.588779108740637, "grad_norm": 2.182121515274048, "learning_rate": 4.706366971336193e-07, "loss": 0.1087, "step": 68140 }, { "epoch": 1.589012270117455, "grad_norm": 1.6709822416305542, "learning_rate": 4.7055897531555057e-07, "loss": 0.1047, "step": 68150 }, { "epoch": 1.589245431494273, "grad_norm": 1.1291698217391968, "learning_rate": 4.704812534974818e-07, "loss": 0.0976, "step": 68160 }, { "epoch": 1.589478592871091, "grad_norm": 3.4367105960845947, "learning_rate": 4.7040353167941304e-07, "loss": 0.1181, "step": 68170 }, { "epoch": 1.589711754247909, "grad_norm": 1.6400463581085205, "learning_rate": 4.7032580986134425e-07, "loss": 0.1157, "step": 68180 }, { "epoch": 1.5899449156247267, "grad_norm": 1.522522211074829, "learning_rate": 4.7024808804327546e-07, "loss": 0.0901, "step": 68190 }, { "epoch": 1.5901780770015446, "grad_norm": 1.5485875606536865, "learning_rate": 4.701703662252067e-07, "loss": 0.1069, "step": 68200 }, { "epoch": 1.5904112383783626, "grad_norm": 1.1761624813079834, "learning_rate": 4.70092644407138e-07, "loss": 0.105, "step": 68210 }, { "epoch": 1.5906443997551807, "grad_norm": 1.3136876821517944, "learning_rate": 4.7001492258906914e-07, "loss": 0.1111, "step": 68220 }, { "epoch": 1.5908775611319985, "grad_norm": 1.6531646251678467, "learning_rate": 4.699372007710004e-07, "loss": 0.1108, "step": 68230 }, { "epoch": 1.5911107225088164, "grad_norm": 1.3424835205078125, "learning_rate": 4.6985947895293167e-07, "loss": 0.1149, "step": 68240 }, { "epoch": 1.5913438838856342, "grad_norm": 1.5120010375976562, "learning_rate": 4.697817571348629e-07, "loss": 
0.1027, "step": 68250 }, { "epoch": 1.5915770452624523, "grad_norm": 1.1139543056488037, "learning_rate": 4.697040353167941e-07, "loss": 0.1149, "step": 68260 }, { "epoch": 1.5918102066392703, "grad_norm": 1.3202104568481445, "learning_rate": 4.6962631349872535e-07, "loss": 0.1027, "step": 68270 }, { "epoch": 1.5920433680160881, "grad_norm": 2.3665547370910645, "learning_rate": 4.6954859168065656e-07, "loss": 0.1114, "step": 68280 }, { "epoch": 1.592276529392906, "grad_norm": 1.5695042610168457, "learning_rate": 4.694708698625878e-07, "loss": 0.1122, "step": 68290 }, { "epoch": 1.592509690769724, "grad_norm": 1.4936909675598145, "learning_rate": 4.6939314804451903e-07, "loss": 0.1187, "step": 68300 }, { "epoch": 1.5927428521465419, "grad_norm": 1.4364782571792603, "learning_rate": 4.6931542622645024e-07, "loss": 0.1138, "step": 68310 }, { "epoch": 1.59297601352336, "grad_norm": 1.3901406526565552, "learning_rate": 4.692377044083815e-07, "loss": 0.0971, "step": 68320 }, { "epoch": 1.5932091749001778, "grad_norm": 1.4449931383132935, "learning_rate": 4.6915998259031276e-07, "loss": 0.0972, "step": 68330 }, { "epoch": 1.5934423362769956, "grad_norm": 2.193572998046875, "learning_rate": 4.690822607722439e-07, "loss": 0.109, "step": 68340 }, { "epoch": 1.5936754976538137, "grad_norm": 1.963632583618164, "learning_rate": 4.690045389541752e-07, "loss": 0.1106, "step": 68350 }, { "epoch": 1.5939086590306317, "grad_norm": 1.6875261068344116, "learning_rate": 4.6892681713610644e-07, "loss": 0.1002, "step": 68360 }, { "epoch": 1.5941418204074496, "grad_norm": 4.402089595794678, "learning_rate": 4.688490953180377e-07, "loss": 0.1141, "step": 68370 }, { "epoch": 1.5943749817842674, "grad_norm": 2.5236663818359375, "learning_rate": 4.6877137349996886e-07, "loss": 0.1059, "step": 68380 }, { "epoch": 1.5946081431610852, "grad_norm": 1.5418715476989746, "learning_rate": 4.686936516819001e-07, "loss": 0.1118, "step": 68390 }, { "epoch": 1.5948413045379033, "grad_norm": 
2.2014811038970947, "learning_rate": 4.686159298638314e-07, "loss": 0.1145, "step": 68400 }, { "epoch": 1.5950744659147214, "grad_norm": 3.6010823249816895, "learning_rate": 4.685382080457626e-07, "loss": 0.1045, "step": 68410 }, { "epoch": 1.5953076272915392, "grad_norm": 2.8477671146392822, "learning_rate": 4.684604862276938e-07, "loss": 0.112, "step": 68420 }, { "epoch": 1.595540788668357, "grad_norm": 1.8894561529159546, "learning_rate": 4.6838276440962506e-07, "loss": 0.0989, "step": 68430 }, { "epoch": 1.5957739500451749, "grad_norm": 1.4584777355194092, "learning_rate": 4.6830504259155627e-07, "loss": 0.1095, "step": 68440 }, { "epoch": 1.596007111421993, "grad_norm": 1.5815211534500122, "learning_rate": 4.6822732077348754e-07, "loss": 0.1015, "step": 68450 }, { "epoch": 1.596240272798811, "grad_norm": 2.6657345294952393, "learning_rate": 4.6814959895541874e-07, "loss": 0.1096, "step": 68460 }, { "epoch": 1.5964734341756288, "grad_norm": 1.852246880531311, "learning_rate": 4.6807187713734995e-07, "loss": 0.1147, "step": 68470 }, { "epoch": 1.5967065955524467, "grad_norm": 1.6855219602584839, "learning_rate": 4.679941553192812e-07, "loss": 0.111, "step": 68480 }, { "epoch": 1.5969397569292647, "grad_norm": 2.361829996109009, "learning_rate": 4.679164335012125e-07, "loss": 0.1079, "step": 68490 }, { "epoch": 1.5971729183060825, "grad_norm": 2.3879475593566895, "learning_rate": 4.6783871168314363e-07, "loss": 0.0997, "step": 68500 }, { "epoch": 1.5974060796829006, "grad_norm": 1.6306540966033936, "learning_rate": 4.677609898650749e-07, "loss": 0.1072, "step": 68510 }, { "epoch": 1.5976392410597184, "grad_norm": 1.8137789964675903, "learning_rate": 4.6768326804700616e-07, "loss": 0.1014, "step": 68520 }, { "epoch": 1.5978724024365363, "grad_norm": 1.3795074224472046, "learning_rate": 4.6760554622893737e-07, "loss": 0.0946, "step": 68530 }, { "epoch": 1.5981055638133543, "grad_norm": 2.1508994102478027, "learning_rate": 4.675278244108686e-07, "loss": 0.0921, 
"step": 68540 }, { "epoch": 1.5983387251901724, "grad_norm": 2.397588014602661, "learning_rate": 4.6745010259279984e-07, "loss": 0.1023, "step": 68550 }, { "epoch": 1.5985718865669902, "grad_norm": 1.4890600442886353, "learning_rate": 4.6737238077473105e-07, "loss": 0.0998, "step": 68560 }, { "epoch": 1.598805047943808, "grad_norm": 1.137418270111084, "learning_rate": 4.672946589566623e-07, "loss": 0.1013, "step": 68570 }, { "epoch": 1.599038209320626, "grad_norm": 1.9081517457962036, "learning_rate": 4.672169371385935e-07, "loss": 0.1151, "step": 68580 }, { "epoch": 1.599271370697444, "grad_norm": 1.4350148439407349, "learning_rate": 4.6713921532052473e-07, "loss": 0.1081, "step": 68590 }, { "epoch": 1.599504532074262, "grad_norm": 1.1113444566726685, "learning_rate": 4.67061493502456e-07, "loss": 0.1155, "step": 68600 }, { "epoch": 1.5997376934510799, "grad_norm": 1.7211952209472656, "learning_rate": 4.6698377168438725e-07, "loss": 0.1038, "step": 68610 }, { "epoch": 1.5999708548278977, "grad_norm": 2.0621962547302246, "learning_rate": 4.669060498663184e-07, "loss": 0.1098, "step": 68620 }, { "epoch": 1.6002040162047155, "grad_norm": 2.797079563140869, "learning_rate": 4.6682832804824967e-07, "loss": 0.1082, "step": 68630 }, { "epoch": 1.6004371775815336, "grad_norm": 2.123638868331909, "learning_rate": 4.6675060623018093e-07, "loss": 0.1166, "step": 68640 }, { "epoch": 1.6006703389583516, "grad_norm": 2.245112180709839, "learning_rate": 4.6667288441211214e-07, "loss": 0.1044, "step": 68650 }, { "epoch": 1.6009035003351695, "grad_norm": 1.8008314371109009, "learning_rate": 4.6659516259404335e-07, "loss": 0.116, "step": 68660 }, { "epoch": 1.6011366617119873, "grad_norm": 1.2704640626907349, "learning_rate": 4.665174407759746e-07, "loss": 0.0955, "step": 68670 }, { "epoch": 1.6013698230888054, "grad_norm": 1.629655361175537, "learning_rate": 4.664397189579058e-07, "loss": 0.1039, "step": 68680 }, { "epoch": 1.6016029844656234, "grad_norm": 1.0146961212158203, 
"learning_rate": 4.663619971398371e-07, "loss": 0.1078, "step": 68690 }, { "epoch": 1.6018361458424413, "grad_norm": 1.9233728647232056, "learning_rate": 4.662842753217683e-07, "loss": 0.1086, "step": 68700 }, { "epoch": 1.602069307219259, "grad_norm": 1.4350981712341309, "learning_rate": 4.662065535036995e-07, "loss": 0.122, "step": 68710 }, { "epoch": 1.602302468596077, "grad_norm": 1.4365241527557373, "learning_rate": 4.6612883168563077e-07, "loss": 0.1053, "step": 68720 }, { "epoch": 1.602535629972895, "grad_norm": 1.1238726377487183, "learning_rate": 4.6605110986756203e-07, "loss": 0.1, "step": 68730 }, { "epoch": 1.602768791349713, "grad_norm": 3.079216957092285, "learning_rate": 4.659733880494932e-07, "loss": 0.1236, "step": 68740 }, { "epoch": 1.603001952726531, "grad_norm": 1.0186842679977417, "learning_rate": 4.6589566623142445e-07, "loss": 0.1019, "step": 68750 }, { "epoch": 1.6032351141033487, "grad_norm": 1.8381812572479248, "learning_rate": 4.658179444133557e-07, "loss": 0.1187, "step": 68760 }, { "epoch": 1.6034682754801666, "grad_norm": 2.4689998626708984, "learning_rate": 4.6574022259528697e-07, "loss": 0.1008, "step": 68770 }, { "epoch": 1.6037014368569846, "grad_norm": 1.2665891647338867, "learning_rate": 4.6566250077721813e-07, "loss": 0.1162, "step": 68780 }, { "epoch": 1.6039345982338027, "grad_norm": 1.4266360998153687, "learning_rate": 4.655847789591494e-07, "loss": 0.1186, "step": 68790 }, { "epoch": 1.6041677596106205, "grad_norm": 1.523736834526062, "learning_rate": 4.6550705714108065e-07, "loss": 0.101, "step": 68800 }, { "epoch": 1.6044009209874384, "grad_norm": 1.3561828136444092, "learning_rate": 4.6542933532301186e-07, "loss": 0.11, "step": 68810 }, { "epoch": 1.6046340823642562, "grad_norm": 1.7635774612426758, "learning_rate": 4.6535161350494307e-07, "loss": 0.1254, "step": 68820 }, { "epoch": 1.6048672437410743, "grad_norm": 1.8802645206451416, "learning_rate": 4.6527389168687433e-07, "loss": 0.1115, "step": 68830 }, { "epoch": 
1.6051004051178923, "grad_norm": 1.801672101020813, "learning_rate": 4.6519616986880554e-07, "loss": 0.1061, "step": 68840 }, { "epoch": 1.6053335664947102, "grad_norm": 2.372563362121582, "learning_rate": 4.651184480507368e-07, "loss": 0.1062, "step": 68850 }, { "epoch": 1.605566727871528, "grad_norm": 2.0433011054992676, "learning_rate": 4.65040726232668e-07, "loss": 0.1183, "step": 68860 }, { "epoch": 1.605799889248346, "grad_norm": 3.588603973388672, "learning_rate": 4.649630044145992e-07, "loss": 0.1132, "step": 68870 }, { "epoch": 1.606033050625164, "grad_norm": 1.8934053182601929, "learning_rate": 4.648852825965305e-07, "loss": 0.099, "step": 68880 }, { "epoch": 1.606266212001982, "grad_norm": 4.285525321960449, "learning_rate": 4.6480756077846175e-07, "loss": 0.1113, "step": 68890 }, { "epoch": 1.6064993733787998, "grad_norm": 1.4977738857269287, "learning_rate": 4.647298389603929e-07, "loss": 0.1089, "step": 68900 }, { "epoch": 1.6067325347556176, "grad_norm": 1.2052651643753052, "learning_rate": 4.6465211714232416e-07, "loss": 0.1041, "step": 68910 }, { "epoch": 1.6069656961324357, "grad_norm": 1.4587329626083374, "learning_rate": 4.6457439532425543e-07, "loss": 0.1102, "step": 68920 }, { "epoch": 1.6071988575092537, "grad_norm": 2.5195913314819336, "learning_rate": 4.6449667350618664e-07, "loss": 0.1041, "step": 68930 }, { "epoch": 1.6074320188860716, "grad_norm": 1.2725539207458496, "learning_rate": 4.6441895168811785e-07, "loss": 0.0996, "step": 68940 }, { "epoch": 1.6076651802628894, "grad_norm": 1.3805863857269287, "learning_rate": 4.643412298700491e-07, "loss": 0.0974, "step": 68950 }, { "epoch": 1.6078983416397072, "grad_norm": 1.1898192167282104, "learning_rate": 4.642635080519803e-07, "loss": 0.1006, "step": 68960 }, { "epoch": 1.6081315030165253, "grad_norm": 1.3902828693389893, "learning_rate": 4.641857862339116e-07, "loss": 0.1134, "step": 68970 }, { "epoch": 1.6083646643933434, "grad_norm": 2.0310585498809814, "learning_rate": 
4.641080644158428e-07, "loss": 0.1153, "step": 68980 }, { "epoch": 1.6085978257701612, "grad_norm": 3.193455457687378, "learning_rate": 4.64030342597774e-07, "loss": 0.1002, "step": 68990 }, { "epoch": 1.608830987146979, "grad_norm": 2.258652925491333, "learning_rate": 4.6395262077970526e-07, "loss": 0.1049, "step": 69000 }, { "epoch": 1.6090641485237969, "grad_norm": 1.4050923585891724, "learning_rate": 4.638748989616365e-07, "loss": 0.1162, "step": 69010 }, { "epoch": 1.609297309900615, "grad_norm": 1.941277265548706, "learning_rate": 4.637971771435677e-07, "loss": 0.11, "step": 69020 }, { "epoch": 1.609530471277433, "grad_norm": 2.7472646236419678, "learning_rate": 4.6371945532549894e-07, "loss": 0.1045, "step": 69030 }, { "epoch": 1.6097636326542508, "grad_norm": 1.5712831020355225, "learning_rate": 4.636417335074302e-07, "loss": 0.1048, "step": 69040 }, { "epoch": 1.6099967940310687, "grad_norm": 2.8925588130950928, "learning_rate": 4.635640116893614e-07, "loss": 0.1085, "step": 69050 }, { "epoch": 1.6102299554078867, "grad_norm": 3.123363733291626, "learning_rate": 4.634862898712926e-07, "loss": 0.1132, "step": 69060 }, { "epoch": 1.6104631167847048, "grad_norm": 1.4624485969543457, "learning_rate": 4.634085680532239e-07, "loss": 0.1034, "step": 69070 }, { "epoch": 1.6106962781615226, "grad_norm": 1.3268479108810425, "learning_rate": 4.633308462351551e-07, "loss": 0.111, "step": 69080 }, { "epoch": 1.6109294395383404, "grad_norm": 1.4969615936279297, "learning_rate": 4.6325312441708635e-07, "loss": 0.1168, "step": 69090 }, { "epoch": 1.6111626009151583, "grad_norm": 1.3681665658950806, "learning_rate": 4.6317540259901756e-07, "loss": 0.1132, "step": 69100 }, { "epoch": 1.6113957622919763, "grad_norm": 1.4912745952606201, "learning_rate": 4.6309768078094877e-07, "loss": 0.1015, "step": 69110 }, { "epoch": 1.6116289236687944, "grad_norm": 1.7084941864013672, "learning_rate": 4.6301995896288003e-07, "loss": 0.1013, "step": 69120 }, { "epoch": 1.6118620850456122, 
"grad_norm": 2.1330173015594482, "learning_rate": 4.629422371448113e-07, "loss": 0.1065, "step": 69130 }, { "epoch": 1.61209524642243, "grad_norm": 1.7113819122314453, "learning_rate": 4.6286451532674245e-07, "loss": 0.1119, "step": 69140 }, { "epoch": 1.612328407799248, "grad_norm": 0.9926828742027283, "learning_rate": 4.627867935086737e-07, "loss": 0.1057, "step": 69150 }, { "epoch": 1.612561569176066, "grad_norm": 1.6919447183609009, "learning_rate": 4.62709071690605e-07, "loss": 0.1148, "step": 69160 }, { "epoch": 1.612794730552884, "grad_norm": 4.000744342803955, "learning_rate": 4.6263134987253624e-07, "loss": 0.1037, "step": 69170 }, { "epoch": 1.6130278919297019, "grad_norm": 1.18421471118927, "learning_rate": 4.625536280544674e-07, "loss": 0.1078, "step": 69180 }, { "epoch": 1.6132610533065197, "grad_norm": 1.9383422136306763, "learning_rate": 4.6247590623639866e-07, "loss": 0.1195, "step": 69190 }, { "epoch": 1.6134942146833378, "grad_norm": 1.4186166524887085, "learning_rate": 4.623981844183299e-07, "loss": 0.1072, "step": 69200 }, { "epoch": 1.6137273760601556, "grad_norm": 4.694772720336914, "learning_rate": 4.6232046260026113e-07, "loss": 0.0985, "step": 69210 }, { "epoch": 1.6139605374369737, "grad_norm": 2.281386375427246, "learning_rate": 4.6224274078219234e-07, "loss": 0.1203, "step": 69220 }, { "epoch": 1.6141936988137915, "grad_norm": 1.5579631328582764, "learning_rate": 4.621650189641236e-07, "loss": 0.0979, "step": 69230 }, { "epoch": 1.6144268601906093, "grad_norm": 1.3430812358856201, "learning_rate": 4.620872971460548e-07, "loss": 0.1008, "step": 69240 }, { "epoch": 1.6146600215674274, "grad_norm": 2.3199880123138428, "learning_rate": 4.6200957532798607e-07, "loss": 0.1076, "step": 69250 }, { "epoch": 1.6148931829442454, "grad_norm": 1.4729657173156738, "learning_rate": 4.619318535099173e-07, "loss": 0.1168, "step": 69260 }, { "epoch": 1.6151263443210633, "grad_norm": 2.2225537300109863, "learning_rate": 4.618541316918485e-07, "loss": 
0.1129, "step": 69270 }, { "epoch": 1.6153595056978811, "grad_norm": 1.3361332416534424, "learning_rate": 4.6177640987377975e-07, "loss": 0.1065, "step": 69280 }, { "epoch": 1.615592667074699, "grad_norm": 1.2170971632003784, "learning_rate": 4.61698688055711e-07, "loss": 0.0991, "step": 69290 }, { "epoch": 1.615825828451517, "grad_norm": 1.6603044271469116, "learning_rate": 4.6162096623764217e-07, "loss": 0.1144, "step": 69300 }, { "epoch": 1.616058989828335, "grad_norm": 1.7435307502746582, "learning_rate": 4.6154324441957343e-07, "loss": 0.1165, "step": 69310 }, { "epoch": 1.616292151205153, "grad_norm": 1.1125860214233398, "learning_rate": 4.614655226015047e-07, "loss": 0.1018, "step": 69320 }, { "epoch": 1.6165253125819707, "grad_norm": 1.6688798666000366, "learning_rate": 4.613878007834359e-07, "loss": 0.1142, "step": 69330 }, { "epoch": 1.6167584739587886, "grad_norm": 1.3171372413635254, "learning_rate": 4.613100789653671e-07, "loss": 0.1028, "step": 69340 }, { "epoch": 1.6169916353356066, "grad_norm": 1.1213338375091553, "learning_rate": 4.612323571472984e-07, "loss": 0.104, "step": 69350 }, { "epoch": 1.6172247967124247, "grad_norm": 2.594392776489258, "learning_rate": 4.611546353292296e-07, "loss": 0.1187, "step": 69360 }, { "epoch": 1.6174579580892425, "grad_norm": 2.1224234104156494, "learning_rate": 4.6107691351116085e-07, "loss": 0.1063, "step": 69370 }, { "epoch": 1.6176911194660604, "grad_norm": 2.155958890914917, "learning_rate": 4.6099919169309206e-07, "loss": 0.1164, "step": 69380 }, { "epoch": 1.6179242808428784, "grad_norm": 2.537400722503662, "learning_rate": 4.6092146987502327e-07, "loss": 0.1037, "step": 69390 }, { "epoch": 1.6181574422196963, "grad_norm": 1.0456689596176147, "learning_rate": 4.6084374805695453e-07, "loss": 0.105, "step": 69400 }, { "epoch": 1.6183906035965143, "grad_norm": 1.6790188550949097, "learning_rate": 4.607660262388858e-07, "loss": 0.1049, "step": 69410 }, { "epoch": 1.6186237649733322, "grad_norm": 
1.3994332551956177, "learning_rate": 4.6068830442081695e-07, "loss": 0.1051, "step": 69420 }, { "epoch": 1.61885692635015, "grad_norm": 1.6095149517059326, "learning_rate": 4.606105826027482e-07, "loss": 0.11, "step": 69430 }, { "epoch": 1.619090087726968, "grad_norm": 2.3644566535949707, "learning_rate": 4.6053286078467947e-07, "loss": 0.1085, "step": 69440 }, { "epoch": 1.619323249103786, "grad_norm": 2.6368680000305176, "learning_rate": 4.604551389666107e-07, "loss": 0.114, "step": 69450 }, { "epoch": 1.619556410480604, "grad_norm": 2.4841952323913574, "learning_rate": 4.603851893303488e-07, "loss": 0.0969, "step": 69460 }, { "epoch": 1.6197895718574218, "grad_norm": 2.4747886657714844, "learning_rate": 4.6030746751228e-07, "loss": 0.0988, "step": 69470 }, { "epoch": 1.6200227332342396, "grad_norm": 1.1619206666946411, "learning_rate": 4.602297456942112e-07, "loss": 0.1053, "step": 69480 }, { "epoch": 1.6202558946110577, "grad_norm": 2.302812099456787, "learning_rate": 4.601520238761425e-07, "loss": 0.1076, "step": 69490 }, { "epoch": 1.6204890559878757, "grad_norm": 1.941768765449524, "learning_rate": 4.6007430205807375e-07, "loss": 0.1247, "step": 69500 }, { "epoch": 1.6207222173646936, "grad_norm": 3.5253567695617676, "learning_rate": 4.599965802400049e-07, "loss": 0.1156, "step": 69510 }, { "epoch": 1.6209553787415114, "grad_norm": 1.3115131855010986, "learning_rate": 4.5991885842193617e-07, "loss": 0.1073, "step": 69520 }, { "epoch": 1.6211885401183292, "grad_norm": 1.3536536693572998, "learning_rate": 4.5984113660386743e-07, "loss": 0.1022, "step": 69530 }, { "epoch": 1.6214217014951473, "grad_norm": 1.0929309129714966, "learning_rate": 4.597634147857987e-07, "loss": 0.1121, "step": 69540 }, { "epoch": 1.6216548628719654, "grad_norm": 1.9386059045791626, "learning_rate": 4.5968569296772985e-07, "loss": 0.1072, "step": 69550 }, { "epoch": 1.6218880242487832, "grad_norm": 1.4206987619400024, "learning_rate": 4.596079711496611e-07, "loss": 0.1128, "step": 
69560 }, { "epoch": 1.622121185625601, "grad_norm": 1.9498858451843262, "learning_rate": 4.5953024933159237e-07, "loss": 0.1171, "step": 69570 }, { "epoch": 1.622354347002419, "grad_norm": 3.9757933616638184, "learning_rate": 4.594525275135236e-07, "loss": 0.1024, "step": 69580 }, { "epoch": 1.622587508379237, "grad_norm": 2.114332675933838, "learning_rate": 4.593748056954548e-07, "loss": 0.0907, "step": 69590 }, { "epoch": 1.622820669756055, "grad_norm": 2.2567758560180664, "learning_rate": 4.5929708387738605e-07, "loss": 0.1091, "step": 69600 }, { "epoch": 1.6230538311328728, "grad_norm": 1.5105903148651123, "learning_rate": 4.5921936205931726e-07, "loss": 0.1026, "step": 69610 }, { "epoch": 1.6232869925096907, "grad_norm": 1.2491509914398193, "learning_rate": 4.591416402412485e-07, "loss": 0.1084, "step": 69620 }, { "epoch": 1.6235201538865087, "grad_norm": 2.0238516330718994, "learning_rate": 4.5906391842317973e-07, "loss": 0.1192, "step": 69630 }, { "epoch": 1.6237533152633268, "grad_norm": 1.2950268983840942, "learning_rate": 4.5898619660511094e-07, "loss": 0.1096, "step": 69640 }, { "epoch": 1.6239864766401446, "grad_norm": 1.1068620681762695, "learning_rate": 4.589084747870422e-07, "loss": 0.1044, "step": 69650 }, { "epoch": 1.6242196380169625, "grad_norm": 1.2690657377243042, "learning_rate": 4.5883075296897347e-07, "loss": 0.1059, "step": 69660 }, { "epoch": 1.6244527993937803, "grad_norm": 2.9441475868225098, "learning_rate": 4.587530311509046e-07, "loss": 0.1029, "step": 69670 }, { "epoch": 1.6246859607705983, "grad_norm": 1.3151776790618896, "learning_rate": 4.586753093328359e-07, "loss": 0.1028, "step": 69680 }, { "epoch": 1.6249191221474164, "grad_norm": 1.310211420059204, "learning_rate": 4.5859758751476715e-07, "loss": 0.1009, "step": 69690 }, { "epoch": 1.6251522835242342, "grad_norm": 1.6717298030853271, "learning_rate": 4.5851986569669836e-07, "loss": 0.1108, "step": 69700 }, { "epoch": 1.625385444901052, "grad_norm": 1.7549976110458374, 
"learning_rate": 4.5844214387862956e-07, "loss": 0.1011, "step": 69710 }, { "epoch": 1.62561860627787, "grad_norm": 2.1316401958465576, "learning_rate": 4.5836442206056083e-07, "loss": 0.1099, "step": 69720 }, { "epoch": 1.625851767654688, "grad_norm": 1.6062157154083252, "learning_rate": 4.5828670024249204e-07, "loss": 0.0961, "step": 69730 }, { "epoch": 1.626084929031506, "grad_norm": 2.582611322402954, "learning_rate": 4.582089784244233e-07, "loss": 0.1123, "step": 69740 }, { "epoch": 1.6263180904083239, "grad_norm": 1.1576193571090698, "learning_rate": 4.581312566063545e-07, "loss": 0.109, "step": 69750 }, { "epoch": 1.6265512517851417, "grad_norm": 0.974769651889801, "learning_rate": 4.580535347882857e-07, "loss": 0.0951, "step": 69760 }, { "epoch": 1.6267844131619598, "grad_norm": 1.4495385885238647, "learning_rate": 4.57975812970217e-07, "loss": 0.1192, "step": 69770 }, { "epoch": 1.6270175745387776, "grad_norm": 1.4923174381256104, "learning_rate": 4.5789809115214824e-07, "loss": 0.108, "step": 69780 }, { "epoch": 1.6272507359155957, "grad_norm": 1.125556230545044, "learning_rate": 4.578203693340794e-07, "loss": 0.116, "step": 69790 }, { "epoch": 1.6274838972924135, "grad_norm": 1.8946292400360107, "learning_rate": 4.5774264751601066e-07, "loss": 0.1115, "step": 69800 }, { "epoch": 1.6277170586692313, "grad_norm": 1.4057269096374512, "learning_rate": 4.576649256979419e-07, "loss": 0.1053, "step": 69810 }, { "epoch": 1.6279502200460494, "grad_norm": 1.421675443649292, "learning_rate": 4.5758720387987313e-07, "loss": 0.1057, "step": 69820 }, { "epoch": 1.6281833814228674, "grad_norm": 2.1527607440948486, "learning_rate": 4.5750948206180434e-07, "loss": 0.1159, "step": 69830 }, { "epoch": 1.6284165427996853, "grad_norm": 1.0995930433273315, "learning_rate": 4.574317602437356e-07, "loss": 0.1055, "step": 69840 }, { "epoch": 1.6286497041765031, "grad_norm": 1.1796598434448242, "learning_rate": 4.573540384256668e-07, "loss": 0.0984, "step": 69850 }, { "epoch": 
1.628882865553321, "grad_norm": 1.7826030254364014, "learning_rate": 4.572763166075981e-07, "loss": 0.1125, "step": 69860 }, { "epoch": 1.629116026930139, "grad_norm": 1.2168687582015991, "learning_rate": 4.571985947895293e-07, "loss": 0.1075, "step": 69870 }, { "epoch": 1.629349188306957, "grad_norm": 1.311477780342102, "learning_rate": 4.571208729714605e-07, "loss": 0.1105, "step": 69880 }, { "epoch": 1.629582349683775, "grad_norm": 2.4002439975738525, "learning_rate": 4.5704315115339175e-07, "loss": 0.1001, "step": 69890 }, { "epoch": 1.6298155110605927, "grad_norm": 1.6525484323501587, "learning_rate": 4.56965429335323e-07, "loss": 0.1044, "step": 69900 }, { "epoch": 1.6300486724374106, "grad_norm": 1.5050127506256104, "learning_rate": 4.568877075172542e-07, "loss": 0.1034, "step": 69910 }, { "epoch": 1.6302818338142286, "grad_norm": 2.633427143096924, "learning_rate": 4.5680998569918543e-07, "loss": 0.1065, "step": 69920 }, { "epoch": 1.6305149951910467, "grad_norm": 1.4163813591003418, "learning_rate": 4.567322638811167e-07, "loss": 0.1105, "step": 69930 }, { "epoch": 1.6307481565678645, "grad_norm": 1.5737659931182861, "learning_rate": 4.566545420630479e-07, "loss": 0.1026, "step": 69940 }, { "epoch": 1.6309813179446824, "grad_norm": 1.7150208950042725, "learning_rate": 4.5657682024497917e-07, "loss": 0.108, "step": 69950 }, { "epoch": 1.6312144793215004, "grad_norm": 1.1943405866622925, "learning_rate": 4.564990984269104e-07, "loss": 0.1044, "step": 69960 }, { "epoch": 1.6314476406983183, "grad_norm": 1.2266175746917725, "learning_rate": 4.564213766088416e-07, "loss": 0.1048, "step": 69970 }, { "epoch": 1.6316808020751363, "grad_norm": 1.144209623336792, "learning_rate": 4.5634365479077285e-07, "loss": 0.1084, "step": 69980 }, { "epoch": 1.6319139634519542, "grad_norm": 1.2050718069076538, "learning_rate": 4.562659329727041e-07, "loss": 0.102, "step": 69990 }, { "epoch": 1.632147124828772, "grad_norm": 1.2724629640579224, "learning_rate": 
4.561882111546353e-07, "loss": 0.1074, "step": 70000 }, { "epoch": 1.632147124828772, "eval_accuracy": 0.9472036451906588, "eval_f1": 0.9622807473029092, "eval_loss": 0.13738703727722168, "eval_runtime": 3905.4547, "eval_samples_per_second": 468.554, "eval_steps_per_second": 58.569, "step": 70000 }, { "epoch": 1.63238028620559, "grad_norm": 1.5057648420333862, "learning_rate": 4.5611048933656653e-07, "loss": 0.1059, "step": 70010 }, { "epoch": 1.6326134475824081, "grad_norm": 1.516053318977356, "learning_rate": 4.560327675184978e-07, "loss": 0.1105, "step": 70020 }, { "epoch": 1.632846608959226, "grad_norm": 2.209014415740967, "learning_rate": 4.5595504570042905e-07, "loss": 0.1119, "step": 70030 }, { "epoch": 1.6330797703360438, "grad_norm": 1.7754616737365723, "learning_rate": 4.558773238823602e-07, "loss": 0.1057, "step": 70040 }, { "epoch": 1.6333129317128616, "grad_norm": 1.6988086700439453, "learning_rate": 4.5579960206429147e-07, "loss": 0.0998, "step": 70050 }, { "epoch": 1.6335460930896797, "grad_norm": 1.266814947128296, "learning_rate": 4.5572188024622273e-07, "loss": 0.1143, "step": 70060 }, { "epoch": 1.6337792544664977, "grad_norm": 1.277779459953308, "learning_rate": 4.5564415842815394e-07, "loss": 0.0986, "step": 70070 }, { "epoch": 1.6340124158433156, "grad_norm": 1.2741596698760986, "learning_rate": 4.5556643661008515e-07, "loss": 0.1101, "step": 70080 }, { "epoch": 1.6342455772201334, "grad_norm": 1.2954684495925903, "learning_rate": 4.554887147920164e-07, "loss": 0.1085, "step": 70090 }, { "epoch": 1.6344787385969513, "grad_norm": 1.3624229431152344, "learning_rate": 4.554109929739476e-07, "loss": 0.1017, "step": 70100 }, { "epoch": 1.6347118999737693, "grad_norm": 2.5524678230285645, "learning_rate": 4.553332711558789e-07, "loss": 0.1141, "step": 70110 }, { "epoch": 1.6349450613505874, "grad_norm": 1.7184187173843384, "learning_rate": 4.552555493378101e-07, "loss": 0.1002, "step": 70120 }, { "epoch": 1.6351782227274052, "grad_norm": 
1.201096534729004, "learning_rate": 4.551778275197413e-07, "loss": 0.1077, "step": 70130 }, { "epoch": 1.635411384104223, "grad_norm": 2.23122501373291, "learning_rate": 4.5510010570167257e-07, "loss": 0.107, "step": 70140 }, { "epoch": 1.635644545481041, "grad_norm": 2.16243052482605, "learning_rate": 4.5502238388360383e-07, "loss": 0.1064, "step": 70150 }, { "epoch": 1.6358777068578592, "grad_norm": 1.8141769170761108, "learning_rate": 4.54944662065535e-07, "loss": 0.107, "step": 70160 }, { "epoch": 1.636110868234677, "grad_norm": 1.9961572885513306, "learning_rate": 4.5486694024746625e-07, "loss": 0.108, "step": 70170 }, { "epoch": 1.6363440296114948, "grad_norm": 2.1799428462982178, "learning_rate": 4.547892184293975e-07, "loss": 0.1089, "step": 70180 }, { "epoch": 1.6365771909883127, "grad_norm": 2.390254020690918, "learning_rate": 4.547114966113287e-07, "loss": 0.092, "step": 70190 }, { "epoch": 1.6368103523651307, "grad_norm": 2.4161787033081055, "learning_rate": 4.5463377479325993e-07, "loss": 0.1124, "step": 70200 }, { "epoch": 1.6370435137419488, "grad_norm": 2.0812060832977295, "learning_rate": 4.545560529751912e-07, "loss": 0.1024, "step": 70210 }, { "epoch": 1.6372766751187666, "grad_norm": 1.1057604551315308, "learning_rate": 4.544783311571224e-07, "loss": 0.1109, "step": 70220 }, { "epoch": 1.6375098364955845, "grad_norm": 2.04935884475708, "learning_rate": 4.5440060933905366e-07, "loss": 0.1125, "step": 70230 }, { "epoch": 1.6377429978724023, "grad_norm": 1.6461511850357056, "learning_rate": 4.5432288752098487e-07, "loss": 0.109, "step": 70240 }, { "epoch": 1.6379761592492204, "grad_norm": 2.7405078411102295, "learning_rate": 4.542451657029161e-07, "loss": 0.1061, "step": 70250 }, { "epoch": 1.6382093206260384, "grad_norm": 2.4207639694213867, "learning_rate": 4.5416744388484734e-07, "loss": 0.1101, "step": 70260 }, { "epoch": 1.6384424820028562, "grad_norm": 1.7145272493362427, "learning_rate": 4.540897220667786e-07, "loss": 0.1088, "step": 70270 
}, { "epoch": 1.638675643379674, "grad_norm": 2.3335089683532715, "learning_rate": 4.5401200024870976e-07, "loss": 0.0906, "step": 70280 }, { "epoch": 1.638908804756492, "grad_norm": 1.1664159297943115, "learning_rate": 4.53934278430641e-07, "loss": 0.1145, "step": 70290 }, { "epoch": 1.63914196613331, "grad_norm": 2.135991334915161, "learning_rate": 4.538565566125723e-07, "loss": 0.0946, "step": 70300 }, { "epoch": 1.639375127510128, "grad_norm": 1.3392064571380615, "learning_rate": 4.537788347945035e-07, "loss": 0.0999, "step": 70310 }, { "epoch": 1.6396082888869459, "grad_norm": 1.3921887874603271, "learning_rate": 4.537011129764347e-07, "loss": 0.1021, "step": 70320 }, { "epoch": 1.6398414502637637, "grad_norm": 1.7826125621795654, "learning_rate": 4.5362339115836597e-07, "loss": 0.1031, "step": 70330 }, { "epoch": 1.6400746116405818, "grad_norm": 1.5798838138580322, "learning_rate": 4.535456693402972e-07, "loss": 0.116, "step": 70340 }, { "epoch": 1.6403077730173998, "grad_norm": 1.1202572584152222, "learning_rate": 4.5346794752222844e-07, "loss": 0.097, "step": 70350 }, { "epoch": 1.6405409343942177, "grad_norm": 1.2218939065933228, "learning_rate": 4.5339022570415965e-07, "loss": 0.1013, "step": 70360 }, { "epoch": 1.6407740957710355, "grad_norm": 2.1514861583709717, "learning_rate": 4.5331250388609086e-07, "loss": 0.1025, "step": 70370 }, { "epoch": 1.6410072571478533, "grad_norm": 1.8363436460494995, "learning_rate": 4.532347820680221e-07, "loss": 0.1189, "step": 70380 }, { "epoch": 1.6412404185246714, "grad_norm": 1.5872009992599487, "learning_rate": 4.531570602499534e-07, "loss": 0.1031, "step": 70390 }, { "epoch": 1.6414735799014895, "grad_norm": 1.8064125776290894, "learning_rate": 4.5307933843188454e-07, "loss": 0.1023, "step": 70400 }, { "epoch": 1.6417067412783073, "grad_norm": 2.0496058464050293, "learning_rate": 4.530016166138158e-07, "loss": 0.1031, "step": 70410 }, { "epoch": 1.6419399026551251, "grad_norm": 1.5054304599761963, "learning_rate": 
4.5292389479574706e-07, "loss": 0.1018, "step": 70420 }, { "epoch": 1.642173064031943, "grad_norm": 2.7296619415283203, "learning_rate": 4.528461729776783e-07, "loss": 0.1065, "step": 70430 }, { "epoch": 1.642406225408761, "grad_norm": 1.34160315990448, "learning_rate": 4.527684511596095e-07, "loss": 0.1064, "step": 70440 }, { "epoch": 1.642639386785579, "grad_norm": 1.670505404472351, "learning_rate": 4.5269072934154074e-07, "loss": 0.1259, "step": 70450 }, { "epoch": 1.642872548162397, "grad_norm": 2.5534918308258057, "learning_rate": 4.52613007523472e-07, "loss": 0.1124, "step": 70460 }, { "epoch": 1.6431057095392148, "grad_norm": 2.2734286785125732, "learning_rate": 4.525352857054032e-07, "loss": 0.1033, "step": 70470 }, { "epoch": 1.6433388709160328, "grad_norm": 1.6783521175384521, "learning_rate": 4.524575638873344e-07, "loss": 0.113, "step": 70480 }, { "epoch": 1.6435720322928506, "grad_norm": 2.764122486114502, "learning_rate": 4.523798420692657e-07, "loss": 0.1067, "step": 70490 }, { "epoch": 1.6438051936696687, "grad_norm": 1.397057056427002, "learning_rate": 4.523021202511969e-07, "loss": 0.101, "step": 70500 }, { "epoch": 1.6440383550464865, "grad_norm": 2.972811460494995, "learning_rate": 4.5222439843312815e-07, "loss": 0.1103, "step": 70510 }, { "epoch": 1.6442715164233044, "grad_norm": 1.7475005388259888, "learning_rate": 4.5214667661505936e-07, "loss": 0.12, "step": 70520 }, { "epoch": 1.6445046778001224, "grad_norm": 1.151753544807434, "learning_rate": 4.5206895479699057e-07, "loss": 0.1032, "step": 70530 }, { "epoch": 1.6447378391769405, "grad_norm": 1.4441790580749512, "learning_rate": 4.5199123297892184e-07, "loss": 0.111, "step": 70540 }, { "epoch": 1.6449710005537583, "grad_norm": 1.3263943195343018, "learning_rate": 4.519135111608531e-07, "loss": 0.1099, "step": 70550 }, { "epoch": 1.6452041619305762, "grad_norm": 4.36986780166626, "learning_rate": 4.5183578934278425e-07, "loss": 0.1314, "step": 70560 }, { "epoch": 1.645437323307394, 
"grad_norm": 1.339924693107605, "learning_rate": 4.517580675247155e-07, "loss": 0.0984, "step": 70570 }, { "epoch": 1.645670484684212, "grad_norm": 1.3200234174728394, "learning_rate": 4.516803457066468e-07, "loss": 0.1058, "step": 70580 }, { "epoch": 1.6459036460610301, "grad_norm": 1.9751765727996826, "learning_rate": 4.51602623888578e-07, "loss": 0.1012, "step": 70590 }, { "epoch": 1.646136807437848, "grad_norm": 1.4195512533187866, "learning_rate": 4.515249020705092e-07, "loss": 0.1013, "step": 70600 }, { "epoch": 1.6463699688146658, "grad_norm": 1.6852692365646362, "learning_rate": 4.5144718025244046e-07, "loss": 0.1121, "step": 70610 }, { "epoch": 1.6466031301914836, "grad_norm": 1.3814066648483276, "learning_rate": 4.5136945843437167e-07, "loss": 0.105, "step": 70620 }, { "epoch": 1.6468362915683017, "grad_norm": 1.4297856092453003, "learning_rate": 4.5129173661630293e-07, "loss": 0.1004, "step": 70630 }, { "epoch": 1.6470694529451197, "grad_norm": 2.808270215988159, "learning_rate": 4.5121401479823414e-07, "loss": 0.0986, "step": 70640 }, { "epoch": 1.6473026143219376, "grad_norm": 2.1313674449920654, "learning_rate": 4.5113629298016535e-07, "loss": 0.1003, "step": 70650 }, { "epoch": 1.6475357756987554, "grad_norm": 2.0119073390960693, "learning_rate": 4.510585711620966e-07, "loss": 0.1085, "step": 70660 }, { "epoch": 1.6477689370755735, "grad_norm": 1.7971059083938599, "learning_rate": 4.5098084934402787e-07, "loss": 0.1007, "step": 70670 }, { "epoch": 1.6480020984523913, "grad_norm": 1.5907526016235352, "learning_rate": 4.5090312752595903e-07, "loss": 0.1007, "step": 70680 }, { "epoch": 1.6482352598292094, "grad_norm": 1.805198073387146, "learning_rate": 4.508254057078903e-07, "loss": 0.1094, "step": 70690 }, { "epoch": 1.6484684212060272, "grad_norm": 1.801368236541748, "learning_rate": 4.5074768388982155e-07, "loss": 0.102, "step": 70700 }, { "epoch": 1.648701582582845, "grad_norm": 1.328190565109253, "learning_rate": 4.5066996207175276e-07, "loss": 
0.1113, "step": 70710 }, { "epoch": 1.648934743959663, "grad_norm": 2.1829795837402344, "learning_rate": 4.5059224025368397e-07, "loss": 0.1125, "step": 70720 }, { "epoch": 1.6491679053364812, "grad_norm": 1.3197089433670044, "learning_rate": 4.5051451843561523e-07, "loss": 0.1168, "step": 70730 }, { "epoch": 1.649401066713299, "grad_norm": 1.4150073528289795, "learning_rate": 4.5043679661754644e-07, "loss": 0.1031, "step": 70740 }, { "epoch": 1.6496342280901168, "grad_norm": 2.1256699562072754, "learning_rate": 4.503590747994777e-07, "loss": 0.1111, "step": 70750 }, { "epoch": 1.6498673894669347, "grad_norm": 1.1502879858016968, "learning_rate": 4.502813529814089e-07, "loss": 0.1072, "step": 70760 }, { "epoch": 1.6501005508437527, "grad_norm": 1.4587665796279907, "learning_rate": 4.502036311633401e-07, "loss": 0.1054, "step": 70770 }, { "epoch": 1.6503337122205708, "grad_norm": 2.0114657878875732, "learning_rate": 4.501259093452714e-07, "loss": 0.1065, "step": 70780 }, { "epoch": 1.6505668735973886, "grad_norm": 1.8849512338638306, "learning_rate": 4.5004818752720265e-07, "loss": 0.1173, "step": 70790 }, { "epoch": 1.6508000349742065, "grad_norm": 2.069772720336914, "learning_rate": 4.499704657091338e-07, "loss": 0.1137, "step": 70800 }, { "epoch": 1.6510331963510243, "grad_norm": 1.2747715711593628, "learning_rate": 4.4989274389106507e-07, "loss": 0.1074, "step": 70810 }, { "epoch": 1.6512663577278424, "grad_norm": 1.5847811698913574, "learning_rate": 4.4981502207299633e-07, "loss": 0.1152, "step": 70820 }, { "epoch": 1.6514995191046604, "grad_norm": 1.5106874704360962, "learning_rate": 4.497373002549276e-07, "loss": 0.1066, "step": 70830 }, { "epoch": 1.6517326804814783, "grad_norm": 1.5541998147964478, "learning_rate": 4.4965957843685875e-07, "loss": 0.1143, "step": 70840 }, { "epoch": 1.651965841858296, "grad_norm": 1.672301173210144, "learning_rate": 4.4958185661879e-07, "loss": 0.1057, "step": 70850 }, { "epoch": 1.6521990032351141, "grad_norm": 
1.7733972072601318, "learning_rate": 4.4950413480072127e-07, "loss": 0.0993, "step": 70860 }, { "epoch": 1.652432164611932, "grad_norm": 1.1464216709136963, "learning_rate": 4.494264129826525e-07, "loss": 0.1066, "step": 70870 }, { "epoch": 1.65266532598875, "grad_norm": 1.8646481037139893, "learning_rate": 4.493486911645837e-07, "loss": 0.117, "step": 70880 }, { "epoch": 1.6528984873655679, "grad_norm": 1.172478437423706, "learning_rate": 4.4927096934651495e-07, "loss": 0.1012, "step": 70890 }, { "epoch": 1.6531316487423857, "grad_norm": 2.232576608657837, "learning_rate": 4.4919324752844616e-07, "loss": 0.1176, "step": 70900 }, { "epoch": 1.6533648101192038, "grad_norm": 2.2210917472839355, "learning_rate": 4.491155257103774e-07, "loss": 0.1007, "step": 70910 }, { "epoch": 1.6535979714960218, "grad_norm": 3.2131130695343018, "learning_rate": 4.4903780389230863e-07, "loss": 0.1106, "step": 70920 }, { "epoch": 1.6538311328728397, "grad_norm": 1.536812424659729, "learning_rate": 4.4896008207423984e-07, "loss": 0.1009, "step": 70930 }, { "epoch": 1.6540642942496575, "grad_norm": 1.4830352067947388, "learning_rate": 4.488823602561711e-07, "loss": 0.1077, "step": 70940 }, { "epoch": 1.6542974556264753, "grad_norm": 1.279786467552185, "learning_rate": 4.4880463843810237e-07, "loss": 0.1056, "step": 70950 }, { "epoch": 1.6545306170032934, "grad_norm": 1.6255909204483032, "learning_rate": 4.487269166200335e-07, "loss": 0.1085, "step": 70960 }, { "epoch": 1.6547637783801115, "grad_norm": 1.7349194288253784, "learning_rate": 4.486491948019648e-07, "loss": 0.0957, "step": 70970 }, { "epoch": 1.6549969397569293, "grad_norm": 1.24934720993042, "learning_rate": 4.4857147298389605e-07, "loss": 0.1037, "step": 70980 }, { "epoch": 1.6552301011337471, "grad_norm": 1.5735772848129272, "learning_rate": 4.4849375116582726e-07, "loss": 0.0944, "step": 70990 }, { "epoch": 1.655463262510565, "grad_norm": 1.5065572261810303, "learning_rate": 4.4841602934775846e-07, "loss": 0.1081, "step": 
71000 }, { "epoch": 1.655696423887383, "grad_norm": 3.3349552154541016, "learning_rate": 4.4833830752968973e-07, "loss": 0.1069, "step": 71010 }, { "epoch": 1.655929585264201, "grad_norm": 1.383182168006897, "learning_rate": 4.4826058571162094e-07, "loss": 0.1085, "step": 71020 }, { "epoch": 1.656162746641019, "grad_norm": 1.1827986240386963, "learning_rate": 4.481828638935522e-07, "loss": 0.1101, "step": 71030 }, { "epoch": 1.6563959080178368, "grad_norm": 2.4210948944091797, "learning_rate": 4.481051420754834e-07, "loss": 0.1043, "step": 71040 }, { "epoch": 1.6566290693946548, "grad_norm": 1.3602627515792847, "learning_rate": 4.480274202574146e-07, "loss": 0.1128, "step": 71050 }, { "epoch": 1.6568622307714727, "grad_norm": 1.3582792282104492, "learning_rate": 4.479496984393459e-07, "loss": 0.1109, "step": 71060 }, { "epoch": 1.6570953921482907, "grad_norm": 1.3040035963058472, "learning_rate": 4.4787197662127714e-07, "loss": 0.1256, "step": 71070 }, { "epoch": 1.6573285535251085, "grad_norm": 1.2263416051864624, "learning_rate": 4.477942548032083e-07, "loss": 0.1167, "step": 71080 }, { "epoch": 1.6575617149019264, "grad_norm": 2.007521867752075, "learning_rate": 4.4771653298513956e-07, "loss": 0.1148, "step": 71090 }, { "epoch": 1.6577948762787444, "grad_norm": 1.4960299730300903, "learning_rate": 4.476388111670708e-07, "loss": 0.1101, "step": 71100 }, { "epoch": 1.6580280376555625, "grad_norm": 1.9421859979629517, "learning_rate": 4.4756108934900203e-07, "loss": 0.1042, "step": 71110 }, { "epoch": 1.6582611990323803, "grad_norm": 4.612299919128418, "learning_rate": 4.4748336753093324e-07, "loss": 0.098, "step": 71120 }, { "epoch": 1.6584943604091982, "grad_norm": 1.4448659420013428, "learning_rate": 4.474056457128645e-07, "loss": 0.0996, "step": 71130 }, { "epoch": 1.658727521786016, "grad_norm": 1.2029331922531128, "learning_rate": 4.473279238947957e-07, "loss": 0.113, "step": 71140 }, { "epoch": 1.658960683162834, "grad_norm": 1.1134552955627441, 
"learning_rate": 4.47250202076727e-07, "loss": 0.1038, "step": 71150 }, { "epoch": 1.6591938445396521, "grad_norm": 1.6572896242141724, "learning_rate": 4.471724802586582e-07, "loss": 0.1128, "step": 71160 }, { "epoch": 1.65942700591647, "grad_norm": 1.7156819105148315, "learning_rate": 4.470947584405894e-07, "loss": 0.1166, "step": 71170 }, { "epoch": 1.6596601672932878, "grad_norm": 1.3106516599655151, "learning_rate": 4.4701703662252065e-07, "loss": 0.1086, "step": 71180 }, { "epoch": 1.6598933286701056, "grad_norm": 2.158043146133423, "learning_rate": 4.469393148044519e-07, "loss": 0.1076, "step": 71190 }, { "epoch": 1.6601264900469237, "grad_norm": 1.3637505769729614, "learning_rate": 4.4686159298638307e-07, "loss": 0.1051, "step": 71200 }, { "epoch": 1.6603596514237418, "grad_norm": 1.172257900238037, "learning_rate": 4.4678387116831433e-07, "loss": 0.105, "step": 71210 }, { "epoch": 1.6605928128005596, "grad_norm": 2.482283115386963, "learning_rate": 4.467061493502456e-07, "loss": 0.0964, "step": 71220 }, { "epoch": 1.6608259741773774, "grad_norm": 2.004554510116577, "learning_rate": 4.466284275321768e-07, "loss": 0.1037, "step": 71230 }, { "epoch": 1.6610591355541955, "grad_norm": 1.6113288402557373, "learning_rate": 4.46550705714108e-07, "loss": 0.098, "step": 71240 }, { "epoch": 1.6612922969310133, "grad_norm": 1.1713218688964844, "learning_rate": 4.464729838960393e-07, "loss": 0.1007, "step": 71250 }, { "epoch": 1.6615254583078314, "grad_norm": 2.081048011779785, "learning_rate": 4.4639526207797054e-07, "loss": 0.1075, "step": 71260 }, { "epoch": 1.6617586196846492, "grad_norm": 1.1194753646850586, "learning_rate": 4.4631754025990175e-07, "loss": 0.0996, "step": 71270 }, { "epoch": 1.661991781061467, "grad_norm": 1.869719386100769, "learning_rate": 4.4623981844183296e-07, "loss": 0.1153, "step": 71280 }, { "epoch": 1.6622249424382851, "grad_norm": 1.3312703371047974, "learning_rate": 4.461620966237642e-07, "loss": 0.1064, "step": 71290 }, { "epoch": 
1.6624581038151032, "grad_norm": 1.3159016370773315, "learning_rate": 4.4608437480569543e-07, "loss": 0.1168, "step": 71300 }, { "epoch": 1.662691265191921, "grad_norm": 2.1438333988189697, "learning_rate": 4.460066529876267e-07, "loss": 0.1117, "step": 71310 }, { "epoch": 1.6629244265687388, "grad_norm": 1.898714542388916, "learning_rate": 4.459289311695579e-07, "loss": 0.1154, "step": 71320 }, { "epoch": 1.6631575879455567, "grad_norm": 1.0577887296676636, "learning_rate": 4.458512093514891e-07, "loss": 0.1074, "step": 71330 }, { "epoch": 1.6633907493223747, "grad_norm": 0.9011525511741638, "learning_rate": 4.4577348753342037e-07, "loss": 0.0979, "step": 71340 }, { "epoch": 1.6636239106991928, "grad_norm": 1.2911450862884521, "learning_rate": 4.4569576571535163e-07, "loss": 0.1044, "step": 71350 }, { "epoch": 1.6638570720760106, "grad_norm": 1.2422219514846802, "learning_rate": 4.456180438972828e-07, "loss": 0.1105, "step": 71360 }, { "epoch": 1.6640902334528285, "grad_norm": 1.163221836090088, "learning_rate": 4.4554032207921405e-07, "loss": 0.0948, "step": 71370 }, { "epoch": 1.6643233948296463, "grad_norm": 0.9396967887878418, "learning_rate": 4.454626002611453e-07, "loss": 0.1033, "step": 71380 }, { "epoch": 1.6645565562064644, "grad_norm": 1.9898325204849243, "learning_rate": 4.453848784430765e-07, "loss": 0.1058, "step": 71390 }, { "epoch": 1.6647897175832824, "grad_norm": 1.4069658517837524, "learning_rate": 4.4530715662500773e-07, "loss": 0.1041, "step": 71400 }, { "epoch": 1.6650228789601003, "grad_norm": 2.7698135375976562, "learning_rate": 4.45229434806939e-07, "loss": 0.0872, "step": 71410 }, { "epoch": 1.665256040336918, "grad_norm": 3.6345410346984863, "learning_rate": 4.451517129888702e-07, "loss": 0.1071, "step": 71420 }, { "epoch": 1.6654892017137362, "grad_norm": 1.3191447257995605, "learning_rate": 4.4507399117080147e-07, "loss": 0.1047, "step": 71430 }, { "epoch": 1.6657223630905542, "grad_norm": 1.216135025024414, "learning_rate": 
4.449962693527327e-07, "loss": 0.1162, "step": 71440 }, { "epoch": 1.665955524467372, "grad_norm": 1.6756800413131714, "learning_rate": 4.449185475346639e-07, "loss": 0.1014, "step": 71450 }, { "epoch": 1.6661886858441899, "grad_norm": 1.265684723854065, "learning_rate": 4.4484082571659515e-07, "loss": 0.1105, "step": 71460 }, { "epoch": 1.6664218472210077, "grad_norm": 1.811397671699524, "learning_rate": 4.447631038985264e-07, "loss": 0.1163, "step": 71470 }, { "epoch": 1.6666550085978258, "grad_norm": 3.0026662349700928, "learning_rate": 4.4468538208045757e-07, "loss": 0.1049, "step": 71480 }, { "epoch": 1.6668881699746438, "grad_norm": 1.3745654821395874, "learning_rate": 4.4460766026238883e-07, "loss": 0.1116, "step": 71490 }, { "epoch": 1.6671213313514617, "grad_norm": 2.7892065048217773, "learning_rate": 4.445299384443201e-07, "loss": 0.0963, "step": 71500 }, { "epoch": 1.6673544927282795, "grad_norm": 2.863764762878418, "learning_rate": 4.444522166262513e-07, "loss": 0.1005, "step": 71510 }, { "epoch": 1.6675876541050974, "grad_norm": 1.7052680253982544, "learning_rate": 4.443744948081825e-07, "loss": 0.0968, "step": 71520 }, { "epoch": 1.6678208154819154, "grad_norm": 1.4375524520874023, "learning_rate": 4.4429677299011377e-07, "loss": 0.1107, "step": 71530 }, { "epoch": 1.6680539768587335, "grad_norm": 1.511561632156372, "learning_rate": 4.44219051172045e-07, "loss": 0.1045, "step": 71540 }, { "epoch": 1.6682871382355513, "grad_norm": 3.288654327392578, "learning_rate": 4.4414132935397624e-07, "loss": 0.1101, "step": 71550 }, { "epoch": 1.6685202996123691, "grad_norm": 1.3257739543914795, "learning_rate": 4.4406360753590745e-07, "loss": 0.1126, "step": 71560 }, { "epoch": 1.668753460989187, "grad_norm": 2.5430188179016113, "learning_rate": 4.4398588571783866e-07, "loss": 0.1073, "step": 71570 }, { "epoch": 1.668986622366005, "grad_norm": 2.3211348056793213, "learning_rate": 4.439081638997699e-07, "loss": 0.0993, "step": 71580 }, { "epoch": 
1.669219783742823, "grad_norm": 1.3001441955566406, "learning_rate": 4.438304420817012e-07, "loss": 0.1007, "step": 71590 }, { "epoch": 1.669452945119641, "grad_norm": 1.6168540716171265, "learning_rate": 4.4375272026363234e-07, "loss": 0.1045, "step": 71600 }, { "epoch": 1.6696861064964588, "grad_norm": 1.2848376035690308, "learning_rate": 4.436749984455636e-07, "loss": 0.0951, "step": 71610 }, { "epoch": 1.6699192678732768, "grad_norm": 1.5435880422592163, "learning_rate": 4.4359727662749487e-07, "loss": 0.1149, "step": 71620 }, { "epoch": 1.6701524292500949, "grad_norm": 1.4065402746200562, "learning_rate": 4.435195548094261e-07, "loss": 0.098, "step": 71630 }, { "epoch": 1.6703855906269127, "grad_norm": 1.9426355361938477, "learning_rate": 4.434418329913573e-07, "loss": 0.1123, "step": 71640 }, { "epoch": 1.6706187520037306, "grad_norm": 1.630222201347351, "learning_rate": 4.4336411117328855e-07, "loss": 0.1122, "step": 71650 }, { "epoch": 1.6708519133805484, "grad_norm": 1.849568247795105, "learning_rate": 4.4328638935521976e-07, "loss": 0.0962, "step": 71660 }, { "epoch": 1.6710850747573665, "grad_norm": 1.5697983503341675, "learning_rate": 4.43208667537151e-07, "loss": 0.1135, "step": 71670 }, { "epoch": 1.6713182361341845, "grad_norm": 1.8397878408432007, "learning_rate": 4.431309457190822e-07, "loss": 0.1013, "step": 71680 }, { "epoch": 1.6715513975110023, "grad_norm": 2.08483624458313, "learning_rate": 4.4305322390101344e-07, "loss": 0.114, "step": 71690 }, { "epoch": 1.6717845588878202, "grad_norm": 3.3873679637908936, "learning_rate": 4.429755020829447e-07, "loss": 0.1074, "step": 71700 }, { "epoch": 1.672017720264638, "grad_norm": 1.7289446592330933, "learning_rate": 4.4289778026487596e-07, "loss": 0.113, "step": 71710 }, { "epoch": 1.672250881641456, "grad_norm": 1.3411002159118652, "learning_rate": 4.4282005844680717e-07, "loss": 0.1022, "step": 71720 }, { "epoch": 1.6724840430182741, "grad_norm": 1.1405028104782104, "learning_rate": 
4.427423366287384e-07, "loss": 0.1042, "step": 71730 }, { "epoch": 1.672717204395092, "grad_norm": 1.1719890832901, "learning_rate": 4.4266461481066964e-07, "loss": 0.1134, "step": 71740 }, { "epoch": 1.6729503657719098, "grad_norm": 1.4736586809158325, "learning_rate": 4.425868929926009e-07, "loss": 0.1153, "step": 71750 }, { "epoch": 1.6731835271487279, "grad_norm": 1.3682429790496826, "learning_rate": 4.4250917117453206e-07, "loss": 0.1066, "step": 71760 }, { "epoch": 1.6734166885255457, "grad_norm": 2.3200788497924805, "learning_rate": 4.424314493564633e-07, "loss": 0.1163, "step": 71770 }, { "epoch": 1.6736498499023638, "grad_norm": 1.5931111574172974, "learning_rate": 4.423537275383946e-07, "loss": 0.1057, "step": 71780 }, { "epoch": 1.6738830112791816, "grad_norm": 1.551906704902649, "learning_rate": 4.422760057203258e-07, "loss": 0.1017, "step": 71790 }, { "epoch": 1.6741161726559994, "grad_norm": 1.5586591958999634, "learning_rate": 4.42198283902257e-07, "loss": 0.116, "step": 71800 }, { "epoch": 1.6743493340328175, "grad_norm": 2.4894769191741943, "learning_rate": 4.4212056208418826e-07, "loss": 0.0996, "step": 71810 }, { "epoch": 1.6745824954096356, "grad_norm": 1.3108855485916138, "learning_rate": 4.4204284026611947e-07, "loss": 0.1088, "step": 71820 }, { "epoch": 1.6748156567864534, "grad_norm": 1.5222022533416748, "learning_rate": 4.4196511844805074e-07, "loss": 0.1116, "step": 71830 }, { "epoch": 1.6750488181632712, "grad_norm": 2.804331064224243, "learning_rate": 4.4188739662998194e-07, "loss": 0.1112, "step": 71840 }, { "epoch": 1.675281979540089, "grad_norm": 1.7223730087280273, "learning_rate": 4.4180967481191315e-07, "loss": 0.1154, "step": 71850 }, { "epoch": 1.6755151409169071, "grad_norm": 2.547168493270874, "learning_rate": 4.417319529938444e-07, "loss": 0.1127, "step": 71860 }, { "epoch": 1.6757483022937252, "grad_norm": 2.9768764972686768, "learning_rate": 4.416542311757757e-07, "loss": 0.1055, "step": 71870 }, { "epoch": 
1.675981463670543, "grad_norm": 1.9161301851272583, "learning_rate": 4.4157650935770683e-07, "loss": 0.1091, "step": 71880 }, { "epoch": 1.6762146250473609, "grad_norm": 1.7659153938293457, "learning_rate": 4.414987875396381e-07, "loss": 0.1063, "step": 71890 }, { "epoch": 1.6764477864241787, "grad_norm": 1.30110502243042, "learning_rate": 4.4142106572156936e-07, "loss": 0.0991, "step": 71900 }, { "epoch": 1.6766809478009967, "grad_norm": 1.7976195812225342, "learning_rate": 4.4134334390350057e-07, "loss": 0.1032, "step": 71910 }, { "epoch": 1.6769141091778148, "grad_norm": 3.2363228797912598, "learning_rate": 4.412656220854318e-07, "loss": 0.1212, "step": 71920 }, { "epoch": 1.6771472705546326, "grad_norm": 1.5433653593063354, "learning_rate": 4.4118790026736304e-07, "loss": 0.1182, "step": 71930 }, { "epoch": 1.6773804319314505, "grad_norm": 2.621002674102783, "learning_rate": 4.4111017844929425e-07, "loss": 0.112, "step": 71940 }, { "epoch": 1.6776135933082685, "grad_norm": 3.029512643814087, "learning_rate": 4.410324566312255e-07, "loss": 0.1141, "step": 71950 }, { "epoch": 1.6778467546850864, "grad_norm": 1.2731022834777832, "learning_rate": 4.409547348131567e-07, "loss": 0.1021, "step": 71960 }, { "epoch": 1.6780799160619044, "grad_norm": 1.3570947647094727, "learning_rate": 4.4087701299508793e-07, "loss": 0.1118, "step": 71970 }, { "epoch": 1.6783130774387223, "grad_norm": 1.4402037858963013, "learning_rate": 4.407992911770192e-07, "loss": 0.1029, "step": 71980 }, { "epoch": 1.67854623881554, "grad_norm": 1.6072627305984497, "learning_rate": 4.4072156935895045e-07, "loss": 0.0993, "step": 71990 }, { "epoch": 1.6787794001923582, "grad_norm": 1.3014850616455078, "learning_rate": 4.406438475408816e-07, "loss": 0.1066, "step": 72000 }, { "epoch": 1.6790125615691762, "grad_norm": 2.0337700843811035, "learning_rate": 4.4056612572281287e-07, "loss": 0.1048, "step": 72010 }, { "epoch": 1.679245722945994, "grad_norm": 0.98380047082901, "learning_rate": 
4.4048840390474413e-07, "loss": 0.1001, "step": 72020 }, { "epoch": 1.679478884322812, "grad_norm": 1.6167705059051514, "learning_rate": 4.4041068208667534e-07, "loss": 0.1021, "step": 72030 }, { "epoch": 1.6797120456996297, "grad_norm": 2.8375847339630127, "learning_rate": 4.4033296026860655e-07, "loss": 0.1162, "step": 72040 }, { "epoch": 1.6799452070764478, "grad_norm": 1.5524688959121704, "learning_rate": 4.402552384505378e-07, "loss": 0.1007, "step": 72050 }, { "epoch": 1.6801783684532658, "grad_norm": 2.0088231563568115, "learning_rate": 4.40177516632469e-07, "loss": 0.1042, "step": 72060 }, { "epoch": 1.6804115298300837, "grad_norm": 2.8658409118652344, "learning_rate": 4.400997948144003e-07, "loss": 0.1176, "step": 72070 }, { "epoch": 1.6806446912069015, "grad_norm": 2.704442024230957, "learning_rate": 4.400220729963315e-07, "loss": 0.1083, "step": 72080 }, { "epoch": 1.6808778525837194, "grad_norm": 2.838975191116333, "learning_rate": 4.399443511782627e-07, "loss": 0.097, "step": 72090 }, { "epoch": 1.6811110139605374, "grad_norm": 2.2956995964050293, "learning_rate": 4.3986662936019397e-07, "loss": 0.1053, "step": 72100 }, { "epoch": 1.6813441753373555, "grad_norm": 2.7546920776367188, "learning_rate": 4.3978890754212523e-07, "loss": 0.1151, "step": 72110 }, { "epoch": 1.6815773367141733, "grad_norm": 1.8440221548080444, "learning_rate": 4.397111857240564e-07, "loss": 0.1137, "step": 72120 }, { "epoch": 1.6818104980909911, "grad_norm": 1.5845692157745361, "learning_rate": 4.3963346390598765e-07, "loss": 0.1003, "step": 72130 }, { "epoch": 1.6820436594678092, "grad_norm": 1.415692687034607, "learning_rate": 4.395557420879189e-07, "loss": 0.1092, "step": 72140 }, { "epoch": 1.682276820844627, "grad_norm": 2.380638837814331, "learning_rate": 4.3947802026985017e-07, "loss": 0.0968, "step": 72150 }, { "epoch": 1.682509982221445, "grad_norm": 1.650002121925354, "learning_rate": 4.3940029845178133e-07, "loss": 0.1099, "step": 72160 }, { "epoch": 
1.682743143598263, "grad_norm": 1.4139306545257568, "learning_rate": 4.393225766337126e-07, "loss": 0.1046, "step": 72170 }, { "epoch": 1.6829763049750808, "grad_norm": 1.5366188287734985, "learning_rate": 4.3924485481564385e-07, "loss": 0.1036, "step": 72180 }, { "epoch": 1.6832094663518988, "grad_norm": 1.2163333892822266, "learning_rate": 4.3916713299757506e-07, "loss": 0.1095, "step": 72190 }, { "epoch": 1.683442627728717, "grad_norm": 3.026930332183838, "learning_rate": 4.3908941117950627e-07, "loss": 0.1176, "step": 72200 }, { "epoch": 1.6836757891055347, "grad_norm": 1.2272096872329712, "learning_rate": 4.3901168936143753e-07, "loss": 0.1028, "step": 72210 }, { "epoch": 1.6839089504823526, "grad_norm": 2.729527711868286, "learning_rate": 4.3893396754336874e-07, "loss": 0.0992, "step": 72220 }, { "epoch": 1.6841421118591704, "grad_norm": 2.828812599182129, "learning_rate": 4.388562457253e-07, "loss": 0.1085, "step": 72230 }, { "epoch": 1.6843752732359885, "grad_norm": 1.8288962841033936, "learning_rate": 4.387785239072312e-07, "loss": 0.0987, "step": 72240 }, { "epoch": 1.6846084346128065, "grad_norm": 1.1628962755203247, "learning_rate": 4.387008020891624e-07, "loss": 0.1046, "step": 72250 }, { "epoch": 1.6848415959896244, "grad_norm": 1.1922309398651123, "learning_rate": 4.386230802710937e-07, "loss": 0.1103, "step": 72260 }, { "epoch": 1.6850747573664422, "grad_norm": 3.0299088954925537, "learning_rate": 4.3854535845302495e-07, "loss": 0.111, "step": 72270 }, { "epoch": 1.68530791874326, "grad_norm": 1.7609409093856812, "learning_rate": 4.384676366349561e-07, "loss": 0.1071, "step": 72280 }, { "epoch": 1.685541080120078, "grad_norm": 1.4974414110183716, "learning_rate": 4.3838991481688736e-07, "loss": 0.1115, "step": 72290 }, { "epoch": 1.6857742414968961, "grad_norm": 1.253353238105774, "learning_rate": 4.3831219299881863e-07, "loss": 0.1036, "step": 72300 }, { "epoch": 1.686007402873714, "grad_norm": 1.7461163997650146, "learning_rate": 
4.3823447118074984e-07, "loss": 0.1039, "step": 72310 }, { "epoch": 1.6862405642505318, "grad_norm": 2.2115087509155273, "learning_rate": 4.381567493626811e-07, "loss": 0.1051, "step": 72320 }, { "epoch": 1.6864737256273499, "grad_norm": 1.2261227369308472, "learning_rate": 4.380790275446123e-07, "loss": 0.1057, "step": 72330 }, { "epoch": 1.6867068870041677, "grad_norm": 1.5130265951156616, "learning_rate": 4.380013057265435e-07, "loss": 0.0957, "step": 72340 }, { "epoch": 1.6869400483809858, "grad_norm": 1.4996674060821533, "learning_rate": 4.379235839084748e-07, "loss": 0.1161, "step": 72350 }, { "epoch": 1.6871732097578036, "grad_norm": 1.4846553802490234, "learning_rate": 4.3784586209040604e-07, "loss": 0.1073, "step": 72360 }, { "epoch": 1.6874063711346214, "grad_norm": 1.3076670169830322, "learning_rate": 4.377681402723372e-07, "loss": 0.1003, "step": 72370 }, { "epoch": 1.6876395325114395, "grad_norm": 1.332086205482483, "learning_rate": 4.3769041845426846e-07, "loss": 0.1058, "step": 72380 }, { "epoch": 1.6878726938882576, "grad_norm": 2.4212002754211426, "learning_rate": 4.376126966361997e-07, "loss": 0.1038, "step": 72390 }, { "epoch": 1.6881058552650754, "grad_norm": 1.1730129718780518, "learning_rate": 4.3753497481813093e-07, "loss": 0.1027, "step": 72400 }, { "epoch": 1.6883390166418932, "grad_norm": 1.203163743019104, "learning_rate": 4.3745725300006214e-07, "loss": 0.1055, "step": 72410 }, { "epoch": 1.688572178018711, "grad_norm": 1.5036214590072632, "learning_rate": 4.373795311819934e-07, "loss": 0.1107, "step": 72420 }, { "epoch": 1.6888053393955291, "grad_norm": 1.315467357635498, "learning_rate": 4.373018093639246e-07, "loss": 0.1104, "step": 72430 }, { "epoch": 1.6890385007723472, "grad_norm": 1.6290708780288696, "learning_rate": 4.3722408754585587e-07, "loss": 0.1103, "step": 72440 }, { "epoch": 1.689271662149165, "grad_norm": 1.8478100299835205, "learning_rate": 4.371463657277871e-07, "loss": 0.1018, "step": 72450 }, { "epoch": 
1.6895048235259829, "grad_norm": 1.4921433925628662, "learning_rate": 4.370686439097183e-07, "loss": 0.1048, "step": 72460 }, { "epoch": 1.6897379849028007, "grad_norm": 1.4048700332641602, "learning_rate": 4.3699092209164955e-07, "loss": 0.1058, "step": 72470 }, { "epoch": 1.6899711462796188, "grad_norm": 1.8283097743988037, "learning_rate": 4.369132002735808e-07, "loss": 0.1034, "step": 72480 }, { "epoch": 1.6902043076564368, "grad_norm": 2.2922284603118896, "learning_rate": 4.3683547845551197e-07, "loss": 0.1102, "step": 72490 }, { "epoch": 1.6904374690332546, "grad_norm": 3.1078743934631348, "learning_rate": 4.3675775663744323e-07, "loss": 0.1109, "step": 72500 }, { "epoch": 1.6906706304100725, "grad_norm": 1.3088452816009521, "learning_rate": 4.366800348193745e-07, "loss": 0.1008, "step": 72510 }, { "epoch": 1.6909037917868905, "grad_norm": 1.9936519861221313, "learning_rate": 4.3660231300130576e-07, "loss": 0.106, "step": 72520 }, { "epoch": 1.6911369531637084, "grad_norm": 1.709848403930664, "learning_rate": 4.365245911832369e-07, "loss": 0.1067, "step": 72530 }, { "epoch": 1.6913701145405264, "grad_norm": 2.2323505878448486, "learning_rate": 4.364468693651682e-07, "loss": 0.0915, "step": 72540 }, { "epoch": 1.6916032759173443, "grad_norm": 1.5516188144683838, "learning_rate": 4.3636914754709944e-07, "loss": 0.108, "step": 72550 }, { "epoch": 1.691836437294162, "grad_norm": 2.1666934490203857, "learning_rate": 4.3629142572903065e-07, "loss": 0.1037, "step": 72560 }, { "epoch": 1.6920695986709802, "grad_norm": 1.1331377029418945, "learning_rate": 4.3621370391096186e-07, "loss": 0.0984, "step": 72570 }, { "epoch": 1.6923027600477982, "grad_norm": 1.382967472076416, "learning_rate": 4.361359820928931e-07, "loss": 0.1096, "step": 72580 }, { "epoch": 1.692535921424616, "grad_norm": 1.8169333934783936, "learning_rate": 4.3605826027482433e-07, "loss": 0.1078, "step": 72590 }, { "epoch": 1.692769082801434, "grad_norm": 1.5977461338043213, "learning_rate": 
4.359805384567556e-07, "loss": 0.1106, "step": 72600 }, { "epoch": 1.6930022441782517, "grad_norm": 2.979811668395996, "learning_rate": 4.359028166386868e-07, "loss": 0.1134, "step": 72610 }, { "epoch": 1.6932354055550698, "grad_norm": 1.2795424461364746, "learning_rate": 4.35825094820618e-07, "loss": 0.0921, "step": 72620 }, { "epoch": 1.6934685669318879, "grad_norm": 1.7859083414077759, "learning_rate": 4.3574737300254927e-07, "loss": 0.1111, "step": 72630 }, { "epoch": 1.6937017283087057, "grad_norm": 1.517374873161316, "learning_rate": 4.3566965118448053e-07, "loss": 0.107, "step": 72640 }, { "epoch": 1.6939348896855235, "grad_norm": 2.347257614135742, "learning_rate": 4.355919293664117e-07, "loss": 0.1082, "step": 72650 }, { "epoch": 1.6941680510623414, "grad_norm": 1.7262760400772095, "learning_rate": 4.3551420754834295e-07, "loss": 0.1052, "step": 72660 }, { "epoch": 1.6944012124391594, "grad_norm": 1.470744013786316, "learning_rate": 4.354364857302742e-07, "loss": 0.1058, "step": 72670 }, { "epoch": 1.6946343738159775, "grad_norm": 0.9846439957618713, "learning_rate": 4.353587639122054e-07, "loss": 0.1073, "step": 72680 }, { "epoch": 1.6948675351927953, "grad_norm": 1.709530234336853, "learning_rate": 4.3528104209413663e-07, "loss": 0.0961, "step": 72690 }, { "epoch": 1.6951006965696132, "grad_norm": 1.9755125045776367, "learning_rate": 4.352033202760679e-07, "loss": 0.111, "step": 72700 }, { "epoch": 1.6953338579464312, "grad_norm": 1.4087239503860474, "learning_rate": 4.351255984579991e-07, "loss": 0.1045, "step": 72710 }, { "epoch": 1.6955670193232493, "grad_norm": 1.4517325162887573, "learning_rate": 4.3504787663993037e-07, "loss": 0.0991, "step": 72720 }, { "epoch": 1.695800180700067, "grad_norm": 1.7983366250991821, "learning_rate": 4.349701548218616e-07, "loss": 0.1037, "step": 72730 }, { "epoch": 1.696033342076885, "grad_norm": 1.4275062084197998, "learning_rate": 4.348924330037928e-07, "loss": 0.1192, "step": 72740 }, { "epoch": 1.6962665034537028, 
"grad_norm": 1.2746983766555786, "learning_rate": 4.3481471118572405e-07, "loss": 0.1035, "step": 72750 }, { "epoch": 1.6964996648305208, "grad_norm": 1.6545300483703613, "learning_rate": 4.347369893676553e-07, "loss": 0.1149, "step": 72760 }, { "epoch": 1.696732826207339, "grad_norm": 0.9724535346031189, "learning_rate": 4.3465926754958647e-07, "loss": 0.0975, "step": 72770 }, { "epoch": 1.6969659875841567, "grad_norm": 1.5036413669586182, "learning_rate": 4.3458154573151773e-07, "loss": 0.0978, "step": 72780 }, { "epoch": 1.6971991489609746, "grad_norm": 1.4551302194595337, "learning_rate": 4.34503823913449e-07, "loss": 0.1224, "step": 72790 }, { "epoch": 1.6974323103377924, "grad_norm": 1.4668879508972168, "learning_rate": 4.344261020953802e-07, "loss": 0.1123, "step": 72800 }, { "epoch": 1.6976654717146105, "grad_norm": 2.042556047439575, "learning_rate": 4.343483802773114e-07, "loss": 0.1046, "step": 72810 }, { "epoch": 1.6978986330914285, "grad_norm": 1.2744297981262207, "learning_rate": 4.3427065845924267e-07, "loss": 0.0972, "step": 72820 }, { "epoch": 1.6981317944682464, "grad_norm": 1.4540187120437622, "learning_rate": 4.341929366411739e-07, "loss": 0.1055, "step": 72830 }, { "epoch": 1.6983649558450642, "grad_norm": 3.6487419605255127, "learning_rate": 4.3411521482310514e-07, "loss": 0.1125, "step": 72840 }, { "epoch": 1.698598117221882, "grad_norm": 2.257809638977051, "learning_rate": 4.3403749300503635e-07, "loss": 0.1059, "step": 72850 }, { "epoch": 1.6988312785987, "grad_norm": 1.6836472749710083, "learning_rate": 4.3395977118696756e-07, "loss": 0.1103, "step": 72860 }, { "epoch": 1.6990644399755181, "grad_norm": 1.291939377784729, "learning_rate": 4.338820493688988e-07, "loss": 0.1082, "step": 72870 }, { "epoch": 1.699297601352336, "grad_norm": 2.859013795852661, "learning_rate": 4.338043275508301e-07, "loss": 0.1097, "step": 72880 }, { "epoch": 1.6995307627291538, "grad_norm": 2.098142385482788, "learning_rate": 4.3372660573276124e-07, "loss": 
0.098, "step": 72890 }, { "epoch": 1.6997639241059719, "grad_norm": 1.3354519605636597, "learning_rate": 4.336488839146925e-07, "loss": 0.1033, "step": 72900 }, { "epoch": 1.69999708548279, "grad_norm": 1.504804015159607, "learning_rate": 4.3357116209662377e-07, "loss": 0.0978, "step": 72910 }, { "epoch": 1.7002302468596078, "grad_norm": 3.927147150039673, "learning_rate": 4.33493440278555e-07, "loss": 0.1203, "step": 72920 }, { "epoch": 1.7004634082364256, "grad_norm": 1.1079092025756836, "learning_rate": 4.334157184604862e-07, "loss": 0.0994, "step": 72930 }, { "epoch": 1.7006965696132434, "grad_norm": 2.774555206298828, "learning_rate": 4.3333799664241745e-07, "loss": 0.1127, "step": 72940 }, { "epoch": 1.7009297309900615, "grad_norm": 2.3160479068756104, "learning_rate": 4.332602748243487e-07, "loss": 0.1039, "step": 72950 }, { "epoch": 1.7011628923668796, "grad_norm": 1.558762788772583, "learning_rate": 4.331825530062799e-07, "loss": 0.0973, "step": 72960 }, { "epoch": 1.7013960537436974, "grad_norm": 1.2206361293792725, "learning_rate": 4.331048311882111e-07, "loss": 0.1015, "step": 72970 }, { "epoch": 1.7016292151205152, "grad_norm": 1.4983023405075073, "learning_rate": 4.330271093701424e-07, "loss": 0.1112, "step": 72980 }, { "epoch": 1.701862376497333, "grad_norm": 1.320002555847168, "learning_rate": 4.329493875520736e-07, "loss": 0.0996, "step": 72990 }, { "epoch": 1.7020955378741511, "grad_norm": 2.3485851287841797, "learning_rate": 4.3287166573400486e-07, "loss": 0.1007, "step": 73000 }, { "epoch": 1.7023286992509692, "grad_norm": 2.5176827907562256, "learning_rate": 4.3279394391593607e-07, "loss": 0.1062, "step": 73010 }, { "epoch": 1.702561860627787, "grad_norm": 1.5978151559829712, "learning_rate": 4.327162220978673e-07, "loss": 0.1149, "step": 73020 }, { "epoch": 1.7027950220046049, "grad_norm": 1.874725580215454, "learning_rate": 4.3263850027979854e-07, "loss": 0.1023, "step": 73030 }, { "epoch": 1.703028183381423, "grad_norm": 1.8335155248641968, 
"learning_rate": 4.325607784617298e-07, "loss": 0.0943, "step": 73040 }, { "epoch": 1.7032613447582408, "grad_norm": 1.0337344408035278, "learning_rate": 4.3248305664366096e-07, "loss": 0.0996, "step": 73050 }, { "epoch": 1.7034945061350588, "grad_norm": 1.6080917119979858, "learning_rate": 4.324053348255922e-07, "loss": 0.1004, "step": 73060 }, { "epoch": 1.7037276675118767, "grad_norm": 1.1862412691116333, "learning_rate": 4.323276130075235e-07, "loss": 0.0945, "step": 73070 }, { "epoch": 1.7039608288886945, "grad_norm": 2.327665328979492, "learning_rate": 4.322498911894547e-07, "loss": 0.1125, "step": 73080 }, { "epoch": 1.7041939902655125, "grad_norm": 2.262849807739258, "learning_rate": 4.321721693713859e-07, "loss": 0.1155, "step": 73090 }, { "epoch": 1.7044271516423306, "grad_norm": 1.5374855995178223, "learning_rate": 4.3209444755331716e-07, "loss": 0.1157, "step": 73100 }, { "epoch": 1.7046603130191484, "grad_norm": 1.7661911249160767, "learning_rate": 4.3201672573524837e-07, "loss": 0.1108, "step": 73110 }, { "epoch": 1.7048934743959663, "grad_norm": 1.8568910360336304, "learning_rate": 4.3193900391717964e-07, "loss": 0.1015, "step": 73120 }, { "epoch": 1.7051266357727841, "grad_norm": 1.1415889263153076, "learning_rate": 4.3186128209911084e-07, "loss": 0.1147, "step": 73130 }, { "epoch": 1.7053597971496022, "grad_norm": 1.2220187187194824, "learning_rate": 4.3178356028104205e-07, "loss": 0.1033, "step": 73140 }, { "epoch": 1.7055929585264202, "grad_norm": 2.6966590881347656, "learning_rate": 4.317058384629733e-07, "loss": 0.1193, "step": 73150 }, { "epoch": 1.705826119903238, "grad_norm": 2.6177592277526855, "learning_rate": 4.316281166449046e-07, "loss": 0.118, "step": 73160 }, { "epoch": 1.706059281280056, "grad_norm": 1.4949657917022705, "learning_rate": 4.3155039482683573e-07, "loss": 0.1187, "step": 73170 }, { "epoch": 1.7062924426568737, "grad_norm": 1.6703355312347412, "learning_rate": 4.31472673008767e-07, "loss": 0.1083, "step": 73180 }, { 
"epoch": 1.7065256040336918, "grad_norm": 1.430437684059143, "learning_rate": 4.3139495119069826e-07, "loss": 0.1023, "step": 73190 }, { "epoch": 1.7067587654105099, "grad_norm": 1.3510349988937378, "learning_rate": 4.3131722937262947e-07, "loss": 0.1048, "step": 73200 }, { "epoch": 1.7069919267873277, "grad_norm": 1.0364243984222412, "learning_rate": 4.312395075545607e-07, "loss": 0.0997, "step": 73210 }, { "epoch": 1.7072250881641455, "grad_norm": 1.2286068201065063, "learning_rate": 4.3116178573649194e-07, "loss": 0.1005, "step": 73220 }, { "epoch": 1.7074582495409636, "grad_norm": 1.095473289489746, "learning_rate": 4.3108406391842315e-07, "loss": 0.1086, "step": 73230 }, { "epoch": 1.7076914109177814, "grad_norm": 1.4812973737716675, "learning_rate": 4.310063421003544e-07, "loss": 0.1055, "step": 73240 }, { "epoch": 1.7079245722945995, "grad_norm": 2.8766117095947266, "learning_rate": 4.309286202822856e-07, "loss": 0.1149, "step": 73250 }, { "epoch": 1.7081577336714173, "grad_norm": 1.4790983200073242, "learning_rate": 4.3085089846421683e-07, "loss": 0.1029, "step": 73260 }, { "epoch": 1.7083908950482352, "grad_norm": 1.0316872596740723, "learning_rate": 4.307731766461481e-07, "loss": 0.1097, "step": 73270 }, { "epoch": 1.7086240564250532, "grad_norm": 2.5049960613250732, "learning_rate": 4.3069545482807935e-07, "loss": 0.1109, "step": 73280 }, { "epoch": 1.7088572178018713, "grad_norm": 1.3111093044281006, "learning_rate": 4.306177330100105e-07, "loss": 0.104, "step": 73290 }, { "epoch": 1.7090903791786891, "grad_norm": 1.5907319784164429, "learning_rate": 4.3054001119194177e-07, "loss": 0.1197, "step": 73300 }, { "epoch": 1.709323540555507, "grad_norm": 2.8198306560516357, "learning_rate": 4.3046228937387303e-07, "loss": 0.1052, "step": 73310 }, { "epoch": 1.7095567019323248, "grad_norm": 1.201341152191162, "learning_rate": 4.3038456755580424e-07, "loss": 0.1036, "step": 73320 }, { "epoch": 1.7097898633091428, "grad_norm": 1.9800996780395508, 
"learning_rate": 4.3030684573773545e-07, "loss": 0.1041, "step": 73330 }, { "epoch": 1.710023024685961, "grad_norm": 1.3161484003067017, "learning_rate": 4.302291239196667e-07, "loss": 0.1018, "step": 73340 }, { "epoch": 1.7102561860627787, "grad_norm": 1.5459327697753906, "learning_rate": 4.301514021015979e-07, "loss": 0.1021, "step": 73350 }, { "epoch": 1.7104893474395966, "grad_norm": 1.529059648513794, "learning_rate": 4.300736802835292e-07, "loss": 0.1066, "step": 73360 }, { "epoch": 1.7107225088164144, "grad_norm": 1.2883696556091309, "learning_rate": 4.299959584654604e-07, "loss": 0.1148, "step": 73370 }, { "epoch": 1.7109556701932325, "grad_norm": 1.2929340600967407, "learning_rate": 4.299182366473916e-07, "loss": 0.1154, "step": 73380 }, { "epoch": 1.7111888315700505, "grad_norm": 1.8052852153778076, "learning_rate": 4.2984051482932287e-07, "loss": 0.1005, "step": 73390 }, { "epoch": 1.7114219929468684, "grad_norm": 2.534895420074463, "learning_rate": 4.2976279301125413e-07, "loss": 0.1075, "step": 73400 }, { "epoch": 1.7116551543236862, "grad_norm": 1.3064721822738647, "learning_rate": 4.2968507119318534e-07, "loss": 0.107, "step": 73410 }, { "epoch": 1.7118883157005043, "grad_norm": 1.3443282842636108, "learning_rate": 4.2960734937511655e-07, "loss": 0.1162, "step": 73420 }, { "epoch": 1.712121477077322, "grad_norm": 2.740504264831543, "learning_rate": 4.295296275570478e-07, "loss": 0.1011, "step": 73430 }, { "epoch": 1.7123546384541402, "grad_norm": 2.0623018741607666, "learning_rate": 4.2945190573897907e-07, "loss": 0.0938, "step": 73440 }, { "epoch": 1.712587799830958, "grad_norm": 1.2177345752716064, "learning_rate": 4.2937418392091023e-07, "loss": 0.0939, "step": 73450 }, { "epoch": 1.7128209612077758, "grad_norm": 3.477651357650757, "learning_rate": 4.2930423428464835e-07, "loss": 0.1129, "step": 73460 }, { "epoch": 1.7130541225845939, "grad_norm": 2.505338668823242, "learning_rate": 4.292265124665796e-07, "loss": 0.1049, "step": 73470 }, { 
"epoch": 1.713287283961412, "grad_norm": 3.114400863647461, "learning_rate": 4.291487906485108e-07, "loss": 0.113, "step": 73480 }, { "epoch": 1.7135204453382298, "grad_norm": 2.1485204696655273, "learning_rate": 4.290710688304421e-07, "loss": 0.0958, "step": 73490 }, { "epoch": 1.7137536067150476, "grad_norm": 1.4997748136520386, "learning_rate": 4.289933470123733e-07, "loss": 0.1168, "step": 73500 }, { "epoch": 1.7139867680918655, "grad_norm": 2.249905824661255, "learning_rate": 4.289156251943045e-07, "loss": 0.1121, "step": 73510 }, { "epoch": 1.7142199294686835, "grad_norm": 2.81575083732605, "learning_rate": 4.2883790337623577e-07, "loss": 0.108, "step": 73520 }, { "epoch": 1.7144530908455016, "grad_norm": 1.4738465547561646, "learning_rate": 4.2876018155816703e-07, "loss": 0.0998, "step": 73530 }, { "epoch": 1.7146862522223194, "grad_norm": 1.5208683013916016, "learning_rate": 4.286824597400982e-07, "loss": 0.1106, "step": 73540 }, { "epoch": 1.7149194135991372, "grad_norm": 1.2299400568008423, "learning_rate": 4.2860473792202945e-07, "loss": 0.1144, "step": 73550 }, { "epoch": 1.715152574975955, "grad_norm": 1.8780683279037476, "learning_rate": 4.285270161039607e-07, "loss": 0.109, "step": 73560 }, { "epoch": 1.7153857363527731, "grad_norm": 2.2967474460601807, "learning_rate": 4.284492942858919e-07, "loss": 0.1061, "step": 73570 }, { "epoch": 1.7156188977295912, "grad_norm": 1.7327324151992798, "learning_rate": 4.2837157246782313e-07, "loss": 0.1105, "step": 73580 }, { "epoch": 1.715852059106409, "grad_norm": 1.37686288356781, "learning_rate": 4.282938506497544e-07, "loss": 0.1011, "step": 73590 }, { "epoch": 1.7160852204832269, "grad_norm": 1.3408308029174805, "learning_rate": 4.282161288316856e-07, "loss": 0.1088, "step": 73600 }, { "epoch": 1.716318381860045, "grad_norm": 1.463625431060791, "learning_rate": 4.2813840701361686e-07, "loss": 0.1059, "step": 73610 }, { "epoch": 1.7165515432368628, "grad_norm": 1.180044174194336, "learning_rate": 
4.2806068519554807e-07, "loss": 0.0973, "step": 73620 }, { "epoch": 1.7167847046136808, "grad_norm": 1.3852087259292603, "learning_rate": 4.279829633774793e-07, "loss": 0.1063, "step": 73630 }, { "epoch": 1.7170178659904987, "grad_norm": 1.6755291223526, "learning_rate": 4.2790524155941054e-07, "loss": 0.0958, "step": 73640 }, { "epoch": 1.7172510273673165, "grad_norm": 1.161192774772644, "learning_rate": 4.278275197413418e-07, "loss": 0.0958, "step": 73650 }, { "epoch": 1.7174841887441346, "grad_norm": 1.2311683893203735, "learning_rate": 4.2774979792327296e-07, "loss": 0.1033, "step": 73660 }, { "epoch": 1.7177173501209526, "grad_norm": 1.5327483415603638, "learning_rate": 4.276720761052042e-07, "loss": 0.1056, "step": 73670 }, { "epoch": 1.7179505114977704, "grad_norm": 1.028943657875061, "learning_rate": 4.275943542871355e-07, "loss": 0.1074, "step": 73680 }, { "epoch": 1.7181836728745883, "grad_norm": 2.0246353149414062, "learning_rate": 4.275166324690667e-07, "loss": 0.1067, "step": 73690 }, { "epoch": 1.7184168342514061, "grad_norm": 1.6026886701583862, "learning_rate": 4.274389106509979e-07, "loss": 0.0959, "step": 73700 }, { "epoch": 1.7186499956282242, "grad_norm": 1.172930359840393, "learning_rate": 4.2736118883292917e-07, "loss": 0.0992, "step": 73710 }, { "epoch": 1.7188831570050422, "grad_norm": 3.086916446685791, "learning_rate": 4.272834670148604e-07, "loss": 0.112, "step": 73720 }, { "epoch": 1.71911631838186, "grad_norm": 1.6784487962722778, "learning_rate": 4.2720574519679164e-07, "loss": 0.1037, "step": 73730 }, { "epoch": 1.719349479758678, "grad_norm": 1.6364398002624512, "learning_rate": 4.2712802337872285e-07, "loss": 0.1105, "step": 73740 }, { "epoch": 1.7195826411354957, "grad_norm": 1.0922424793243408, "learning_rate": 4.2705030156065405e-07, "loss": 0.1025, "step": 73750 }, { "epoch": 1.7198158025123138, "grad_norm": 1.8718903064727783, "learning_rate": 4.269725797425853e-07, "loss": 0.0937, "step": 73760 }, { "epoch": 
1.7200489638891319, "grad_norm": 1.3573027849197388, "learning_rate": 4.268948579245166e-07, "loss": 0.1048, "step": 73770 }, { "epoch": 1.7202821252659497, "grad_norm": 2.189274549484253, "learning_rate": 4.2681713610644774e-07, "loss": 0.0996, "step": 73780 }, { "epoch": 1.7205152866427675, "grad_norm": 2.2930073738098145, "learning_rate": 4.26739414288379e-07, "loss": 0.0941, "step": 73790 }, { "epoch": 1.7207484480195856, "grad_norm": 1.7825136184692383, "learning_rate": 4.2666169247031026e-07, "loss": 0.102, "step": 73800 }, { "epoch": 1.7209816093964034, "grad_norm": 1.3415957689285278, "learning_rate": 4.265839706522415e-07, "loss": 0.1162, "step": 73810 }, { "epoch": 1.7212147707732215, "grad_norm": 1.715190052986145, "learning_rate": 4.265062488341727e-07, "loss": 0.1127, "step": 73820 }, { "epoch": 1.7214479321500393, "grad_norm": 0.9827927947044373, "learning_rate": 4.2642852701610394e-07, "loss": 0.1139, "step": 73830 }, { "epoch": 1.7216810935268572, "grad_norm": 1.0391631126403809, "learning_rate": 4.263508051980352e-07, "loss": 0.098, "step": 73840 }, { "epoch": 1.7219142549036752, "grad_norm": 1.1558661460876465, "learning_rate": 4.262730833799664e-07, "loss": 0.0918, "step": 73850 }, { "epoch": 1.7221474162804933, "grad_norm": 2.7758591175079346, "learning_rate": 4.261953615618976e-07, "loss": 0.1105, "step": 73860 }, { "epoch": 1.7223805776573111, "grad_norm": 1.4869301319122314, "learning_rate": 4.261176397438289e-07, "loss": 0.1003, "step": 73870 }, { "epoch": 1.722613739034129, "grad_norm": 1.9796223640441895, "learning_rate": 4.260399179257601e-07, "loss": 0.0981, "step": 73880 }, { "epoch": 1.7228469004109468, "grad_norm": 1.058362364768982, "learning_rate": 4.2596219610769135e-07, "loss": 0.0996, "step": 73890 }, { "epoch": 1.7230800617877648, "grad_norm": 1.4807658195495605, "learning_rate": 4.2588447428962256e-07, "loss": 0.1018, "step": 73900 }, { "epoch": 1.723313223164583, "grad_norm": 1.735355257987976, "learning_rate": 
4.2580675247155377e-07, "loss": 0.1055, "step": 73910 }, { "epoch": 1.7235463845414007, "grad_norm": 3.217998743057251, "learning_rate": 4.2572903065348504e-07, "loss": 0.1, "step": 73920 }, { "epoch": 1.7237795459182186, "grad_norm": 2.134338617324829, "learning_rate": 4.256513088354163e-07, "loss": 0.1061, "step": 73930 }, { "epoch": 1.7240127072950364, "grad_norm": 2.296473503112793, "learning_rate": 4.2557358701734745e-07, "loss": 0.1139, "step": 73940 }, { "epoch": 1.7242458686718545, "grad_norm": 1.771328091621399, "learning_rate": 4.254958651992787e-07, "loss": 0.1212, "step": 73950 }, { "epoch": 1.7244790300486725, "grad_norm": 1.4836167097091675, "learning_rate": 4.2541814338121e-07, "loss": 0.1097, "step": 73960 }, { "epoch": 1.7247121914254904, "grad_norm": 5.1269731521606445, "learning_rate": 4.253404215631412e-07, "loss": 0.1128, "step": 73970 }, { "epoch": 1.7249453528023082, "grad_norm": 1.2231038808822632, "learning_rate": 4.252626997450724e-07, "loss": 0.0946, "step": 73980 }, { "epoch": 1.7251785141791263, "grad_norm": 1.2309837341308594, "learning_rate": 4.2518497792700366e-07, "loss": 0.1073, "step": 73990 }, { "epoch": 1.7254116755559443, "grad_norm": 2.114544630050659, "learning_rate": 4.2510725610893487e-07, "loss": 0.1088, "step": 74000 }, { "epoch": 1.7256448369327622, "grad_norm": 3.9572606086730957, "learning_rate": 4.2502953429086613e-07, "loss": 0.1076, "step": 74010 }, { "epoch": 1.72587799830958, "grad_norm": 1.1195781230926514, "learning_rate": 4.2495181247279734e-07, "loss": 0.1042, "step": 74020 }, { "epoch": 1.7261111596863978, "grad_norm": 1.4588464498519897, "learning_rate": 4.2487409065472855e-07, "loss": 0.0958, "step": 74030 }, { "epoch": 1.726344321063216, "grad_norm": 1.6115574836730957, "learning_rate": 4.247963688366598e-07, "loss": 0.1024, "step": 74040 }, { "epoch": 1.726577482440034, "grad_norm": 1.1995995044708252, "learning_rate": 4.2471864701859107e-07, "loss": 0.0965, "step": 74050 }, { "epoch": 1.7268106438168518, 
"grad_norm": 1.5703136920928955, "learning_rate": 4.2464092520052223e-07, "loss": 0.0987, "step": 74060 }, { "epoch": 1.7270438051936696, "grad_norm": 2.5318336486816406, "learning_rate": 4.245632033824535e-07, "loss": 0.114, "step": 74070 }, { "epoch": 1.7272769665704875, "grad_norm": 1.6569157838821411, "learning_rate": 4.2448548156438475e-07, "loss": 0.1021, "step": 74080 }, { "epoch": 1.7275101279473055, "grad_norm": 1.8147897720336914, "learning_rate": 4.2440775974631596e-07, "loss": 0.1184, "step": 74090 }, { "epoch": 1.7277432893241236, "grad_norm": 1.5058549642562866, "learning_rate": 4.2433003792824717e-07, "loss": 0.1067, "step": 74100 }, { "epoch": 1.7279764507009414, "grad_norm": 1.8972742557525635, "learning_rate": 4.2425231611017843e-07, "loss": 0.1057, "step": 74110 }, { "epoch": 1.7282096120777592, "grad_norm": 1.75226891040802, "learning_rate": 4.2417459429210964e-07, "loss": 0.1032, "step": 74120 }, { "epoch": 1.728442773454577, "grad_norm": 2.311476945877075, "learning_rate": 4.240968724740409e-07, "loss": 0.1088, "step": 74130 }, { "epoch": 1.7286759348313951, "grad_norm": 3.0336203575134277, "learning_rate": 4.240191506559721e-07, "loss": 0.101, "step": 74140 }, { "epoch": 1.7289090962082132, "grad_norm": 1.3286007642745972, "learning_rate": 4.239414288379033e-07, "loss": 0.1202, "step": 74150 }, { "epoch": 1.729142257585031, "grad_norm": 1.425073504447937, "learning_rate": 4.238637070198346e-07, "loss": 0.1043, "step": 74160 }, { "epoch": 1.7293754189618489, "grad_norm": 1.957706093788147, "learning_rate": 4.2378598520176585e-07, "loss": 0.1057, "step": 74170 }, { "epoch": 1.729608580338667, "grad_norm": 4.087195873260498, "learning_rate": 4.23708263383697e-07, "loss": 0.1029, "step": 74180 }, { "epoch": 1.729841741715485, "grad_norm": 1.6156007051467896, "learning_rate": 4.2363054156562827e-07, "loss": 0.1147, "step": 74190 }, { "epoch": 1.7300749030923028, "grad_norm": 1.4310023784637451, "learning_rate": 4.2355281974755953e-07, "loss": 
0.1066, "step": 74200 }, { "epoch": 1.7303080644691207, "grad_norm": 0.9995402097702026, "learning_rate": 4.234750979294908e-07, "loss": 0.1085, "step": 74210 }, { "epoch": 1.7305412258459385, "grad_norm": 1.1068270206451416, "learning_rate": 4.2339737611142195e-07, "loss": 0.0997, "step": 74220 }, { "epoch": 1.7307743872227566, "grad_norm": 2.548881769180298, "learning_rate": 4.233196542933532e-07, "loss": 0.0981, "step": 74230 }, { "epoch": 1.7310075485995746, "grad_norm": 1.3212428092956543, "learning_rate": 4.2324193247528447e-07, "loss": 0.1093, "step": 74240 }, { "epoch": 1.7312407099763925, "grad_norm": 1.590279459953308, "learning_rate": 4.231642106572157e-07, "loss": 0.1048, "step": 74250 }, { "epoch": 1.7314738713532103, "grad_norm": 2.3809826374053955, "learning_rate": 4.230864888391469e-07, "loss": 0.0981, "step": 74260 }, { "epoch": 1.7317070327300281, "grad_norm": 1.431348443031311, "learning_rate": 4.2300876702107815e-07, "loss": 0.1039, "step": 74270 }, { "epoch": 1.7319401941068462, "grad_norm": 1.1063934564590454, "learning_rate": 4.2293104520300936e-07, "loss": 0.1009, "step": 74280 }, { "epoch": 1.7321733554836642, "grad_norm": 1.6109713315963745, "learning_rate": 4.228533233849406e-07, "loss": 0.1006, "step": 74290 }, { "epoch": 1.732406516860482, "grad_norm": 3.0058374404907227, "learning_rate": 4.2277560156687183e-07, "loss": 0.1153, "step": 74300 }, { "epoch": 1.7326396782373, "grad_norm": 1.2958906888961792, "learning_rate": 4.2269787974880304e-07, "loss": 0.1066, "step": 74310 }, { "epoch": 1.732872839614118, "grad_norm": 1.8875993490219116, "learning_rate": 4.226201579307343e-07, "loss": 0.1005, "step": 74320 }, { "epoch": 1.7331060009909358, "grad_norm": 3.348916530609131, "learning_rate": 4.2254243611266557e-07, "loss": 0.1117, "step": 74330 }, { "epoch": 1.7333391623677539, "grad_norm": 2.020242214202881, "learning_rate": 4.224647142945967e-07, "loss": 0.1194, "step": 74340 }, { "epoch": 1.7335723237445717, "grad_norm": 
2.1431734561920166, "learning_rate": 4.22386992476528e-07, "loss": 0.1208, "step": 74350 }, { "epoch": 1.7338054851213895, "grad_norm": 1.8738878965377808, "learning_rate": 4.2230927065845925e-07, "loss": 0.1021, "step": 74360 }, { "epoch": 1.7340386464982076, "grad_norm": 2.492435932159424, "learning_rate": 4.2223154884039046e-07, "loss": 0.1096, "step": 74370 }, { "epoch": 1.7342718078750257, "grad_norm": 2.9870455265045166, "learning_rate": 4.2215382702232166e-07, "loss": 0.1064, "step": 74380 }, { "epoch": 1.7345049692518435, "grad_norm": 1.452802062034607, "learning_rate": 4.2207610520425293e-07, "loss": 0.1136, "step": 74390 }, { "epoch": 1.7347381306286613, "grad_norm": 2.7149133682250977, "learning_rate": 4.2199838338618414e-07, "loss": 0.1102, "step": 74400 }, { "epoch": 1.7349712920054792, "grad_norm": 1.5498566627502441, "learning_rate": 4.219206615681154e-07, "loss": 0.1073, "step": 74410 }, { "epoch": 1.7352044533822972, "grad_norm": 3.0726301670074463, "learning_rate": 4.218429397500466e-07, "loss": 0.1163, "step": 74420 }, { "epoch": 1.7354376147591153, "grad_norm": 1.5439485311508179, "learning_rate": 4.217652179319778e-07, "loss": 0.1005, "step": 74430 }, { "epoch": 1.7356707761359331, "grad_norm": 1.4085675477981567, "learning_rate": 4.216874961139091e-07, "loss": 0.1035, "step": 74440 }, { "epoch": 1.735903937512751, "grad_norm": 2.8783178329467773, "learning_rate": 4.2160977429584034e-07, "loss": 0.1118, "step": 74450 }, { "epoch": 1.7361370988895688, "grad_norm": 2.7040820121765137, "learning_rate": 4.215320524777715e-07, "loss": 0.104, "step": 74460 }, { "epoch": 1.7363702602663869, "grad_norm": 1.2940315008163452, "learning_rate": 4.2145433065970276e-07, "loss": 0.0991, "step": 74470 }, { "epoch": 1.736603421643205, "grad_norm": 1.078303337097168, "learning_rate": 4.21376608841634e-07, "loss": 0.1033, "step": 74480 }, { "epoch": 1.7368365830200228, "grad_norm": 1.2077231407165527, "learning_rate": 4.2129888702356523e-07, "loss": 0.1168, 
"step": 74490 }, { "epoch": 1.7370697443968406, "grad_norm": 2.3022449016571045, "learning_rate": 4.2122116520549644e-07, "loss": 0.0985, "step": 74500 }, { "epoch": 1.7373029057736586, "grad_norm": 3.367543935775757, "learning_rate": 4.211434433874277e-07, "loss": 0.1053, "step": 74510 }, { "epoch": 1.7375360671504765, "grad_norm": 1.3839576244354248, "learning_rate": 4.210657215693589e-07, "loss": 0.1076, "step": 74520 }, { "epoch": 1.7377692285272945, "grad_norm": 3.2874293327331543, "learning_rate": 4.2098799975129017e-07, "loss": 0.1073, "step": 74530 }, { "epoch": 1.7380023899041124, "grad_norm": 2.3424854278564453, "learning_rate": 4.209102779332214e-07, "loss": 0.0944, "step": 74540 }, { "epoch": 1.7382355512809302, "grad_norm": 3.4764437675476074, "learning_rate": 4.208325561151526e-07, "loss": 0.1109, "step": 74550 }, { "epoch": 1.7384687126577483, "grad_norm": 1.6772345304489136, "learning_rate": 4.2075483429708385e-07, "loss": 0.1112, "step": 74560 }, { "epoch": 1.7387018740345663, "grad_norm": 2.4045910835266113, "learning_rate": 4.206771124790151e-07, "loss": 0.1047, "step": 74570 }, { "epoch": 1.7389350354113842, "grad_norm": 1.361761212348938, "learning_rate": 4.2059939066094627e-07, "loss": 0.1084, "step": 74580 }, { "epoch": 1.739168196788202, "grad_norm": 1.6213561296463013, "learning_rate": 4.2052166884287753e-07, "loss": 0.1115, "step": 74590 }, { "epoch": 1.7394013581650198, "grad_norm": 1.9957082271575928, "learning_rate": 4.204439470248088e-07, "loss": 0.1132, "step": 74600 }, { "epoch": 1.739634519541838, "grad_norm": 2.0453882217407227, "learning_rate": 4.2036622520674006e-07, "loss": 0.1244, "step": 74610 }, { "epoch": 1.739867680918656, "grad_norm": 1.1489555835723877, "learning_rate": 4.202885033886712e-07, "loss": 0.1083, "step": 74620 }, { "epoch": 1.7401008422954738, "grad_norm": 1.464648962020874, "learning_rate": 4.202107815706025e-07, "loss": 0.1169, "step": 74630 }, { "epoch": 1.7403340036722916, "grad_norm": 1.1358492374420166, 
"learning_rate": 4.2013305975253374e-07, "loss": 0.104, "step": 74640 }, { "epoch": 1.7405671650491095, "grad_norm": 1.5714970827102661, "learning_rate": 4.2005533793446495e-07, "loss": 0.1161, "step": 74650 }, { "epoch": 1.7408003264259275, "grad_norm": 1.6994248628616333, "learning_rate": 4.1997761611639616e-07, "loss": 0.0915, "step": 74660 }, { "epoch": 1.7410334878027456, "grad_norm": 1.5909701585769653, "learning_rate": 4.198998942983274e-07, "loss": 0.0985, "step": 74670 }, { "epoch": 1.7412666491795634, "grad_norm": 1.3728147745132446, "learning_rate": 4.1982217248025863e-07, "loss": 0.1016, "step": 74680 }, { "epoch": 1.7414998105563813, "grad_norm": 3.1621716022491455, "learning_rate": 4.197444506621899e-07, "loss": 0.1105, "step": 74690 }, { "epoch": 1.7417329719331993, "grad_norm": 1.9229626655578613, "learning_rate": 4.196667288441211e-07, "loss": 0.1078, "step": 74700 }, { "epoch": 1.7419661333100172, "grad_norm": 1.594583511352539, "learning_rate": 4.195890070260523e-07, "loss": 0.106, "step": 74710 }, { "epoch": 1.7421992946868352, "grad_norm": 1.2828025817871094, "learning_rate": 4.1951128520798357e-07, "loss": 0.0995, "step": 74720 }, { "epoch": 1.742432456063653, "grad_norm": 2.302337884902954, "learning_rate": 4.1943356338991483e-07, "loss": 0.1049, "step": 74730 }, { "epoch": 1.7426656174404709, "grad_norm": 1.1474251747131348, "learning_rate": 4.19355841571846e-07, "loss": 0.104, "step": 74740 }, { "epoch": 1.742898778817289, "grad_norm": 2.918653726577759, "learning_rate": 4.1927811975377725e-07, "loss": 0.1115, "step": 74750 }, { "epoch": 1.743131940194107, "grad_norm": 2.5865657329559326, "learning_rate": 4.192003979357085e-07, "loss": 0.1017, "step": 74760 }, { "epoch": 1.7433651015709248, "grad_norm": 2.1267647743225098, "learning_rate": 4.191226761176397e-07, "loss": 0.1029, "step": 74770 }, { "epoch": 1.7435982629477427, "grad_norm": 1.6593657732009888, "learning_rate": 4.1904495429957093e-07, "loss": 0.1076, "step": 74780 }, { "epoch": 
1.7438314243245605, "grad_norm": 2.298204183578491, "learning_rate": 4.189672324815022e-07, "loss": 0.1153, "step": 74790 }, { "epoch": 1.7440645857013786, "grad_norm": 1.2803435325622559, "learning_rate": 4.188895106634334e-07, "loss": 0.1176, "step": 74800 }, { "epoch": 1.7442977470781966, "grad_norm": 2.7673349380493164, "learning_rate": 4.1881178884536467e-07, "loss": 0.1148, "step": 74810 }, { "epoch": 1.7445309084550145, "grad_norm": 1.149958610534668, "learning_rate": 4.187340670272959e-07, "loss": 0.1047, "step": 74820 }, { "epoch": 1.7447640698318323, "grad_norm": 1.71897292137146, "learning_rate": 4.186563452092271e-07, "loss": 0.1107, "step": 74830 }, { "epoch": 1.7449972312086501, "grad_norm": 1.2902196645736694, "learning_rate": 4.1857862339115835e-07, "loss": 0.1105, "step": 74840 }, { "epoch": 1.7452303925854682, "grad_norm": 1.4755961894989014, "learning_rate": 4.185009015730896e-07, "loss": 0.1136, "step": 74850 }, { "epoch": 1.7454635539622863, "grad_norm": 1.1735745668411255, "learning_rate": 4.1842317975502077e-07, "loss": 0.1134, "step": 74860 }, { "epoch": 1.745696715339104, "grad_norm": 1.5963393449783325, "learning_rate": 4.1834545793695203e-07, "loss": 0.1182, "step": 74870 }, { "epoch": 1.745929876715922, "grad_norm": 1.613672137260437, "learning_rate": 4.182677361188833e-07, "loss": 0.1142, "step": 74880 }, { "epoch": 1.74616303809274, "grad_norm": 1.343619465827942, "learning_rate": 4.181900143008145e-07, "loss": 0.0997, "step": 74890 }, { "epoch": 1.7463961994695578, "grad_norm": 1.2583330869674683, "learning_rate": 4.181122924827457e-07, "loss": 0.0987, "step": 74900 }, { "epoch": 1.7466293608463759, "grad_norm": 4.088840007781982, "learning_rate": 4.1803457066467697e-07, "loss": 0.1164, "step": 74910 }, { "epoch": 1.7468625222231937, "grad_norm": 1.4743596315383911, "learning_rate": 4.179568488466082e-07, "loss": 0.106, "step": 74920 }, { "epoch": 1.7470956836000116, "grad_norm": 2.4930834770202637, "learning_rate": 
4.1787912702853944e-07, "loss": 0.1098, "step": 74930 }, { "epoch": 1.7473288449768296, "grad_norm": 3.661959409713745, "learning_rate": 4.178014052104707e-07, "loss": 0.1116, "step": 74940 }, { "epoch": 1.7475620063536477, "grad_norm": 1.395885705947876, "learning_rate": 4.1772368339240186e-07, "loss": 0.1086, "step": 74950 }, { "epoch": 1.7477951677304655, "grad_norm": 1.3859316110610962, "learning_rate": 4.176459615743331e-07, "loss": 0.1188, "step": 74960 }, { "epoch": 1.7480283291072833, "grad_norm": 1.096123218536377, "learning_rate": 4.175682397562644e-07, "loss": 0.1098, "step": 74970 }, { "epoch": 1.7482614904841012, "grad_norm": 1.3344892263412476, "learning_rate": 4.174905179381956e-07, "loss": 0.1064, "step": 74980 }, { "epoch": 1.7484946518609192, "grad_norm": 1.1377370357513428, "learning_rate": 4.174127961201268e-07, "loss": 0.1035, "step": 74990 }, { "epoch": 1.7487278132377373, "grad_norm": 2.2407033443450928, "learning_rate": 4.1733507430205807e-07, "loss": 0.0958, "step": 75000 }, { "epoch": 1.7489609746145551, "grad_norm": 1.3055123090744019, "learning_rate": 4.172573524839893e-07, "loss": 0.1113, "step": 75010 }, { "epoch": 1.749194135991373, "grad_norm": 2.093629837036133, "learning_rate": 4.1717963066592054e-07, "loss": 0.0944, "step": 75020 }, { "epoch": 1.7494272973681908, "grad_norm": 2.5046966075897217, "learning_rate": 4.1710190884785175e-07, "loss": 0.1083, "step": 75030 }, { "epoch": 1.7496604587450089, "grad_norm": 2.4534294605255127, "learning_rate": 4.1702418702978295e-07, "loss": 0.101, "step": 75040 }, { "epoch": 1.749893620121827, "grad_norm": 2.486227035522461, "learning_rate": 4.169464652117142e-07, "loss": 0.1029, "step": 75050 }, { "epoch": 1.7501267814986448, "grad_norm": 1.4969578981399536, "learning_rate": 4.168687433936455e-07, "loss": 0.1171, "step": 75060 }, { "epoch": 1.7503599428754626, "grad_norm": 1.9357341527938843, "learning_rate": 4.167910215755767e-07, "loss": 0.1088, "step": 75070 }, { "epoch": 
1.7505931042522807, "grad_norm": 2.085460901260376, "learning_rate": 4.167132997575079e-07, "loss": 0.0951, "step": 75080 }, { "epoch": 1.7508262656290985, "grad_norm": 1.633118987083435, "learning_rate": 4.1663557793943916e-07, "loss": 0.1173, "step": 75090 }, { "epoch": 1.7510594270059165, "grad_norm": 2.25049090385437, "learning_rate": 4.165578561213704e-07, "loss": 0.1143, "step": 75100 }, { "epoch": 1.7512925883827344, "grad_norm": 1.9165139198303223, "learning_rate": 4.164801343033016e-07, "loss": 0.1043, "step": 75110 }, { "epoch": 1.7515257497595522, "grad_norm": 1.2802343368530273, "learning_rate": 4.1640241248523284e-07, "loss": 0.1128, "step": 75120 }, { "epoch": 1.7517589111363703, "grad_norm": 1.2299221754074097, "learning_rate": 4.163246906671641e-07, "loss": 0.097, "step": 75130 }, { "epoch": 1.7519920725131883, "grad_norm": 2.990159749984741, "learning_rate": 4.162469688490953e-07, "loss": 0.109, "step": 75140 }, { "epoch": 1.7522252338900062, "grad_norm": 1.4645743370056152, "learning_rate": 4.161692470310265e-07, "loss": 0.1078, "step": 75150 }, { "epoch": 1.752458395266824, "grad_norm": 1.1791791915893555, "learning_rate": 4.160915252129578e-07, "loss": 0.1148, "step": 75160 }, { "epoch": 1.7526915566436418, "grad_norm": 1.6163585186004639, "learning_rate": 4.16013803394889e-07, "loss": 0.1036, "step": 75170 }, { "epoch": 1.75292471802046, "grad_norm": 1.1749759912490845, "learning_rate": 4.1593608157682025e-07, "loss": 0.1002, "step": 75180 }, { "epoch": 1.753157879397278, "grad_norm": 1.617714524269104, "learning_rate": 4.1585835975875146e-07, "loss": 0.0944, "step": 75190 }, { "epoch": 1.7533910407740958, "grad_norm": 1.9496153593063354, "learning_rate": 4.1578063794068267e-07, "loss": 0.1119, "step": 75200 }, { "epoch": 1.7536242021509136, "grad_norm": 2.174008846282959, "learning_rate": 4.1570291612261393e-07, "loss": 0.1103, "step": 75210 }, { "epoch": 1.7538573635277315, "grad_norm": 1.2268197536468506, "learning_rate": 
4.156251943045452e-07, "loss": 0.0957, "step": 75220 }, { "epoch": 1.7540905249045495, "grad_norm": 0.9671807289123535, "learning_rate": 4.1554747248647635e-07, "loss": 0.0966, "step": 75230 }, { "epoch": 1.7543236862813676, "grad_norm": 2.4649412631988525, "learning_rate": 4.154697506684076e-07, "loss": 0.1095, "step": 75240 }, { "epoch": 1.7545568476581854, "grad_norm": 1.5639475584030151, "learning_rate": 4.153920288503389e-07, "loss": 0.1058, "step": 75250 }, { "epoch": 1.7547900090350033, "grad_norm": 2.886920928955078, "learning_rate": 4.153143070322701e-07, "loss": 0.1075, "step": 75260 }, { "epoch": 1.7550231704118213, "grad_norm": 1.436342716217041, "learning_rate": 4.152365852142013e-07, "loss": 0.121, "step": 75270 }, { "epoch": 1.7552563317886392, "grad_norm": 3.825284242630005, "learning_rate": 4.1515886339613256e-07, "loss": 0.1237, "step": 75280 }, { "epoch": 1.7554894931654572, "grad_norm": 2.091904878616333, "learning_rate": 4.1508114157806377e-07, "loss": 0.106, "step": 75290 }, { "epoch": 1.755722654542275, "grad_norm": 1.9992451667785645, "learning_rate": 4.1500341975999503e-07, "loss": 0.0998, "step": 75300 }, { "epoch": 1.7559558159190929, "grad_norm": 2.32847261428833, "learning_rate": 4.1492569794192624e-07, "loss": 0.0985, "step": 75310 }, { "epoch": 1.756188977295911, "grad_norm": 1.7498317956924438, "learning_rate": 4.1484797612385745e-07, "loss": 0.1159, "step": 75320 }, { "epoch": 1.756422138672729, "grad_norm": 1.0889804363250732, "learning_rate": 4.147702543057887e-07, "loss": 0.1036, "step": 75330 }, { "epoch": 1.7566553000495468, "grad_norm": 1.0402077436447144, "learning_rate": 4.1469253248771997e-07, "loss": 0.1169, "step": 75340 }, { "epoch": 1.7568884614263647, "grad_norm": 1.592972755432129, "learning_rate": 4.1461481066965113e-07, "loss": 0.1122, "step": 75350 }, { "epoch": 1.7571216228031825, "grad_norm": 1.3715949058532715, "learning_rate": 4.145370888515824e-07, "loss": 0.1114, "step": 75360 }, { "epoch": 
1.7573547841800006, "grad_norm": 2.4496519565582275, "learning_rate": 4.1445936703351365e-07, "loss": 0.0995, "step": 75370 }, { "epoch": 1.7575879455568186, "grad_norm": 2.6577441692352295, "learning_rate": 4.1438164521544486e-07, "loss": 0.1094, "step": 75380 }, { "epoch": 1.7578211069336365, "grad_norm": 2.499540090560913, "learning_rate": 4.1430392339737607e-07, "loss": 0.1027, "step": 75390 }, { "epoch": 1.7580542683104543, "grad_norm": 0.9194021821022034, "learning_rate": 4.1422620157930733e-07, "loss": 0.1027, "step": 75400 }, { "epoch": 1.7582874296872721, "grad_norm": 1.7562129497528076, "learning_rate": 4.1414847976123854e-07, "loss": 0.1064, "step": 75410 }, { "epoch": 1.7585205910640902, "grad_norm": 3.15431547164917, "learning_rate": 4.140707579431698e-07, "loss": 0.1004, "step": 75420 }, { "epoch": 1.7587537524409083, "grad_norm": 1.1022146940231323, "learning_rate": 4.13993036125101e-07, "loss": 0.1086, "step": 75430 }, { "epoch": 1.758986913817726, "grad_norm": 3.27120304107666, "learning_rate": 4.139153143070322e-07, "loss": 0.1245, "step": 75440 }, { "epoch": 1.759220075194544, "grad_norm": 1.6177796125411987, "learning_rate": 4.138375924889635e-07, "loss": 0.0943, "step": 75450 }, { "epoch": 1.759453236571362, "grad_norm": 1.9195523262023926, "learning_rate": 4.1375987067089475e-07, "loss": 0.1054, "step": 75460 }, { "epoch": 1.75968639794818, "grad_norm": 3.3299317359924316, "learning_rate": 4.1368992103463287e-07, "loss": 0.1129, "step": 75470 }, { "epoch": 1.7599195593249979, "grad_norm": 1.186314582824707, "learning_rate": 4.1361219921656403e-07, "loss": 0.1058, "step": 75480 }, { "epoch": 1.7601527207018157, "grad_norm": 2.285784959793091, "learning_rate": 4.135344773984953e-07, "loss": 0.0989, "step": 75490 }, { "epoch": 1.7603858820786336, "grad_norm": 3.0010390281677246, "learning_rate": 4.1345675558042655e-07, "loss": 0.1173, "step": 75500 }, { "epoch": 1.7606190434554516, "grad_norm": 1.5113624334335327, "learning_rate": 
4.1337903376235776e-07, "loss": 0.1035, "step": 75510 }, { "epoch": 1.7608522048322697, "grad_norm": 2.2533068656921387, "learning_rate": 4.1330131194428897e-07, "loss": 0.1028, "step": 75520 }, { "epoch": 1.7610853662090875, "grad_norm": 1.2314577102661133, "learning_rate": 4.1322359012622023e-07, "loss": 0.1116, "step": 75530 }, { "epoch": 1.7613185275859053, "grad_norm": 1.5265766382217407, "learning_rate": 4.1314586830815144e-07, "loss": 0.1127, "step": 75540 }, { "epoch": 1.7615516889627232, "grad_norm": 1.6138910055160522, "learning_rate": 4.130681464900827e-07, "loss": 0.1179, "step": 75550 }, { "epoch": 1.7617848503395412, "grad_norm": 1.3100948333740234, "learning_rate": 4.129904246720139e-07, "loss": 0.1046, "step": 75560 }, { "epoch": 1.7620180117163593, "grad_norm": 1.2226076126098633, "learning_rate": 4.129127028539451e-07, "loss": 0.1029, "step": 75570 }, { "epoch": 1.7622511730931771, "grad_norm": 1.6776798963546753, "learning_rate": 4.128349810358764e-07, "loss": 0.0983, "step": 75580 }, { "epoch": 1.762484334469995, "grad_norm": 2.098942518234253, "learning_rate": 4.1275725921780765e-07, "loss": 0.1025, "step": 75590 }, { "epoch": 1.7627174958468128, "grad_norm": 1.844714641571045, "learning_rate": 4.126795373997388e-07, "loss": 0.1054, "step": 75600 }, { "epoch": 1.7629506572236309, "grad_norm": 1.65496027469635, "learning_rate": 4.1260181558167007e-07, "loss": 0.1129, "step": 75610 }, { "epoch": 1.763183818600449, "grad_norm": 1.5100656747817993, "learning_rate": 4.1252409376360133e-07, "loss": 0.1118, "step": 75620 }, { "epoch": 1.7634169799772668, "grad_norm": 1.8460441827774048, "learning_rate": 4.1244637194553254e-07, "loss": 0.1019, "step": 75630 }, { "epoch": 1.7636501413540846, "grad_norm": 1.2956185340881348, "learning_rate": 4.1236865012746375e-07, "loss": 0.1058, "step": 75640 }, { "epoch": 1.7638833027309027, "grad_norm": 2.3087847232818604, "learning_rate": 4.12290928309395e-07, "loss": 0.1018, "step": 75650 }, { "epoch": 
1.7641164641077207, "grad_norm": 1.1770395040512085, "learning_rate": 4.122132064913262e-07, "loss": 0.1103, "step": 75660 }, { "epoch": 1.7643496254845386, "grad_norm": 1.6085968017578125, "learning_rate": 4.121354846732575e-07, "loss": 0.1021, "step": 75670 }, { "epoch": 1.7645827868613564, "grad_norm": 1.5505211353302002, "learning_rate": 4.120577628551887e-07, "loss": 0.1062, "step": 75680 }, { "epoch": 1.7648159482381742, "grad_norm": 2.086142063140869, "learning_rate": 4.119800410371199e-07, "loss": 0.1032, "step": 75690 }, { "epoch": 1.7650491096149923, "grad_norm": 1.1883951425552368, "learning_rate": 4.1190231921905116e-07, "loss": 0.1102, "step": 75700 }, { "epoch": 1.7652822709918103, "grad_norm": 1.6356713771820068, "learning_rate": 4.118245974009824e-07, "loss": 0.1101, "step": 75710 }, { "epoch": 1.7655154323686282, "grad_norm": 1.4698866605758667, "learning_rate": 4.117468755829136e-07, "loss": 0.0983, "step": 75720 }, { "epoch": 1.765748593745446, "grad_norm": 1.1668212413787842, "learning_rate": 4.1166915376484484e-07, "loss": 0.1104, "step": 75730 }, { "epoch": 1.7659817551222639, "grad_norm": 1.9555495977401733, "learning_rate": 4.115914319467761e-07, "loss": 0.1069, "step": 75740 }, { "epoch": 1.766214916499082, "grad_norm": 1.8755651712417603, "learning_rate": 4.115137101287073e-07, "loss": 0.1053, "step": 75750 }, { "epoch": 1.7664480778759, "grad_norm": 2.892598867416382, "learning_rate": 4.114359883106385e-07, "loss": 0.109, "step": 75760 }, { "epoch": 1.7666812392527178, "grad_norm": 1.1357077360153198, "learning_rate": 4.113582664925698e-07, "loss": 0.1016, "step": 75770 }, { "epoch": 1.7669144006295356, "grad_norm": 1.2623544931411743, "learning_rate": 4.11280544674501e-07, "loss": 0.1108, "step": 75780 }, { "epoch": 1.7671475620063537, "grad_norm": 1.0719009637832642, "learning_rate": 4.1120282285643226e-07, "loss": 0.0996, "step": 75790 }, { "epoch": 1.7673807233831715, "grad_norm": 1.2789883613586426, "learning_rate": 
4.1112510103836347e-07, "loss": 0.1108, "step": 75800 }, { "epoch": 1.7676138847599896, "grad_norm": 2.070681571960449, "learning_rate": 4.110473792202947e-07, "loss": 0.1193, "step": 75810 }, { "epoch": 1.7678470461368074, "grad_norm": 1.4601831436157227, "learning_rate": 4.1096965740222594e-07, "loss": 0.1094, "step": 75820 }, { "epoch": 1.7680802075136253, "grad_norm": 1.5255922079086304, "learning_rate": 4.108919355841572e-07, "loss": 0.106, "step": 75830 }, { "epoch": 1.7683133688904433, "grad_norm": 1.7022970914840698, "learning_rate": 4.1081421376608835e-07, "loss": 0.1196, "step": 75840 }, { "epoch": 1.7685465302672614, "grad_norm": 2.6826512813568115, "learning_rate": 4.107364919480196e-07, "loss": 0.0934, "step": 75850 }, { "epoch": 1.7687796916440792, "grad_norm": 2.060574531555176, "learning_rate": 4.106587701299509e-07, "loss": 0.1033, "step": 75860 }, { "epoch": 1.769012853020897, "grad_norm": 1.610824704170227, "learning_rate": 4.1058104831188214e-07, "loss": 0.1202, "step": 75870 }, { "epoch": 1.769246014397715, "grad_norm": 2.699958562850952, "learning_rate": 4.105033264938133e-07, "loss": 0.1147, "step": 75880 }, { "epoch": 1.769479175774533, "grad_norm": 1.217888593673706, "learning_rate": 4.1042560467574456e-07, "loss": 0.0979, "step": 75890 }, { "epoch": 1.769712337151351, "grad_norm": 1.561414361000061, "learning_rate": 4.103478828576758e-07, "loss": 0.1057, "step": 75900 }, { "epoch": 1.7699454985281688, "grad_norm": 1.277814507484436, "learning_rate": 4.1027016103960703e-07, "loss": 0.1097, "step": 75910 }, { "epoch": 1.7701786599049867, "grad_norm": 1.1062370538711548, "learning_rate": 4.1019243922153824e-07, "loss": 0.0984, "step": 75920 }, { "epoch": 1.7704118212818045, "grad_norm": 1.692473292350769, "learning_rate": 4.101147174034695e-07, "loss": 0.1061, "step": 75930 }, { "epoch": 1.7706449826586226, "grad_norm": 1.8205498456954956, "learning_rate": 4.100369955854007e-07, "loss": 0.1082, "step": 75940 }, { "epoch": 1.7708781440354406, 
"grad_norm": 2.2704427242279053, "learning_rate": 4.09959273767332e-07, "loss": 0.111, "step": 75950 }, { "epoch": 1.7711113054122585, "grad_norm": 1.1730132102966309, "learning_rate": 4.098815519492632e-07, "loss": 0.105, "step": 75960 }, { "epoch": 1.7713444667890763, "grad_norm": 1.2007324695587158, "learning_rate": 4.098038301311944e-07, "loss": 0.1085, "step": 75970 }, { "epoch": 1.7715776281658944, "grad_norm": 2.7953555583953857, "learning_rate": 4.0972610831312565e-07, "loss": 0.1189, "step": 75980 }, { "epoch": 1.7718107895427122, "grad_norm": 1.747092843055725, "learning_rate": 4.096483864950569e-07, "loss": 0.1007, "step": 75990 }, { "epoch": 1.7720439509195303, "grad_norm": 1.1695630550384521, "learning_rate": 4.0957066467698807e-07, "loss": 0.0986, "step": 76000 }, { "epoch": 1.772277112296348, "grad_norm": 2.1866908073425293, "learning_rate": 4.0949294285891933e-07, "loss": 0.1, "step": 76010 }, { "epoch": 1.772510273673166, "grad_norm": 1.3715366125106812, "learning_rate": 4.094152210408506e-07, "loss": 0.1079, "step": 76020 }, { "epoch": 1.772743435049984, "grad_norm": 1.2729114294052124, "learning_rate": 4.093374992227818e-07, "loss": 0.1052, "step": 76030 }, { "epoch": 1.772976596426802, "grad_norm": 4.046088695526123, "learning_rate": 4.09259777404713e-07, "loss": 0.1075, "step": 76040 }, { "epoch": 1.77320975780362, "grad_norm": 2.752295732498169, "learning_rate": 4.091820555866443e-07, "loss": 0.1026, "step": 76050 }, { "epoch": 1.7734429191804377, "grad_norm": 2.300184965133667, "learning_rate": 4.091043337685755e-07, "loss": 0.1107, "step": 76060 }, { "epoch": 1.7736760805572556, "grad_norm": 1.384214162826538, "learning_rate": 4.0902661195050675e-07, "loss": 0.1076, "step": 76070 }, { "epoch": 1.7739092419340736, "grad_norm": 1.1809014081954956, "learning_rate": 4.0894889013243796e-07, "loss": 0.1132, "step": 76080 }, { "epoch": 1.7741424033108917, "grad_norm": 1.3327528238296509, "learning_rate": 4.0887116831436917e-07, "loss": 0.1165, 
"step": 76090 }, { "epoch": 1.7743755646877095, "grad_norm": 1.2639132738113403, "learning_rate": 4.0879344649630043e-07, "loss": 0.1123, "step": 76100 }, { "epoch": 1.7746087260645274, "grad_norm": 1.4288827180862427, "learning_rate": 4.087157246782317e-07, "loss": 0.0999, "step": 76110 }, { "epoch": 1.7748418874413452, "grad_norm": 2.018615484237671, "learning_rate": 4.0863800286016285e-07, "loss": 0.1061, "step": 76120 }, { "epoch": 1.7750750488181632, "grad_norm": 2.945958375930786, "learning_rate": 4.085602810420941e-07, "loss": 0.1009, "step": 76130 }, { "epoch": 1.7753082101949813, "grad_norm": 2.4344468116760254, "learning_rate": 4.0848255922402537e-07, "loss": 0.1121, "step": 76140 }, { "epoch": 1.7755413715717991, "grad_norm": 3.7457942962646484, "learning_rate": 4.084048374059566e-07, "loss": 0.105, "step": 76150 }, { "epoch": 1.775774532948617, "grad_norm": 1.7145663499832153, "learning_rate": 4.083271155878878e-07, "loss": 0.1014, "step": 76160 }, { "epoch": 1.776007694325435, "grad_norm": 2.5285537242889404, "learning_rate": 4.0824939376981905e-07, "loss": 0.0945, "step": 76170 }, { "epoch": 1.7762408557022529, "grad_norm": 1.995254397392273, "learning_rate": 4.0817167195175026e-07, "loss": 0.1125, "step": 76180 }, { "epoch": 1.776474017079071, "grad_norm": 1.4432315826416016, "learning_rate": 4.080939501336815e-07, "loss": 0.0982, "step": 76190 }, { "epoch": 1.7767071784558888, "grad_norm": 1.3331581354141235, "learning_rate": 4.0801622831561273e-07, "loss": 0.0972, "step": 76200 }, { "epoch": 1.7769403398327066, "grad_norm": 3.3123104572296143, "learning_rate": 4.0793850649754394e-07, "loss": 0.106, "step": 76210 }, { "epoch": 1.7771735012095247, "grad_norm": 1.1779500246047974, "learning_rate": 4.078607846794752e-07, "loss": 0.1127, "step": 76220 }, { "epoch": 1.7774066625863427, "grad_norm": 1.4269412755966187, "learning_rate": 4.0778306286140647e-07, "loss": 0.1144, "step": 76230 }, { "epoch": 1.7776398239631606, "grad_norm": 1.968866229057312, 
"learning_rate": 4.077053410433376e-07, "loss": 0.1084, "step": 76240 }, { "epoch": 1.7778729853399784, "grad_norm": 1.019942045211792, "learning_rate": 4.076276192252689e-07, "loss": 0.1137, "step": 76250 }, { "epoch": 1.7781061467167962, "grad_norm": 1.8975390195846558, "learning_rate": 4.0754989740720015e-07, "loss": 0.1109, "step": 76260 }, { "epoch": 1.7783393080936143, "grad_norm": 2.565197467803955, "learning_rate": 4.074721755891314e-07, "loss": 0.1035, "step": 76270 }, { "epoch": 1.7785724694704323, "grad_norm": 2.2475318908691406, "learning_rate": 4.0739445377106257e-07, "loss": 0.1043, "step": 76280 }, { "epoch": 1.7788056308472502, "grad_norm": 1.2113336324691772, "learning_rate": 4.0731673195299383e-07, "loss": 0.1035, "step": 76290 }, { "epoch": 1.779038792224068, "grad_norm": 2.155576229095459, "learning_rate": 4.072390101349251e-07, "loss": 0.0866, "step": 76300 }, { "epoch": 1.7792719536008859, "grad_norm": 1.1627647876739502, "learning_rate": 4.071612883168563e-07, "loss": 0.1132, "step": 76310 }, { "epoch": 1.779505114977704, "grad_norm": 2.302452325820923, "learning_rate": 4.070835664987875e-07, "loss": 0.1028, "step": 76320 }, { "epoch": 1.779738276354522, "grad_norm": 1.512511968612671, "learning_rate": 4.0700584468071877e-07, "loss": 0.1092, "step": 76330 }, { "epoch": 1.7799714377313398, "grad_norm": 1.4379308223724365, "learning_rate": 4.0692812286265e-07, "loss": 0.0976, "step": 76340 }, { "epoch": 1.7802045991081576, "grad_norm": 1.405610203742981, "learning_rate": 4.0685040104458124e-07, "loss": 0.1038, "step": 76350 }, { "epoch": 1.7804377604849757, "grad_norm": 2.624485969543457, "learning_rate": 4.0677267922651245e-07, "loss": 0.1139, "step": 76360 }, { "epoch": 1.7806709218617935, "grad_norm": 2.9825820922851562, "learning_rate": 4.0669495740844366e-07, "loss": 0.1071, "step": 76370 }, { "epoch": 1.7809040832386116, "grad_norm": 1.5009130239486694, "learning_rate": 4.066172355903749e-07, "loss": 0.115, "step": 76380 }, { "epoch": 
1.7811372446154294, "grad_norm": 1.6244083642959595, "learning_rate": 4.065395137723062e-07, "loss": 0.0998, "step": 76390 }, { "epoch": 1.7813704059922473, "grad_norm": 1.2625606060028076, "learning_rate": 4.0646179195423734e-07, "loss": 0.1104, "step": 76400 }, { "epoch": 1.7816035673690653, "grad_norm": 1.5606547594070435, "learning_rate": 4.063840701361686e-07, "loss": 0.1033, "step": 76410 }, { "epoch": 1.7818367287458834, "grad_norm": 0.9124963283538818, "learning_rate": 4.0630634831809987e-07, "loss": 0.1119, "step": 76420 }, { "epoch": 1.7820698901227012, "grad_norm": 2.143470048904419, "learning_rate": 4.062286265000311e-07, "loss": 0.1036, "step": 76430 }, { "epoch": 1.782303051499519, "grad_norm": 1.066200613975525, "learning_rate": 4.061509046819623e-07, "loss": 0.1039, "step": 76440 }, { "epoch": 1.782536212876337, "grad_norm": 1.4590097665786743, "learning_rate": 4.0607318286389355e-07, "loss": 0.0996, "step": 76450 }, { "epoch": 1.782769374253155, "grad_norm": 1.265116572380066, "learning_rate": 4.0599546104582476e-07, "loss": 0.1047, "step": 76460 }, { "epoch": 1.783002535629973, "grad_norm": 2.9856433868408203, "learning_rate": 4.05917739227756e-07, "loss": 0.1102, "step": 76470 }, { "epoch": 1.7832356970067909, "grad_norm": 1.9259333610534668, "learning_rate": 4.0584001740968723e-07, "loss": 0.1016, "step": 76480 }, { "epoch": 1.7834688583836087, "grad_norm": 1.4142040014266968, "learning_rate": 4.0576229559161844e-07, "loss": 0.1091, "step": 76490 }, { "epoch": 1.7837020197604265, "grad_norm": 1.3538936376571655, "learning_rate": 4.056845737735497e-07, "loss": 0.1005, "step": 76500 }, { "epoch": 1.7839351811372446, "grad_norm": 3.337146759033203, "learning_rate": 4.0560685195548096e-07, "loss": 0.1031, "step": 76510 }, { "epoch": 1.7841683425140626, "grad_norm": 1.1711210012435913, "learning_rate": 4.055291301374121e-07, "loss": 0.1021, "step": 76520 }, { "epoch": 1.7844015038908805, "grad_norm": 1.1035184860229492, "learning_rate": 
4.054514083193434e-07, "loss": 0.1013, "step": 76530 }, { "epoch": 1.7846346652676983, "grad_norm": 1.1417226791381836, "learning_rate": 4.0537368650127464e-07, "loss": 0.1055, "step": 76540 }, { "epoch": 1.7848678266445164, "grad_norm": 2.208442211151123, "learning_rate": 4.0529596468320585e-07, "loss": 0.0998, "step": 76550 }, { "epoch": 1.7851009880213342, "grad_norm": 1.7052481174468994, "learning_rate": 4.0521824286513706e-07, "loss": 0.1094, "step": 76560 }, { "epoch": 1.7853341493981523, "grad_norm": 1.4142345190048218, "learning_rate": 4.051405210470683e-07, "loss": 0.1002, "step": 76570 }, { "epoch": 1.78556731077497, "grad_norm": 1.3323580026626587, "learning_rate": 4.0506279922899953e-07, "loss": 0.1052, "step": 76580 }, { "epoch": 1.785800472151788, "grad_norm": 1.629996657371521, "learning_rate": 4.049850774109308e-07, "loss": 0.1048, "step": 76590 }, { "epoch": 1.786033633528606, "grad_norm": 1.7303791046142578, "learning_rate": 4.04907355592862e-07, "loss": 0.0977, "step": 76600 }, { "epoch": 1.786266794905424, "grad_norm": 1.551845669746399, "learning_rate": 4.048296337747932e-07, "loss": 0.1002, "step": 76610 }, { "epoch": 1.786499956282242, "grad_norm": 1.7610726356506348, "learning_rate": 4.0475191195672447e-07, "loss": 0.1069, "step": 76620 }, { "epoch": 1.7867331176590597, "grad_norm": 1.7369040250778198, "learning_rate": 4.0467419013865574e-07, "loss": 0.096, "step": 76630 }, { "epoch": 1.7869662790358776, "grad_norm": 1.8366731405258179, "learning_rate": 4.045964683205869e-07, "loss": 0.0996, "step": 76640 }, { "epoch": 1.7871994404126956, "grad_norm": 1.8862682580947876, "learning_rate": 4.0451874650251815e-07, "loss": 0.1004, "step": 76650 }, { "epoch": 1.7874326017895137, "grad_norm": 1.1712892055511475, "learning_rate": 4.044410246844494e-07, "loss": 0.1017, "step": 76660 }, { "epoch": 1.7876657631663315, "grad_norm": 1.5427929162979126, "learning_rate": 4.043633028663806e-07, "loss": 0.1019, "step": 76670 }, { "epoch": 
1.7878989245431494, "grad_norm": 1.3051999807357788, "learning_rate": 4.0428558104831183e-07, "loss": 0.1049, "step": 76680 }, { "epoch": 1.7881320859199672, "grad_norm": 1.4661688804626465, "learning_rate": 4.042078592302431e-07, "loss": 0.1058, "step": 76690 }, { "epoch": 1.7883652472967853, "grad_norm": 1.4959335327148438, "learning_rate": 4.0413013741217436e-07, "loss": 0.1196, "step": 76700 }, { "epoch": 1.7885984086736033, "grad_norm": 1.5135002136230469, "learning_rate": 4.0405241559410557e-07, "loss": 0.1109, "step": 76710 }, { "epoch": 1.7888315700504211, "grad_norm": 1.7881624698638916, "learning_rate": 4.039746937760368e-07, "loss": 0.1068, "step": 76720 }, { "epoch": 1.789064731427239, "grad_norm": 1.5736210346221924, "learning_rate": 4.0389697195796804e-07, "loss": 0.1134, "step": 76730 }, { "epoch": 1.789297892804057, "grad_norm": 1.4699677228927612, "learning_rate": 4.0381925013989925e-07, "loss": 0.1084, "step": 76740 }, { "epoch": 1.789531054180875, "grad_norm": 2.2938356399536133, "learning_rate": 4.037415283218305e-07, "loss": 0.1123, "step": 76750 }, { "epoch": 1.789764215557693, "grad_norm": 1.8055486679077148, "learning_rate": 4.036638065037617e-07, "loss": 0.1084, "step": 76760 }, { "epoch": 1.7899973769345108, "grad_norm": 1.6224883794784546, "learning_rate": 4.0358608468569293e-07, "loss": 0.107, "step": 76770 }, { "epoch": 1.7902305383113286, "grad_norm": 0.9795041680335999, "learning_rate": 4.035083628676242e-07, "loss": 0.1016, "step": 76780 }, { "epoch": 1.7904636996881467, "grad_norm": 1.2281134128570557, "learning_rate": 4.0343064104955545e-07, "loss": 0.1035, "step": 76790 }, { "epoch": 1.7906968610649647, "grad_norm": 1.6445173025131226, "learning_rate": 4.033529192314866e-07, "loss": 0.1044, "step": 76800 }, { "epoch": 1.7909300224417826, "grad_norm": 2.94897723197937, "learning_rate": 4.0327519741341787e-07, "loss": 0.1175, "step": 76810 }, { "epoch": 1.7911631838186004, "grad_norm": 2.233229160308838, "learning_rate": 
4.0319747559534913e-07, "loss": 0.1074, "step": 76820 }, { "epoch": 1.7913963451954182, "grad_norm": 1.2763314247131348, "learning_rate": 4.0311975377728034e-07, "loss": 0.0983, "step": 76830 }, { "epoch": 1.7916295065722363, "grad_norm": 2.3255059719085693, "learning_rate": 4.0304203195921155e-07, "loss": 0.1007, "step": 76840 }, { "epoch": 1.7918626679490544, "grad_norm": 1.7687993049621582, "learning_rate": 4.029643101411428e-07, "loss": 0.1041, "step": 76850 }, { "epoch": 1.7920958293258722, "grad_norm": 1.6346476078033447, "learning_rate": 4.02886588323074e-07, "loss": 0.1077, "step": 76860 }, { "epoch": 1.79232899070269, "grad_norm": 1.7153031826019287, "learning_rate": 4.028088665050053e-07, "loss": 0.1218, "step": 76870 }, { "epoch": 1.7925621520795079, "grad_norm": 1.5212886333465576, "learning_rate": 4.027311446869365e-07, "loss": 0.1079, "step": 76880 }, { "epoch": 1.792795313456326, "grad_norm": 2.188469648361206, "learning_rate": 4.026534228688677e-07, "loss": 0.1061, "step": 76890 }, { "epoch": 1.793028474833144, "grad_norm": 1.2706516981124878, "learning_rate": 4.0257570105079897e-07, "loss": 0.1079, "step": 76900 }, { "epoch": 1.7932616362099618, "grad_norm": 1.3742313385009766, "learning_rate": 4.0249797923273023e-07, "loss": 0.111, "step": 76910 }, { "epoch": 1.7934947975867797, "grad_norm": 2.247795820236206, "learning_rate": 4.024202574146614e-07, "loss": 0.1072, "step": 76920 }, { "epoch": 1.7937279589635977, "grad_norm": 3.7241430282592773, "learning_rate": 4.0234253559659265e-07, "loss": 0.12, "step": 76930 }, { "epoch": 1.7939611203404158, "grad_norm": 1.1678718328475952, "learning_rate": 4.022648137785239e-07, "loss": 0.116, "step": 76940 }, { "epoch": 1.7941942817172336, "grad_norm": 1.4425781965255737, "learning_rate": 4.021870919604551e-07, "loss": 0.1093, "step": 76950 }, { "epoch": 1.7944274430940514, "grad_norm": 1.1251094341278076, "learning_rate": 4.0210937014238633e-07, "loss": 0.1021, "step": 76960 }, { "epoch": 
1.7946606044708693, "grad_norm": 4.019096374511719, "learning_rate": 4.020316483243176e-07, "loss": 0.0961, "step": 76970 }, { "epoch": 1.7948937658476873, "grad_norm": 1.225907564163208, "learning_rate": 4.019539265062488e-07, "loss": 0.0983, "step": 76980 }, { "epoch": 1.7951269272245054, "grad_norm": 1.7387150526046753, "learning_rate": 4.0187620468818006e-07, "loss": 0.1067, "step": 76990 }, { "epoch": 1.7953600886013232, "grad_norm": 1.6260994672775269, "learning_rate": 4.0179848287011127e-07, "loss": 0.0997, "step": 77000 }, { "epoch": 1.795593249978141, "grad_norm": 2.4974896907806396, "learning_rate": 4.017207610520425e-07, "loss": 0.1033, "step": 77010 }, { "epoch": 1.795826411354959, "grad_norm": 1.166694164276123, "learning_rate": 4.0164303923397374e-07, "loss": 0.0954, "step": 77020 }, { "epoch": 1.796059572731777, "grad_norm": 1.774146556854248, "learning_rate": 4.01565317415905e-07, "loss": 0.1219, "step": 77030 }, { "epoch": 1.796292734108595, "grad_norm": 2.107866048812866, "learning_rate": 4.0148759559783616e-07, "loss": 0.1164, "step": 77040 }, { "epoch": 1.7965258954854129, "grad_norm": 1.132004976272583, "learning_rate": 4.014098737797674e-07, "loss": 0.1059, "step": 77050 }, { "epoch": 1.7967590568622307, "grad_norm": 1.6760892868041992, "learning_rate": 4.013321519616987e-07, "loss": 0.1034, "step": 77060 }, { "epoch": 1.7969922182390488, "grad_norm": 1.449809193611145, "learning_rate": 4.012544301436299e-07, "loss": 0.1115, "step": 77070 }, { "epoch": 1.7972253796158666, "grad_norm": 4.191474914550781, "learning_rate": 4.011767083255611e-07, "loss": 0.1068, "step": 77080 }, { "epoch": 1.7974585409926847, "grad_norm": 2.2176594734191895, "learning_rate": 4.0109898650749236e-07, "loss": 0.1013, "step": 77090 }, { "epoch": 1.7976917023695025, "grad_norm": 1.9244499206542969, "learning_rate": 4.010212646894236e-07, "loss": 0.1026, "step": 77100 }, { "epoch": 1.7979248637463203, "grad_norm": 1.2736343145370483, "learning_rate": 
4.0094354287135484e-07, "loss": 0.1181, "step": 77110 }, { "epoch": 1.7981580251231384, "grad_norm": 1.2326935529708862, "learning_rate": 4.0086582105328605e-07, "loss": 0.0958, "step": 77120 }, { "epoch": 1.7983911864999564, "grad_norm": 1.5353256464004517, "learning_rate": 4.0078809923521725e-07, "loss": 0.1023, "step": 77130 }, { "epoch": 1.7986243478767743, "grad_norm": 1.6590946912765503, "learning_rate": 4.007103774171485e-07, "loss": 0.1063, "step": 77140 }, { "epoch": 1.7988575092535921, "grad_norm": 1.246121883392334, "learning_rate": 4.006326555990798e-07, "loss": 0.1207, "step": 77150 }, { "epoch": 1.79909067063041, "grad_norm": 1.7076722383499146, "learning_rate": 4.00554933781011e-07, "loss": 0.1229, "step": 77160 }, { "epoch": 1.799323832007228, "grad_norm": 1.4157010316848755, "learning_rate": 4.004772119629422e-07, "loss": 0.0949, "step": 77170 }, { "epoch": 1.799556993384046, "grad_norm": 3.4157590866088867, "learning_rate": 4.0039949014487346e-07, "loss": 0.107, "step": 77180 }, { "epoch": 1.799790154760864, "grad_norm": 2.1584765911102295, "learning_rate": 4.003217683268047e-07, "loss": 0.1092, "step": 77190 }, { "epoch": 1.8000233161376817, "grad_norm": 3.1637611389160156, "learning_rate": 4.002440465087359e-07, "loss": 0.1024, "step": 77200 }, { "epoch": 1.8002564775144996, "grad_norm": 1.3860046863555908, "learning_rate": 4.0016632469066714e-07, "loss": 0.1004, "step": 77210 }, { "epoch": 1.8004896388913176, "grad_norm": 1.2535412311553955, "learning_rate": 4.000886028725984e-07, "loss": 0.0996, "step": 77220 }, { "epoch": 1.8007228002681357, "grad_norm": 2.487497091293335, "learning_rate": 4.000108810545296e-07, "loss": 0.1036, "step": 77230 }, { "epoch": 1.8009559616449535, "grad_norm": 1.4172627925872803, "learning_rate": 3.999331592364608e-07, "loss": 0.1027, "step": 77240 }, { "epoch": 1.8011891230217714, "grad_norm": 1.491362452507019, "learning_rate": 3.998554374183921e-07, "loss": 0.1064, "step": 77250 }, { "epoch": 1.8014222843985894, 
"grad_norm": 1.6732630729675293, "learning_rate": 3.997777156003233e-07, "loss": 0.0999, "step": 77260 }, { "epoch": 1.8016554457754073, "grad_norm": 1.3790122270584106, "learning_rate": 3.9969999378225455e-07, "loss": 0.1154, "step": 77270 }, { "epoch": 1.8018886071522253, "grad_norm": 1.5248442888259888, "learning_rate": 3.9962227196418576e-07, "loss": 0.1005, "step": 77280 }, { "epoch": 1.8021217685290432, "grad_norm": 1.7933634519577026, "learning_rate": 3.9954455014611697e-07, "loss": 0.1016, "step": 77290 }, { "epoch": 1.802354929905861, "grad_norm": 1.139348030090332, "learning_rate": 3.9946682832804823e-07, "loss": 0.1085, "step": 77300 }, { "epoch": 1.802588091282679, "grad_norm": 1.1483771800994873, "learning_rate": 3.993891065099795e-07, "loss": 0.0946, "step": 77310 }, { "epoch": 1.802821252659497, "grad_norm": 1.4148143529891968, "learning_rate": 3.9931138469191065e-07, "loss": 0.1065, "step": 77320 }, { "epoch": 1.803054414036315, "grad_norm": 1.224717378616333, "learning_rate": 3.992336628738419e-07, "loss": 0.1067, "step": 77330 }, { "epoch": 1.8032875754131328, "grad_norm": 1.7659955024719238, "learning_rate": 3.991559410557732e-07, "loss": 0.1034, "step": 77340 }, { "epoch": 1.8035207367899506, "grad_norm": 1.162883996963501, "learning_rate": 3.990782192377044e-07, "loss": 0.1062, "step": 77350 }, { "epoch": 1.8037538981667687, "grad_norm": 1.4066195487976074, "learning_rate": 3.990004974196356e-07, "loss": 0.1068, "step": 77360 }, { "epoch": 1.8039870595435867, "grad_norm": 1.9035778045654297, "learning_rate": 3.9892277560156686e-07, "loss": 0.1124, "step": 77370 }, { "epoch": 1.8042202209204046, "grad_norm": 3.0265069007873535, "learning_rate": 3.9884505378349807e-07, "loss": 0.1016, "step": 77380 }, { "epoch": 1.8044533822972224, "grad_norm": 1.5009284019470215, "learning_rate": 3.9876733196542933e-07, "loss": 0.1067, "step": 77390 }, { "epoch": 1.8046865436740402, "grad_norm": 1.3352327346801758, "learning_rate": 3.9868961014736054e-07, 
"loss": 0.1095, "step": 77400 }, { "epoch": 1.8049197050508583, "grad_norm": 2.0078959465026855, "learning_rate": 3.9861188832929175e-07, "loss": 0.0949, "step": 77410 }, { "epoch": 1.8051528664276764, "grad_norm": 2.298975706100464, "learning_rate": 3.98534166511223e-07, "loss": 0.1133, "step": 77420 }, { "epoch": 1.8053860278044942, "grad_norm": 1.4147593975067139, "learning_rate": 3.9845644469315427e-07, "loss": 0.1074, "step": 77430 }, { "epoch": 1.805619189181312, "grad_norm": 2.2478621006011963, "learning_rate": 3.9837872287508543e-07, "loss": 0.1133, "step": 77440 }, { "epoch": 1.80585235055813, "grad_norm": 1.416150450706482, "learning_rate": 3.983010010570167e-07, "loss": 0.1082, "step": 77450 }, { "epoch": 1.806085511934948, "grad_norm": 3.057803153991699, "learning_rate": 3.9822327923894795e-07, "loss": 0.1004, "step": 77460 }, { "epoch": 1.806318673311766, "grad_norm": 2.1478419303894043, "learning_rate": 3.98153329602686e-07, "loss": 0.1084, "step": 77470 }, { "epoch": 1.8065518346885838, "grad_norm": 3.127131223678589, "learning_rate": 3.980756077846173e-07, "loss": 0.103, "step": 77480 }, { "epoch": 1.8067849960654017, "grad_norm": 2.140002489089966, "learning_rate": 3.979978859665485e-07, "loss": 0.1138, "step": 77490 }, { "epoch": 1.8070181574422197, "grad_norm": 3.6147656440734863, "learning_rate": 3.979201641484797e-07, "loss": 0.113, "step": 77500 }, { "epoch": 1.8072513188190378, "grad_norm": 1.2935000658035278, "learning_rate": 3.9784244233041097e-07, "loss": 0.1035, "step": 77510 }, { "epoch": 1.8074844801958556, "grad_norm": 1.3094844818115234, "learning_rate": 3.9776472051234223e-07, "loss": 0.1077, "step": 77520 }, { "epoch": 1.8077176415726735, "grad_norm": 1.4108740091323853, "learning_rate": 3.976869986942734e-07, "loss": 0.1052, "step": 77530 }, { "epoch": 1.8079508029494913, "grad_norm": 2.6458120346069336, "learning_rate": 3.9760927687620465e-07, "loss": 0.101, "step": 77540 }, { "epoch": 1.8081839643263093, "grad_norm": 
1.4683305025100708, "learning_rate": 3.975315550581359e-07, "loss": 0.1165, "step": 77550 }, { "epoch": 1.8084171257031274, "grad_norm": 2.082432508468628, "learning_rate": 3.9745383324006717e-07, "loss": 0.1077, "step": 77560 }, { "epoch": 1.8086502870799452, "grad_norm": 2.8340766429901123, "learning_rate": 3.9737611142199833e-07, "loss": 0.1119, "step": 77570 }, { "epoch": 1.808883448456763, "grad_norm": 1.417171835899353, "learning_rate": 3.972983896039296e-07, "loss": 0.1019, "step": 77580 }, { "epoch": 1.809116609833581, "grad_norm": 1.4426183700561523, "learning_rate": 3.9722066778586085e-07, "loss": 0.1134, "step": 77590 }, { "epoch": 1.809349771210399, "grad_norm": 1.9555033445358276, "learning_rate": 3.9714294596779206e-07, "loss": 0.1017, "step": 77600 }, { "epoch": 1.809582932587217, "grad_norm": 1.4218487739562988, "learning_rate": 3.9706522414972327e-07, "loss": 0.1083, "step": 77610 }, { "epoch": 1.8098160939640349, "grad_norm": 1.33453369140625, "learning_rate": 3.9698750233165453e-07, "loss": 0.102, "step": 77620 }, { "epoch": 1.8100492553408527, "grad_norm": 2.105928421020508, "learning_rate": 3.9690978051358574e-07, "loss": 0.0884, "step": 77630 }, { "epoch": 1.8102824167176708, "grad_norm": 1.6713042259216309, "learning_rate": 3.96832058695517e-07, "loss": 0.1102, "step": 77640 }, { "epoch": 1.8105155780944886, "grad_norm": 1.8857425451278687, "learning_rate": 3.967543368774482e-07, "loss": 0.1077, "step": 77650 }, { "epoch": 1.8107487394713067, "grad_norm": 1.2890450954437256, "learning_rate": 3.966766150593794e-07, "loss": 0.1144, "step": 77660 }, { "epoch": 1.8109819008481245, "grad_norm": 1.6066865921020508, "learning_rate": 3.965988932413107e-07, "loss": 0.1057, "step": 77670 }, { "epoch": 1.8112150622249423, "grad_norm": 1.3718053102493286, "learning_rate": 3.9652117142324195e-07, "loss": 0.1087, "step": 77680 }, { "epoch": 1.8114482236017604, "grad_norm": 1.4179389476776123, "learning_rate": 3.964434496051731e-07, "loss": 0.1091, "step": 
77690 }, { "epoch": 1.8116813849785784, "grad_norm": 2.1088755130767822, "learning_rate": 3.9636572778710437e-07, "loss": 0.1062, "step": 77700 }, { "epoch": 1.8119145463553963, "grad_norm": 1.641137957572937, "learning_rate": 3.9628800596903563e-07, "loss": 0.1047, "step": 77710 }, { "epoch": 1.8121477077322141, "grad_norm": 1.2026715278625488, "learning_rate": 3.9621028415096684e-07, "loss": 0.1066, "step": 77720 }, { "epoch": 1.812380869109032, "grad_norm": 1.495116114616394, "learning_rate": 3.9613256233289805e-07, "loss": 0.1001, "step": 77730 }, { "epoch": 1.81261403048585, "grad_norm": 1.369545817375183, "learning_rate": 3.960548405148293e-07, "loss": 0.1095, "step": 77740 }, { "epoch": 1.812847191862668, "grad_norm": 1.14570152759552, "learning_rate": 3.959771186967605e-07, "loss": 0.1129, "step": 77750 }, { "epoch": 1.813080353239486, "grad_norm": 1.765089511871338, "learning_rate": 3.9590716906049864e-07, "loss": 0.1172, "step": 77760 }, { "epoch": 1.8133135146163037, "grad_norm": 1.659517526626587, "learning_rate": 3.958294472424299e-07, "loss": 0.1119, "step": 77770 }, { "epoch": 1.8135466759931216, "grad_norm": 1.3621565103530884, "learning_rate": 3.9575172542436106e-07, "loss": 0.1069, "step": 77780 }, { "epoch": 1.8137798373699396, "grad_norm": 1.5708022117614746, "learning_rate": 3.956740036062923e-07, "loss": 0.119, "step": 77790 }, { "epoch": 1.8140129987467577, "grad_norm": 1.5860085487365723, "learning_rate": 3.955962817882236e-07, "loss": 0.1085, "step": 77800 }, { "epoch": 1.8142461601235755, "grad_norm": 2.573317527770996, "learning_rate": 3.955185599701548e-07, "loss": 0.1015, "step": 77810 }, { "epoch": 1.8144793215003934, "grad_norm": 1.497819423675537, "learning_rate": 3.95440838152086e-07, "loss": 0.108, "step": 77820 }, { "epoch": 1.8147124828772114, "grad_norm": 1.5152311325073242, "learning_rate": 3.9536311633401727e-07, "loss": 0.1121, "step": 77830 }, { "epoch": 1.8149456442540293, "grad_norm": 1.6511523723602295, "learning_rate": 
3.952853945159485e-07, "loss": 0.1036, "step": 77840 }, { "epoch": 1.8151788056308473, "grad_norm": 1.3883638381958008, "learning_rate": 3.9520767269787974e-07, "loss": 0.1028, "step": 77850 }, { "epoch": 1.8154119670076652, "grad_norm": 1.866905689239502, "learning_rate": 3.9512995087981095e-07, "loss": 0.1182, "step": 77860 }, { "epoch": 1.815645128384483, "grad_norm": 2.4934401512145996, "learning_rate": 3.9505222906174216e-07, "loss": 0.0985, "step": 77870 }, { "epoch": 1.815878289761301, "grad_norm": 1.5924911499023438, "learning_rate": 3.949745072436734e-07, "loss": 0.0957, "step": 77880 }, { "epoch": 1.8161114511381191, "grad_norm": 1.4643934965133667, "learning_rate": 3.948967854256047e-07, "loss": 0.098, "step": 77890 }, { "epoch": 1.816344612514937, "grad_norm": 1.2212796211242676, "learning_rate": 3.9481906360753584e-07, "loss": 0.1046, "step": 77900 }, { "epoch": 1.8165777738917548, "grad_norm": 2.03808331489563, "learning_rate": 3.947413417894671e-07, "loss": 0.0989, "step": 77910 }, { "epoch": 1.8168109352685726, "grad_norm": 1.1314932107925415, "learning_rate": 3.9466361997139836e-07, "loss": 0.1125, "step": 77920 }, { "epoch": 1.8170440966453907, "grad_norm": 1.4626119136810303, "learning_rate": 3.945858981533296e-07, "loss": 0.1109, "step": 77930 }, { "epoch": 1.8172772580222087, "grad_norm": 1.9151540994644165, "learning_rate": 3.945081763352608e-07, "loss": 0.0971, "step": 77940 }, { "epoch": 1.8175104193990266, "grad_norm": 1.357000470161438, "learning_rate": 3.9443045451719204e-07, "loss": 0.1072, "step": 77950 }, { "epoch": 1.8177435807758444, "grad_norm": 3.8920083045959473, "learning_rate": 3.943527326991233e-07, "loss": 0.1127, "step": 77960 }, { "epoch": 1.8179767421526623, "grad_norm": 2.9831390380859375, "learning_rate": 3.942750108810545e-07, "loss": 0.1055, "step": 77970 }, { "epoch": 1.8182099035294803, "grad_norm": 1.6185094118118286, "learning_rate": 3.941972890629857e-07, "loss": 0.1133, "step": 77980 }, { "epoch": 
1.8184430649062984, "grad_norm": 1.9005120992660522, "learning_rate": 3.94119567244917e-07, "loss": 0.1102, "step": 77990 }, { "epoch": 1.8186762262831162, "grad_norm": 1.2247343063354492, "learning_rate": 3.940418454268482e-07, "loss": 0.112, "step": 78000 }, { "epoch": 1.818909387659934, "grad_norm": 1.1361109018325806, "learning_rate": 3.9396412360877946e-07, "loss": 0.1079, "step": 78010 }, { "epoch": 1.819142549036752, "grad_norm": 2.06864333152771, "learning_rate": 3.9388640179071067e-07, "loss": 0.1126, "step": 78020 }, { "epoch": 1.8193757104135702, "grad_norm": 1.7945894002914429, "learning_rate": 3.938086799726419e-07, "loss": 0.1228, "step": 78030 }, { "epoch": 1.819608871790388, "grad_norm": 1.9121917486190796, "learning_rate": 3.9373095815457314e-07, "loss": 0.1071, "step": 78040 }, { "epoch": 1.8198420331672058, "grad_norm": 1.4596549272537231, "learning_rate": 3.936532363365044e-07, "loss": 0.1154, "step": 78050 }, { "epoch": 1.8200751945440237, "grad_norm": 1.1618101596832275, "learning_rate": 3.935755145184356e-07, "loss": 0.1131, "step": 78060 }, { "epoch": 1.8203083559208417, "grad_norm": 1.2535812854766846, "learning_rate": 3.934977927003668e-07, "loss": 0.1031, "step": 78070 }, { "epoch": 1.8205415172976598, "grad_norm": 2.059997797012329, "learning_rate": 3.934200708822981e-07, "loss": 0.0983, "step": 78080 }, { "epoch": 1.8207746786744776, "grad_norm": 1.787495493888855, "learning_rate": 3.933423490642293e-07, "loss": 0.1145, "step": 78090 }, { "epoch": 1.8210078400512955, "grad_norm": 2.6407079696655273, "learning_rate": 3.9326462724616055e-07, "loss": 0.1066, "step": 78100 }, { "epoch": 1.8212410014281133, "grad_norm": 2.727185010910034, "learning_rate": 3.9318690542809176e-07, "loss": 0.1059, "step": 78110 }, { "epoch": 1.8214741628049314, "grad_norm": 1.4547120332717896, "learning_rate": 3.9310918361002297e-07, "loss": 0.1098, "step": 78120 }, { "epoch": 1.8217073241817494, "grad_norm": 1.225845456123352, "learning_rate": 
3.9303146179195423e-07, "loss": 0.1143, "step": 78130 }, { "epoch": 1.8219404855585672, "grad_norm": 1.50600004196167, "learning_rate": 3.929537399738855e-07, "loss": 0.1066, "step": 78140 }, { "epoch": 1.822173646935385, "grad_norm": 3.076704502105713, "learning_rate": 3.9287601815581665e-07, "loss": 0.1018, "step": 78150 }, { "epoch": 1.822406808312203, "grad_norm": 1.3811156749725342, "learning_rate": 3.927982963377479e-07, "loss": 0.0909, "step": 78160 }, { "epoch": 1.822639969689021, "grad_norm": 1.5802925825119019, "learning_rate": 3.927205745196792e-07, "loss": 0.116, "step": 78170 }, { "epoch": 1.822873131065839, "grad_norm": 1.5361405611038208, "learning_rate": 3.926428527016104e-07, "loss": 0.1119, "step": 78180 }, { "epoch": 1.8231062924426569, "grad_norm": 1.181502103805542, "learning_rate": 3.925651308835416e-07, "loss": 0.1043, "step": 78190 }, { "epoch": 1.8233394538194747, "grad_norm": 1.6120243072509766, "learning_rate": 3.9248740906547285e-07, "loss": 0.112, "step": 78200 }, { "epoch": 1.8235726151962928, "grad_norm": 1.2768913507461548, "learning_rate": 3.9240968724740406e-07, "loss": 0.1033, "step": 78210 }, { "epoch": 1.8238057765731108, "grad_norm": 2.4280004501342773, "learning_rate": 3.923319654293353e-07, "loss": 0.1266, "step": 78220 }, { "epoch": 1.8240389379499287, "grad_norm": 1.0726814270019531, "learning_rate": 3.9225424361126654e-07, "loss": 0.0996, "step": 78230 }, { "epoch": 1.8242720993267465, "grad_norm": 2.1991772651672363, "learning_rate": 3.9217652179319774e-07, "loss": 0.1049, "step": 78240 }, { "epoch": 1.8245052607035643, "grad_norm": 2.723250150680542, "learning_rate": 3.92098799975129e-07, "loss": 0.1014, "step": 78250 }, { "epoch": 1.8247384220803824, "grad_norm": 1.0665644407272339, "learning_rate": 3.9202107815706027e-07, "loss": 0.1012, "step": 78260 }, { "epoch": 1.8249715834572005, "grad_norm": 1.0325586795806885, "learning_rate": 3.919433563389914e-07, "loss": 0.0914, "step": 78270 }, { "epoch": 1.8252047448340183, 
"grad_norm": 1.234505295753479, "learning_rate": 3.918656345209227e-07, "loss": 0.1054, "step": 78280 }, { "epoch": 1.8254379062108361, "grad_norm": 1.4085787534713745, "learning_rate": 3.9178791270285395e-07, "loss": 0.101, "step": 78290 }, { "epoch": 1.825671067587654, "grad_norm": 2.5413529872894287, "learning_rate": 3.917101908847852e-07, "loss": 0.1018, "step": 78300 }, { "epoch": 1.825904228964472, "grad_norm": 3.005387783050537, "learning_rate": 3.9163246906671637e-07, "loss": 0.1095, "step": 78310 }, { "epoch": 1.82613739034129, "grad_norm": 2.0460195541381836, "learning_rate": 3.9155474724864763e-07, "loss": 0.1099, "step": 78320 }, { "epoch": 1.826370551718108, "grad_norm": 1.3006054162979126, "learning_rate": 3.914770254305789e-07, "loss": 0.1064, "step": 78330 }, { "epoch": 1.8266037130949258, "grad_norm": 3.175595283508301, "learning_rate": 3.913993036125101e-07, "loss": 0.092, "step": 78340 }, { "epoch": 1.8268368744717438, "grad_norm": 2.1326236724853516, "learning_rate": 3.913215817944413e-07, "loss": 0.1108, "step": 78350 }, { "epoch": 1.8270700358485616, "grad_norm": 1.4959012269973755, "learning_rate": 3.9124385997637257e-07, "loss": 0.1045, "step": 78360 }, { "epoch": 1.8273031972253797, "grad_norm": 1.7600805759429932, "learning_rate": 3.911661381583038e-07, "loss": 0.1045, "step": 78370 }, { "epoch": 1.8275363586021975, "grad_norm": 1.707014560699463, "learning_rate": 3.9108841634023504e-07, "loss": 0.102, "step": 78380 }, { "epoch": 1.8277695199790154, "grad_norm": 2.1375961303710938, "learning_rate": 3.9101069452216625e-07, "loss": 0.092, "step": 78390 }, { "epoch": 1.8280026813558334, "grad_norm": 1.9103562831878662, "learning_rate": 3.9093297270409746e-07, "loss": 0.1022, "step": 78400 }, { "epoch": 1.8282358427326515, "grad_norm": 1.034134030342102, "learning_rate": 3.908552508860287e-07, "loss": 0.0998, "step": 78410 }, { "epoch": 1.8284690041094693, "grad_norm": 2.197906017303467, "learning_rate": 3.9077752906796e-07, "loss": 0.0975, 
"step": 78420 }, { "epoch": 1.8287021654862872, "grad_norm": 1.3308050632476807, "learning_rate": 3.9069980724989114e-07, "loss": 0.1043, "step": 78430 }, { "epoch": 1.828935326863105, "grad_norm": 1.5755491256713867, "learning_rate": 3.906220854318224e-07, "loss": 0.1081, "step": 78440 }, { "epoch": 1.829168488239923, "grad_norm": 1.3183363676071167, "learning_rate": 3.9054436361375367e-07, "loss": 0.1036, "step": 78450 }, { "epoch": 1.8294016496167411, "grad_norm": 1.3056813478469849, "learning_rate": 3.904666417956849e-07, "loss": 0.1055, "step": 78460 }, { "epoch": 1.829634810993559, "grad_norm": 1.537489891052246, "learning_rate": 3.903889199776161e-07, "loss": 0.1052, "step": 78470 }, { "epoch": 1.8298679723703768, "grad_norm": 2.1914713382720947, "learning_rate": 3.9031119815954735e-07, "loss": 0.1019, "step": 78480 }, { "epoch": 1.8301011337471946, "grad_norm": 1.3376941680908203, "learning_rate": 3.9023347634147856e-07, "loss": 0.1089, "step": 78490 }, { "epoch": 1.8303342951240127, "grad_norm": 3.5249390602111816, "learning_rate": 3.901557545234098e-07, "loss": 0.1087, "step": 78500 }, { "epoch": 1.8305674565008307, "grad_norm": 2.216219902038574, "learning_rate": 3.9007803270534103e-07, "loss": 0.1025, "step": 78510 }, { "epoch": 1.8308006178776486, "grad_norm": 1.2649188041687012, "learning_rate": 3.9000031088727224e-07, "loss": 0.1019, "step": 78520 }, { "epoch": 1.8310337792544664, "grad_norm": 2.611509323120117, "learning_rate": 3.899225890692035e-07, "loss": 0.1051, "step": 78530 }, { "epoch": 1.8312669406312845, "grad_norm": 1.649698257446289, "learning_rate": 3.8984486725113476e-07, "loss": 0.112, "step": 78540 }, { "epoch": 1.8315001020081023, "grad_norm": 2.5517067909240723, "learning_rate": 3.897671454330659e-07, "loss": 0.107, "step": 78550 }, { "epoch": 1.8317332633849204, "grad_norm": 3.2236037254333496, "learning_rate": 3.896894236149972e-07, "loss": 0.1004, "step": 78560 }, { "epoch": 1.8319664247617382, "grad_norm": 2.6465675830841064, 
"learning_rate": 3.8961170179692844e-07, "loss": 0.1051, "step": 78570 }, { "epoch": 1.832199586138556, "grad_norm": 1.3407303094863892, "learning_rate": 3.8953397997885965e-07, "loss": 0.108, "step": 78580 }, { "epoch": 1.832432747515374, "grad_norm": 1.2575414180755615, "learning_rate": 3.8945625816079086e-07, "loss": 0.0955, "step": 78590 }, { "epoch": 1.8326659088921922, "grad_norm": 3.0903098583221436, "learning_rate": 3.893785363427221e-07, "loss": 0.1069, "step": 78600 }, { "epoch": 1.83289907026901, "grad_norm": 1.2207376956939697, "learning_rate": 3.8930081452465333e-07, "loss": 0.1086, "step": 78610 }, { "epoch": 1.8331322316458278, "grad_norm": 2.217611312866211, "learning_rate": 3.892230927065846e-07, "loss": 0.1066, "step": 78620 }, { "epoch": 1.8333653930226457, "grad_norm": 1.9326213598251343, "learning_rate": 3.891453708885158e-07, "loss": 0.1102, "step": 78630 }, { "epoch": 1.8335985543994637, "grad_norm": 1.2234028577804565, "learning_rate": 3.89067649070447e-07, "loss": 0.1157, "step": 78640 }, { "epoch": 1.8338317157762818, "grad_norm": 1.7947288751602173, "learning_rate": 3.889899272523783e-07, "loss": 0.1122, "step": 78650 }, { "epoch": 1.8340648771530996, "grad_norm": 1.4702813625335693, "learning_rate": 3.8891220543430954e-07, "loss": 0.1099, "step": 78660 }, { "epoch": 1.8342980385299175, "grad_norm": 1.72555410861969, "learning_rate": 3.888344836162407e-07, "loss": 0.114, "step": 78670 }, { "epoch": 1.8345311999067353, "grad_norm": 1.115950584411621, "learning_rate": 3.8875676179817196e-07, "loss": 0.1054, "step": 78680 }, { "epoch": 1.8347643612835534, "grad_norm": 1.4722355604171753, "learning_rate": 3.886790399801032e-07, "loss": 0.1074, "step": 78690 }, { "epoch": 1.8349975226603714, "grad_norm": 1.4927667379379272, "learning_rate": 3.8860131816203443e-07, "loss": 0.11, "step": 78700 }, { "epoch": 1.8352306840371893, "grad_norm": 1.1799900531768799, "learning_rate": 3.8852359634396564e-07, "loss": 0.0973, "step": 78710 }, { "epoch": 
1.835463845414007, "grad_norm": 1.24822199344635, "learning_rate": 3.884458745258969e-07, "loss": 0.1104, "step": 78720 }, { "epoch": 1.8356970067908251, "grad_norm": 1.3254908323287964, "learning_rate": 3.883681527078281e-07, "loss": 0.1019, "step": 78730 }, { "epoch": 1.835930168167643, "grad_norm": 1.3887534141540527, "learning_rate": 3.8829043088975937e-07, "loss": 0.1092, "step": 78740 }, { "epoch": 1.836163329544461, "grad_norm": 1.9701703786849976, "learning_rate": 3.882127090716906e-07, "loss": 0.1115, "step": 78750 }, { "epoch": 1.8363964909212789, "grad_norm": 1.2008802890777588, "learning_rate": 3.8813498725362184e-07, "loss": 0.107, "step": 78760 }, { "epoch": 1.8366296522980967, "grad_norm": 1.2080373764038086, "learning_rate": 3.8805726543555305e-07, "loss": 0.1006, "step": 78770 }, { "epoch": 1.8368628136749148, "grad_norm": 1.6334583759307861, "learning_rate": 3.879795436174843e-07, "loss": 0.1016, "step": 78780 }, { "epoch": 1.8370959750517328, "grad_norm": 0.9835147857666016, "learning_rate": 3.879018217994155e-07, "loss": 0.0896, "step": 78790 }, { "epoch": 1.8373291364285507, "grad_norm": 1.0867629051208496, "learning_rate": 3.8782409998134673e-07, "loss": 0.1066, "step": 78800 }, { "epoch": 1.8375622978053685, "grad_norm": 1.4931519031524658, "learning_rate": 3.87746378163278e-07, "loss": 0.1054, "step": 78810 }, { "epoch": 1.8377954591821863, "grad_norm": 1.97434401512146, "learning_rate": 3.8766865634520926e-07, "loss": 0.0985, "step": 78820 }, { "epoch": 1.8380286205590044, "grad_norm": 1.3037817478179932, "learning_rate": 3.875909345271404e-07, "loss": 0.1114, "step": 78830 }, { "epoch": 1.8382617819358225, "grad_norm": 1.8955014944076538, "learning_rate": 3.875132127090717e-07, "loss": 0.1075, "step": 78840 }, { "epoch": 1.8384949433126403, "grad_norm": 1.3742504119873047, "learning_rate": 3.8743549089100294e-07, "loss": 0.0873, "step": 78850 }, { "epoch": 1.8387281046894581, "grad_norm": 1.2008357048034668, "learning_rate": 
3.8735776907293415e-07, "loss": 0.0945, "step": 78860 }, { "epoch": 1.838961266066276, "grad_norm": 3.1277732849121094, "learning_rate": 3.8728004725486535e-07, "loss": 0.1182, "step": 78870 }, { "epoch": 1.839194427443094, "grad_norm": 1.097679853439331, "learning_rate": 3.872023254367966e-07, "loss": 0.1112, "step": 78880 }, { "epoch": 1.839427588819912, "grad_norm": 1.9318456649780273, "learning_rate": 3.871246036187278e-07, "loss": 0.1019, "step": 78890 }, { "epoch": 1.83966075019673, "grad_norm": 3.259610652923584, "learning_rate": 3.870468818006591e-07, "loss": 0.1103, "step": 78900 }, { "epoch": 1.8398939115735478, "grad_norm": 1.6061954498291016, "learning_rate": 3.869691599825903e-07, "loss": 0.1072, "step": 78910 }, { "epoch": 1.8401270729503658, "grad_norm": 1.695234775543213, "learning_rate": 3.868914381645215e-07, "loss": 0.1065, "step": 78920 }, { "epoch": 1.8403602343271837, "grad_norm": 1.5161106586456299, "learning_rate": 3.8681371634645277e-07, "loss": 0.1183, "step": 78930 }, { "epoch": 1.8405933957040017, "grad_norm": 1.6655490398406982, "learning_rate": 3.8673599452838403e-07, "loss": 0.1101, "step": 78940 }, { "epoch": 1.8408265570808195, "grad_norm": 1.1400071382522583, "learning_rate": 3.866582727103152e-07, "loss": 0.098, "step": 78950 }, { "epoch": 1.8410597184576374, "grad_norm": 2.8823657035827637, "learning_rate": 3.8658055089224645e-07, "loss": 0.1104, "step": 78960 }, { "epoch": 1.8412928798344554, "grad_norm": 1.2459802627563477, "learning_rate": 3.865028290741777e-07, "loss": 0.1214, "step": 78970 }, { "epoch": 1.8415260412112735, "grad_norm": 1.3306807279586792, "learning_rate": 3.864251072561089e-07, "loss": 0.0962, "step": 78980 }, { "epoch": 1.8417592025880913, "grad_norm": 1.2309513092041016, "learning_rate": 3.8634738543804013e-07, "loss": 0.107, "step": 78990 }, { "epoch": 1.8419923639649092, "grad_norm": 1.2927557229995728, "learning_rate": 3.862696636199714e-07, "loss": 0.1073, "step": 79000 }, { "epoch": 1.842225525341727, 
"grad_norm": 1.2340545654296875, "learning_rate": 3.861919418019026e-07, "loss": 0.1026, "step": 79010 }, { "epoch": 1.842458686718545, "grad_norm": 2.1238784790039062, "learning_rate": 3.8611421998383386e-07, "loss": 0.1128, "step": 79020 }, { "epoch": 1.8426918480953631, "grad_norm": 1.528127908706665, "learning_rate": 3.8603649816576507e-07, "loss": 0.1203, "step": 79030 }, { "epoch": 1.842925009472181, "grad_norm": 3.9184916019439697, "learning_rate": 3.859587763476963e-07, "loss": 0.1056, "step": 79040 }, { "epoch": 1.8431581708489988, "grad_norm": 1.32016921043396, "learning_rate": 3.8588105452962754e-07, "loss": 0.1209, "step": 79050 }, { "epoch": 1.8433913322258166, "grad_norm": 1.3179823160171509, "learning_rate": 3.858033327115588e-07, "loss": 0.0958, "step": 79060 }, { "epoch": 1.8436244936026347, "grad_norm": 1.7158135175704956, "learning_rate": 3.8572561089348996e-07, "loss": 0.1114, "step": 79070 }, { "epoch": 1.8438576549794528, "grad_norm": 1.2559694051742554, "learning_rate": 3.856478890754212e-07, "loss": 0.104, "step": 79080 }, { "epoch": 1.8440908163562706, "grad_norm": 1.1559044122695923, "learning_rate": 3.855701672573525e-07, "loss": 0.1034, "step": 79090 }, { "epoch": 1.8443239777330884, "grad_norm": 1.1818251609802246, "learning_rate": 3.854924454392837e-07, "loss": 0.1118, "step": 79100 }, { "epoch": 1.8445571391099065, "grad_norm": 1.3085390329360962, "learning_rate": 3.854147236212149e-07, "loss": 0.0991, "step": 79110 }, { "epoch": 1.8447903004867243, "grad_norm": 2.781229257583618, "learning_rate": 3.8533700180314617e-07, "loss": 0.1107, "step": 79120 }, { "epoch": 1.8450234618635424, "grad_norm": 1.411388635635376, "learning_rate": 3.852592799850774e-07, "loss": 0.1151, "step": 79130 }, { "epoch": 1.8452566232403602, "grad_norm": 1.4564940929412842, "learning_rate": 3.8518155816700864e-07, "loss": 0.1026, "step": 79140 }, { "epoch": 1.845489784617178, "grad_norm": 2.1014339923858643, "learning_rate": 3.8510383634893985e-07, "loss": 
0.0996, "step": 79150 }, { "epoch": 1.8457229459939961, "grad_norm": 1.1733471155166626, "learning_rate": 3.8502611453087106e-07, "loss": 0.0994, "step": 79160 }, { "epoch": 1.8459561073708142, "grad_norm": 1.2411426305770874, "learning_rate": 3.849483927128023e-07, "loss": 0.1093, "step": 79170 }, { "epoch": 1.846189268747632, "grad_norm": 1.3572756052017212, "learning_rate": 3.848706708947336e-07, "loss": 0.101, "step": 79180 }, { "epoch": 1.8464224301244498, "grad_norm": 1.2609792947769165, "learning_rate": 3.8479294907666474e-07, "loss": 0.0971, "step": 79190 }, { "epoch": 1.8466555915012677, "grad_norm": 1.5059115886688232, "learning_rate": 3.84715227258596e-07, "loss": 0.1036, "step": 79200 }, { "epoch": 1.8468887528780857, "grad_norm": 2.9135684967041016, "learning_rate": 3.8463750544052726e-07, "loss": 0.1047, "step": 79210 }, { "epoch": 1.8471219142549038, "grad_norm": 0.9526144862174988, "learning_rate": 3.845597836224585e-07, "loss": 0.0946, "step": 79220 }, { "epoch": 1.8473550756317216, "grad_norm": 3.430558919906616, "learning_rate": 3.844820618043897e-07, "loss": 0.1076, "step": 79230 }, { "epoch": 1.8475882370085395, "grad_norm": 1.617310643196106, "learning_rate": 3.8440433998632094e-07, "loss": 0.1079, "step": 79240 }, { "epoch": 1.8478213983853573, "grad_norm": 1.3431484699249268, "learning_rate": 3.843266181682522e-07, "loss": 0.1055, "step": 79250 }, { "epoch": 1.8480545597621754, "grad_norm": 1.1869648694992065, "learning_rate": 3.842488963501834e-07, "loss": 0.1038, "step": 79260 }, { "epoch": 1.8482877211389934, "grad_norm": 2.6063201427459717, "learning_rate": 3.841711745321146e-07, "loss": 0.1015, "step": 79270 }, { "epoch": 1.8485208825158113, "grad_norm": 3.0562658309936523, "learning_rate": 3.840934527140459e-07, "loss": 0.1163, "step": 79280 }, { "epoch": 1.848754043892629, "grad_norm": 2.5883448123931885, "learning_rate": 3.840157308959771e-07, "loss": 0.107, "step": 79290 }, { "epoch": 1.8489872052694472, "grad_norm": 
1.3008482456207275, "learning_rate": 3.8393800907790836e-07, "loss": 0.0984, "step": 79300 }, { "epoch": 1.8492203666462652, "grad_norm": 0.9948167204856873, "learning_rate": 3.8386028725983957e-07, "loss": 0.0964, "step": 79310 }, { "epoch": 1.849453528023083, "grad_norm": 1.2453181743621826, "learning_rate": 3.837825654417708e-07, "loss": 0.1127, "step": 79320 }, { "epoch": 1.8496866893999009, "grad_norm": 1.96128511428833, "learning_rate": 3.8370484362370204e-07, "loss": 0.1144, "step": 79330 }, { "epoch": 1.8499198507767187, "grad_norm": 1.0099506378173828, "learning_rate": 3.836271218056333e-07, "loss": 0.0993, "step": 79340 }, { "epoch": 1.8501530121535368, "grad_norm": 2.4856162071228027, "learning_rate": 3.8354939998756446e-07, "loss": 0.1147, "step": 79350 }, { "epoch": 1.8503861735303548, "grad_norm": 1.8190248012542725, "learning_rate": 3.834716781694957e-07, "loss": 0.1052, "step": 79360 }, { "epoch": 1.8506193349071727, "grad_norm": 1.0428147315979004, "learning_rate": 3.83393956351427e-07, "loss": 0.0976, "step": 79370 }, { "epoch": 1.8508524962839905, "grad_norm": 1.776841163635254, "learning_rate": 3.833162345333582e-07, "loss": 0.1005, "step": 79380 }, { "epoch": 1.8510856576608083, "grad_norm": 1.255348801612854, "learning_rate": 3.832385127152894e-07, "loss": 0.1075, "step": 79390 }, { "epoch": 1.8513188190376264, "grad_norm": 1.5551739931106567, "learning_rate": 3.8316079089722066e-07, "loss": 0.1258, "step": 79400 }, { "epoch": 1.8515519804144445, "grad_norm": 2.6094493865966797, "learning_rate": 3.8308306907915187e-07, "loss": 0.1035, "step": 79410 }, { "epoch": 1.8517851417912623, "grad_norm": 3.5806972980499268, "learning_rate": 3.8300534726108313e-07, "loss": 0.0999, "step": 79420 }, { "epoch": 1.8520183031680801, "grad_norm": 1.5731945037841797, "learning_rate": 3.8292762544301434e-07, "loss": 0.1072, "step": 79430 }, { "epoch": 1.852251464544898, "grad_norm": 2.265130043029785, "learning_rate": 3.8284990362494555e-07, "loss": 0.0905, 
"step": 79440 }, { "epoch": 1.852484625921716, "grad_norm": 1.5147511959075928, "learning_rate": 3.827721818068768e-07, "loss": 0.1136, "step": 79450 }, { "epoch": 1.852717787298534, "grad_norm": 1.1866635084152222, "learning_rate": 3.826944599888081e-07, "loss": 0.1134, "step": 79460 }, { "epoch": 1.852950948675352, "grad_norm": 1.3128399848937988, "learning_rate": 3.8261673817073923e-07, "loss": 0.1153, "step": 79470 }, { "epoch": 1.8531841100521698, "grad_norm": 1.3636037111282349, "learning_rate": 3.825390163526705e-07, "loss": 0.1055, "step": 79480 }, { "epoch": 1.8534172714289878, "grad_norm": 1.3593502044677734, "learning_rate": 3.8246129453460175e-07, "loss": 0.1117, "step": 79490 }, { "epoch": 1.8536504328058059, "grad_norm": 1.7957326173782349, "learning_rate": 3.8238357271653296e-07, "loss": 0.0962, "step": 79500 }, { "epoch": 1.8538835941826237, "grad_norm": 1.412166953086853, "learning_rate": 3.8230585089846417e-07, "loss": 0.1094, "step": 79510 }, { "epoch": 1.8541167555594416, "grad_norm": 1.6376405954360962, "learning_rate": 3.8222812908039544e-07, "loss": 0.1077, "step": 79520 }, { "epoch": 1.8543499169362594, "grad_norm": 1.308166265487671, "learning_rate": 3.8215040726232664e-07, "loss": 0.1125, "step": 79530 }, { "epoch": 1.8545830783130774, "grad_norm": 1.1124043464660645, "learning_rate": 3.820726854442579e-07, "loss": 0.1016, "step": 79540 }, { "epoch": 1.8548162396898955, "grad_norm": 1.7207883596420288, "learning_rate": 3.819949636261891e-07, "loss": 0.1215, "step": 79550 }, { "epoch": 1.8550494010667133, "grad_norm": 1.201072335243225, "learning_rate": 3.819172418081203e-07, "loss": 0.1028, "step": 79560 }, { "epoch": 1.8552825624435312, "grad_norm": 1.3648430109024048, "learning_rate": 3.818395199900516e-07, "loss": 0.1052, "step": 79570 }, { "epoch": 1.855515723820349, "grad_norm": 1.371639609336853, "learning_rate": 3.8176179817198285e-07, "loss": 0.1178, "step": 79580 }, { "epoch": 1.855748885197167, "grad_norm": 1.4811278581619263, 
"learning_rate": 3.81684076353914e-07, "loss": 0.1141, "step": 79590 }, { "epoch": 1.8559820465739851, "grad_norm": 1.2970476150512695, "learning_rate": 3.8160635453584527e-07, "loss": 0.1127, "step": 79600 }, { "epoch": 1.856215207950803, "grad_norm": 2.9052443504333496, "learning_rate": 3.8152863271777653e-07, "loss": 0.0995, "step": 79610 }, { "epoch": 1.8564483693276208, "grad_norm": 1.6641384363174438, "learning_rate": 3.814509108997078e-07, "loss": 0.0978, "step": 79620 }, { "epoch": 1.8566815307044389, "grad_norm": 1.5746700763702393, "learning_rate": 3.8137318908163895e-07, "loss": 0.1017, "step": 79630 }, { "epoch": 1.8569146920812567, "grad_norm": 1.9584550857543945, "learning_rate": 3.812954672635702e-07, "loss": 0.1079, "step": 79640 }, { "epoch": 1.8571478534580748, "grad_norm": 1.4381184577941895, "learning_rate": 3.8121774544550147e-07, "loss": 0.1081, "step": 79650 }, { "epoch": 1.8573810148348926, "grad_norm": 2.8260364532470703, "learning_rate": 3.811400236274327e-07, "loss": 0.106, "step": 79660 }, { "epoch": 1.8576141762117104, "grad_norm": 1.0281288623809814, "learning_rate": 3.810623018093639e-07, "loss": 0.1067, "step": 79670 }, { "epoch": 1.8578473375885285, "grad_norm": 1.5389728546142578, "learning_rate": 3.8098457999129515e-07, "loss": 0.1039, "step": 79680 }, { "epoch": 1.8580804989653465, "grad_norm": 1.2101718187332153, "learning_rate": 3.8090685817322636e-07, "loss": 0.106, "step": 79690 }, { "epoch": 1.8583136603421644, "grad_norm": 2.4819352626800537, "learning_rate": 3.808291363551576e-07, "loss": 0.1086, "step": 79700 }, { "epoch": 1.8585468217189822, "grad_norm": 2.044297218322754, "learning_rate": 3.8075141453708883e-07, "loss": 0.1088, "step": 79710 }, { "epoch": 1.8587799830958, "grad_norm": 1.513224482536316, "learning_rate": 3.8067369271902004e-07, "loss": 0.1035, "step": 79720 }, { "epoch": 1.8590131444726181, "grad_norm": 2.070005178451538, "learning_rate": 3.805959709009513e-07, "loss": 0.1085, "step": 79730 }, { "epoch": 
1.8592463058494362, "grad_norm": 1.3503751754760742, "learning_rate": 3.8051824908288257e-07, "loss": 0.0934, "step": 79740 }, { "epoch": 1.859479467226254, "grad_norm": 1.5939018726348877, "learning_rate": 3.804405272648137e-07, "loss": 0.1015, "step": 79750 }, { "epoch": 1.8597126286030718, "grad_norm": 1.2933168411254883, "learning_rate": 3.80362805446745e-07, "loss": 0.098, "step": 79760 }, { "epoch": 1.8599457899798897, "grad_norm": 2.055104970932007, "learning_rate": 3.8028508362867625e-07, "loss": 0.1122, "step": 79770 }, { "epoch": 1.8601789513567077, "grad_norm": 3.0358755588531494, "learning_rate": 3.8020736181060746e-07, "loss": 0.1107, "step": 79780 }, { "epoch": 1.8604121127335258, "grad_norm": 1.8010132312774658, "learning_rate": 3.8012963999253867e-07, "loss": 0.0983, "step": 79790 }, { "epoch": 1.8606452741103436, "grad_norm": 1.9071948528289795, "learning_rate": 3.8005191817446993e-07, "loss": 0.1045, "step": 79800 }, { "epoch": 1.8608784354871615, "grad_norm": 1.112318992614746, "learning_rate": 3.7997419635640114e-07, "loss": 0.1015, "step": 79810 }, { "epoch": 1.8611115968639795, "grad_norm": 3.511037588119507, "learning_rate": 3.798964745383324e-07, "loss": 0.1216, "step": 79820 }, { "epoch": 1.8613447582407974, "grad_norm": 1.2730132341384888, "learning_rate": 3.798187527202636e-07, "loss": 0.1021, "step": 79830 }, { "epoch": 1.8615779196176154, "grad_norm": 1.804833173751831, "learning_rate": 3.797410309021948e-07, "loss": 0.0958, "step": 79840 }, { "epoch": 1.8618110809944333, "grad_norm": 1.3041881322860718, "learning_rate": 3.796633090841261e-07, "loss": 0.0985, "step": 79850 }, { "epoch": 1.862044242371251, "grad_norm": 0.993539035320282, "learning_rate": 3.7958558726605734e-07, "loss": 0.1032, "step": 79860 }, { "epoch": 1.8622774037480692, "grad_norm": 1.2385859489440918, "learning_rate": 3.795078654479885e-07, "loss": 0.1149, "step": 79870 }, { "epoch": 1.8625105651248872, "grad_norm": 1.4909948110580444, "learning_rate": 
3.7943014362991976e-07, "loss": 0.099, "step": 79880 }, { "epoch": 1.862743726501705, "grad_norm": 1.4817177057266235, "learning_rate": 3.79352421811851e-07, "loss": 0.1074, "step": 79890 }, { "epoch": 1.862976887878523, "grad_norm": 1.3297325372695923, "learning_rate": 3.7927469999378223e-07, "loss": 0.1097, "step": 79900 }, { "epoch": 1.8632100492553407, "grad_norm": 1.1194244623184204, "learning_rate": 3.7919697817571344e-07, "loss": 0.1094, "step": 79910 }, { "epoch": 1.8634432106321588, "grad_norm": 1.394105076789856, "learning_rate": 3.791192563576447e-07, "loss": 0.1041, "step": 79920 }, { "epoch": 1.8636763720089768, "grad_norm": 1.8755922317504883, "learning_rate": 3.790415345395759e-07, "loss": 0.1064, "step": 79930 }, { "epoch": 1.8639095333857947, "grad_norm": 2.0208044052124023, "learning_rate": 3.789638127215072e-07, "loss": 0.1029, "step": 79940 }, { "epoch": 1.8641426947626125, "grad_norm": 1.323338508605957, "learning_rate": 3.788860909034384e-07, "loss": 0.0983, "step": 79950 }, { "epoch": 1.8643758561394304, "grad_norm": 1.1977224349975586, "learning_rate": 3.788083690853696e-07, "loss": 0.0995, "step": 79960 }, { "epoch": 1.8646090175162484, "grad_norm": 1.8620144128799438, "learning_rate": 3.7873064726730086e-07, "loss": 0.1093, "step": 79970 }, { "epoch": 1.8648421788930665, "grad_norm": 1.3424997329711914, "learning_rate": 3.786529254492321e-07, "loss": 0.1144, "step": 79980 }, { "epoch": 1.8650753402698843, "grad_norm": 2.072185516357422, "learning_rate": 3.785752036311633e-07, "loss": 0.1107, "step": 79990 }, { "epoch": 1.8653085016467021, "grad_norm": 1.0988576412200928, "learning_rate": 3.7849748181309454e-07, "loss": 0.1095, "step": 80000 }, { "epoch": 1.8653085016467021, "eval_accuracy": 0.947303649671734, "eval_f1": 0.9623489154082466, "eval_loss": 0.1369890421628952, "eval_runtime": 6311.1562, "eval_samples_per_second": 289.95, "eval_steps_per_second": 36.244, "step": 80000 }, { "epoch": 1.8655416630235202, "grad_norm": 
1.3258728981018066, "learning_rate": 3.784197599950258e-07, "loss": 0.1062, "step": 80010 }, { "epoch": 1.865774824400338, "grad_norm": 2.554032564163208, "learning_rate": 3.7834203817695706e-07, "loss": 0.1027, "step": 80020 }, { "epoch": 1.866007985777156, "grad_norm": 1.846328854560852, "learning_rate": 3.782643163588882e-07, "loss": 0.104, "step": 80030 }, { "epoch": 1.866241147153974, "grad_norm": 2.402487277984619, "learning_rate": 3.781865945408195e-07, "loss": 0.1004, "step": 80040 }, { "epoch": 1.8664743085307918, "grad_norm": 1.5811326503753662, "learning_rate": 3.7810887272275074e-07, "loss": 0.1054, "step": 80050 }, { "epoch": 1.8667074699076098, "grad_norm": 1.699507713317871, "learning_rate": 3.7803115090468195e-07, "loss": 0.1075, "step": 80060 }, { "epoch": 1.8669406312844279, "grad_norm": 1.6128580570220947, "learning_rate": 3.7795342908661316e-07, "loss": 0.1136, "step": 80070 }, { "epoch": 1.8671737926612457, "grad_norm": 1.5087711811065674, "learning_rate": 3.778757072685444e-07, "loss": 0.1087, "step": 80080 }, { "epoch": 1.8674069540380636, "grad_norm": 1.5262212753295898, "learning_rate": 3.7779798545047563e-07, "loss": 0.1044, "step": 80090 }, { "epoch": 1.8676401154148814, "grad_norm": 2.5088391304016113, "learning_rate": 3.777202636324069e-07, "loss": 0.1032, "step": 80100 }, { "epoch": 1.8678732767916995, "grad_norm": 1.2595146894454956, "learning_rate": 3.776425418143381e-07, "loss": 0.1041, "step": 80110 }, { "epoch": 1.8681064381685175, "grad_norm": 1.210741639137268, "learning_rate": 3.775648199962693e-07, "loss": 0.0978, "step": 80120 }, { "epoch": 1.8683395995453353, "grad_norm": 1.3742905855178833, "learning_rate": 3.774870981782006e-07, "loss": 0.1159, "step": 80130 }, { "epoch": 1.8685727609221532, "grad_norm": 2.2845211029052734, "learning_rate": 3.7740937636013184e-07, "loss": 0.1085, "step": 80140 }, { "epoch": 1.868805922298971, "grad_norm": 1.7894433736801147, "learning_rate": 3.77331654542063e-07, "loss": 0.1179, "step": 
80150 }, { "epoch": 1.869039083675789, "grad_norm": 2.286247730255127, "learning_rate": 3.7725393272399425e-07, "loss": 0.1046, "step": 80160 }, { "epoch": 1.8692722450526071, "grad_norm": 2.1263458728790283, "learning_rate": 3.771762109059255e-07, "loss": 0.1121, "step": 80170 }, { "epoch": 1.869505406429425, "grad_norm": 3.111807107925415, "learning_rate": 3.770984890878567e-07, "loss": 0.1111, "step": 80180 }, { "epoch": 1.8697385678062428, "grad_norm": 1.8578803539276123, "learning_rate": 3.7702076726978793e-07, "loss": 0.107, "step": 80190 }, { "epoch": 1.8699717291830609, "grad_norm": 1.6838434934616089, "learning_rate": 3.769430454517192e-07, "loss": 0.1034, "step": 80200 }, { "epoch": 1.8702048905598787, "grad_norm": 2.5485105514526367, "learning_rate": 3.768653236336504e-07, "loss": 0.1073, "step": 80210 }, { "epoch": 1.8704380519366968, "grad_norm": 1.752656102180481, "learning_rate": 3.7678760181558167e-07, "loss": 0.1068, "step": 80220 }, { "epoch": 1.8706712133135146, "grad_norm": 1.5343059301376343, "learning_rate": 3.767098799975129e-07, "loss": 0.1045, "step": 80230 }, { "epoch": 1.8709043746903324, "grad_norm": 2.1367173194885254, "learning_rate": 3.766321581794441e-07, "loss": 0.1039, "step": 80240 }, { "epoch": 1.8711375360671505, "grad_norm": 1.36968195438385, "learning_rate": 3.7655443636137535e-07, "loss": 0.1153, "step": 80250 }, { "epoch": 1.8713706974439686, "grad_norm": 1.5954285860061646, "learning_rate": 3.764767145433066e-07, "loss": 0.0987, "step": 80260 }, { "epoch": 1.8716038588207864, "grad_norm": 2.7125701904296875, "learning_rate": 3.7639899272523777e-07, "loss": 0.1168, "step": 80270 }, { "epoch": 1.8718370201976042, "grad_norm": 1.340166687965393, "learning_rate": 3.7632127090716903e-07, "loss": 0.0995, "step": 80280 }, { "epoch": 1.872070181574422, "grad_norm": 1.347839117050171, "learning_rate": 3.762435490891003e-07, "loss": 0.114, "step": 80290 }, { "epoch": 1.8723033429512401, "grad_norm": 1.549957036972046, 
"learning_rate": 3.761658272710315e-07, "loss": 0.1123, "step": 80300 }, { "epoch": 1.8725365043280582, "grad_norm": 2.1072752475738525, "learning_rate": 3.760881054529627e-07, "loss": 0.0963, "step": 80310 }, { "epoch": 1.872769665704876, "grad_norm": 2.4798896312713623, "learning_rate": 3.7601038363489397e-07, "loss": 0.1123, "step": 80320 }, { "epoch": 1.8730028270816939, "grad_norm": 1.047380805015564, "learning_rate": 3.759326618168252e-07, "loss": 0.1027, "step": 80330 }, { "epoch": 1.8732359884585117, "grad_norm": 1.7975019216537476, "learning_rate": 3.7585493999875644e-07, "loss": 0.1045, "step": 80340 }, { "epoch": 1.8734691498353298, "grad_norm": 1.5201390981674194, "learning_rate": 3.7577721818068765e-07, "loss": 0.1028, "step": 80350 }, { "epoch": 1.8737023112121478, "grad_norm": 1.8279528617858887, "learning_rate": 3.7569949636261886e-07, "loss": 0.1179, "step": 80360 }, { "epoch": 1.8739354725889656, "grad_norm": 1.473319172859192, "learning_rate": 3.756217745445501e-07, "loss": 0.1025, "step": 80370 }, { "epoch": 1.8741686339657835, "grad_norm": 1.3603273630142212, "learning_rate": 3.755440527264814e-07, "loss": 0.1044, "step": 80380 }, { "epoch": 1.8744017953426015, "grad_norm": 1.2977522611618042, "learning_rate": 3.7546633090841254e-07, "loss": 0.093, "step": 80390 }, { "epoch": 1.8746349567194194, "grad_norm": 1.3990745544433594, "learning_rate": 3.753886090903438e-07, "loss": 0.1149, "step": 80400 }, { "epoch": 1.8748681180962374, "grad_norm": 1.7441534996032715, "learning_rate": 3.7531088727227507e-07, "loss": 0.1021, "step": 80410 }, { "epoch": 1.8751012794730553, "grad_norm": 1.6133570671081543, "learning_rate": 3.752331654542063e-07, "loss": 0.1125, "step": 80420 }, { "epoch": 1.875334440849873, "grad_norm": 1.2426316738128662, "learning_rate": 3.751554436361375e-07, "loss": 0.1026, "step": 80430 }, { "epoch": 1.8755676022266912, "grad_norm": 1.2766352891921997, "learning_rate": 3.7507772181806875e-07, "loss": 0.1021, "step": 80440 }, { 
"epoch": 1.8758007636035092, "grad_norm": 1.4665058851242065, "learning_rate": 3.75e-07, "loss": 0.1088, "step": 80450 }, { "epoch": 1.876033924980327, "grad_norm": 1.8370842933654785, "learning_rate": 3.749222781819312e-07, "loss": 0.1085, "step": 80460 }, { "epoch": 1.876267086357145, "grad_norm": 1.3584378957748413, "learning_rate": 3.748445563638625e-07, "loss": 0.1021, "step": 80470 }, { "epoch": 1.8765002477339627, "grad_norm": 3.385101556777954, "learning_rate": 3.747668345457937e-07, "loss": 0.1072, "step": 80480 }, { "epoch": 1.8767334091107808, "grad_norm": 2.0043516159057617, "learning_rate": 3.746891127277249e-07, "loss": 0.106, "step": 80490 }, { "epoch": 1.8769665704875989, "grad_norm": 1.4722672700881958, "learning_rate": 3.7461139090965616e-07, "loss": 0.1034, "step": 80500 }, { "epoch": 1.8771997318644167, "grad_norm": 2.317599058151245, "learning_rate": 3.745336690915874e-07, "loss": 0.1027, "step": 80510 }, { "epoch": 1.8774328932412345, "grad_norm": 3.0979576110839844, "learning_rate": 3.744559472735186e-07, "loss": 0.0921, "step": 80520 }, { "epoch": 1.8776660546180524, "grad_norm": 1.6072660684585571, "learning_rate": 3.7437822545544984e-07, "loss": 0.1078, "step": 80530 }, { "epoch": 1.8778992159948704, "grad_norm": 1.199707269668579, "learning_rate": 3.743005036373811e-07, "loss": 0.1201, "step": 80540 }, { "epoch": 1.8781323773716885, "grad_norm": 1.8188574314117432, "learning_rate": 3.742227818193123e-07, "loss": 0.1061, "step": 80550 }, { "epoch": 1.8783655387485063, "grad_norm": 3.08986496925354, "learning_rate": 3.741450600012435e-07, "loss": 0.1081, "step": 80560 }, { "epoch": 1.8785987001253242, "grad_norm": 1.0511879920959473, "learning_rate": 3.740673381831748e-07, "loss": 0.111, "step": 80570 }, { "epoch": 1.8788318615021422, "grad_norm": 1.271170973777771, "learning_rate": 3.73989616365106e-07, "loss": 0.114, "step": 80580 }, { "epoch": 1.87906502287896, "grad_norm": 1.3791900873184204, "learning_rate": 3.7391189454703726e-07, 
"loss": 0.1092, "step": 80590 }, { "epoch": 1.879298184255778, "grad_norm": 2.3795294761657715, "learning_rate": 3.7383417272896847e-07, "loss": 0.1102, "step": 80600 }, { "epoch": 1.879531345632596, "grad_norm": 1.3013750314712524, "learning_rate": 3.737564509108997e-07, "loss": 0.108, "step": 80610 }, { "epoch": 1.8797645070094138, "grad_norm": 1.0296984910964966, "learning_rate": 3.7367872909283094e-07, "loss": 0.1039, "step": 80620 }, { "epoch": 1.8799976683862318, "grad_norm": 1.750230073928833, "learning_rate": 3.736010072747622e-07, "loss": 0.103, "step": 80630 }, { "epoch": 1.88023082976305, "grad_norm": 2.1969668865203857, "learning_rate": 3.7352328545669336e-07, "loss": 0.1077, "step": 80640 }, { "epoch": 1.8804639911398677, "grad_norm": 1.0569348335266113, "learning_rate": 3.734455636386246e-07, "loss": 0.1054, "step": 80650 }, { "epoch": 1.8806971525166856, "grad_norm": 2.3101799488067627, "learning_rate": 3.733678418205559e-07, "loss": 0.1032, "step": 80660 }, { "epoch": 1.8809303138935034, "grad_norm": 1.4820351600646973, "learning_rate": 3.732901200024871e-07, "loss": 0.1034, "step": 80670 }, { "epoch": 1.8811634752703215, "grad_norm": 2.328873872756958, "learning_rate": 3.732123981844183e-07, "loss": 0.1155, "step": 80680 }, { "epoch": 1.8813966366471395, "grad_norm": 1.7655431032180786, "learning_rate": 3.7313467636634956e-07, "loss": 0.1032, "step": 80690 }, { "epoch": 1.8816297980239574, "grad_norm": 1.9483437538146973, "learning_rate": 3.7305695454828077e-07, "loss": 0.1038, "step": 80700 }, { "epoch": 1.8818629594007752, "grad_norm": 2.762387275695801, "learning_rate": 3.7297923273021203e-07, "loss": 0.1054, "step": 80710 }, { "epoch": 1.882096120777593, "grad_norm": 2.709991931915283, "learning_rate": 3.7290151091214324e-07, "loss": 0.1016, "step": 80720 }, { "epoch": 1.882329282154411, "grad_norm": 2.037102699279785, "learning_rate": 3.7282378909407445e-07, "loss": 0.1012, "step": 80730 }, { "epoch": 1.8825624435312291, "grad_norm": 
2.351910352706909, "learning_rate": 3.727460672760057e-07, "loss": 0.1037, "step": 80740 }, { "epoch": 1.882795604908047, "grad_norm": 2.4909584522247314, "learning_rate": 3.72668345457937e-07, "loss": 0.1009, "step": 80750 }, { "epoch": 1.8830287662848648, "grad_norm": 2.0094046592712402, "learning_rate": 3.7259062363986813e-07, "loss": 0.1015, "step": 80760 }, { "epoch": 1.8832619276616829, "grad_norm": 1.7938529253005981, "learning_rate": 3.725129018217994e-07, "loss": 0.0971, "step": 80770 }, { "epoch": 1.883495089038501, "grad_norm": 2.331136465072632, "learning_rate": 3.7243518000373065e-07, "loss": 0.0985, "step": 80780 }, { "epoch": 1.8837282504153188, "grad_norm": 2.8094489574432373, "learning_rate": 3.7235745818566186e-07, "loss": 0.1111, "step": 80790 }, { "epoch": 1.8839614117921366, "grad_norm": 1.0458699464797974, "learning_rate": 3.7227973636759307e-07, "loss": 0.1015, "step": 80800 }, { "epoch": 1.8841945731689544, "grad_norm": 1.5801506042480469, "learning_rate": 3.7220201454952434e-07, "loss": 0.1011, "step": 80810 }, { "epoch": 1.8844277345457725, "grad_norm": 3.245048761367798, "learning_rate": 3.7212429273145554e-07, "loss": 0.105, "step": 80820 }, { "epoch": 1.8846608959225906, "grad_norm": 1.360568642616272, "learning_rate": 3.720465709133868e-07, "loss": 0.1063, "step": 80830 }, { "epoch": 1.8848940572994084, "grad_norm": 1.4005533456802368, "learning_rate": 3.71968849095318e-07, "loss": 0.0965, "step": 80840 }, { "epoch": 1.8851272186762262, "grad_norm": 1.557984709739685, "learning_rate": 3.718911272772492e-07, "loss": 0.0915, "step": 80850 }, { "epoch": 1.885360380053044, "grad_norm": 1.5564104318618774, "learning_rate": 3.718134054591805e-07, "loss": 0.109, "step": 80860 }, { "epoch": 1.8855935414298621, "grad_norm": 1.2039670944213867, "learning_rate": 3.7173568364111175e-07, "loss": 0.095, "step": 80870 }, { "epoch": 1.8858267028066802, "grad_norm": 1.546864628791809, "learning_rate": 3.716579618230429e-07, "loss": 0.1155, "step": 
80880 }, { "epoch": 1.886059864183498, "grad_norm": 3.232128620147705, "learning_rate": 3.7158024000497417e-07, "loss": 0.1105, "step": 80890 }, { "epoch": 1.8862930255603159, "grad_norm": 1.351934552192688, "learning_rate": 3.7150251818690543e-07, "loss": 0.0974, "step": 80900 }, { "epoch": 1.8865261869371337, "grad_norm": 1.4222583770751953, "learning_rate": 3.714247963688367e-07, "loss": 0.1095, "step": 80910 }, { "epoch": 1.8867593483139518, "grad_norm": 1.5385311841964722, "learning_rate": 3.7134707455076785e-07, "loss": 0.1043, "step": 80920 }, { "epoch": 1.8869925096907698, "grad_norm": 1.3700218200683594, "learning_rate": 3.712693527326991e-07, "loss": 0.1002, "step": 80930 }, { "epoch": 1.8872256710675877, "grad_norm": 1.220169186592102, "learning_rate": 3.7119163091463037e-07, "loss": 0.1201, "step": 80940 }, { "epoch": 1.8874588324444055, "grad_norm": 2.0127882957458496, "learning_rate": 3.711139090965616e-07, "loss": 0.113, "step": 80950 }, { "epoch": 1.8876919938212235, "grad_norm": 1.8820991516113281, "learning_rate": 3.710361872784928e-07, "loss": 0.1004, "step": 80960 }, { "epoch": 1.8879251551980416, "grad_norm": 1.216975450515747, "learning_rate": 3.7095846546042405e-07, "loss": 0.1133, "step": 80970 }, { "epoch": 1.8881583165748594, "grad_norm": 1.516671895980835, "learning_rate": 3.7088074364235526e-07, "loss": 0.1078, "step": 80980 }, { "epoch": 1.8883914779516773, "grad_norm": 1.5671873092651367, "learning_rate": 3.708030218242865e-07, "loss": 0.1058, "step": 80990 }, { "epoch": 1.8886246393284951, "grad_norm": 1.3332136869430542, "learning_rate": 3.7072530000621773e-07, "loss": 0.0996, "step": 81000 }, { "epoch": 1.8888578007053132, "grad_norm": 3.560441017150879, "learning_rate": 3.7064757818814894e-07, "loss": 0.1146, "step": 81010 }, { "epoch": 1.8890909620821312, "grad_norm": 1.5785166025161743, "learning_rate": 3.705698563700802e-07, "loss": 0.1146, "step": 81020 }, { "epoch": 1.889324123458949, "grad_norm": 1.7128005027770996, 
"learning_rate": 3.7049213455201147e-07, "loss": 0.1162, "step": 81030 }, { "epoch": 1.889557284835767, "grad_norm": 1.3952245712280273, "learning_rate": 3.704144127339426e-07, "loss": 0.0942, "step": 81040 }, { "epoch": 1.8897904462125847, "grad_norm": 1.1981661319732666, "learning_rate": 3.703366909158739e-07, "loss": 0.1075, "step": 81050 }, { "epoch": 1.8900236075894028, "grad_norm": 1.5056082010269165, "learning_rate": 3.7025896909780515e-07, "loss": 0.1016, "step": 81060 }, { "epoch": 1.8902567689662209, "grad_norm": 2.2665863037109375, "learning_rate": 3.7018124727973636e-07, "loss": 0.0987, "step": 81070 }, { "epoch": 1.8904899303430387, "grad_norm": 3.157332181930542, "learning_rate": 3.7010352546166757e-07, "loss": 0.1018, "step": 81080 }, { "epoch": 1.8907230917198565, "grad_norm": 1.4766074419021606, "learning_rate": 3.7002580364359883e-07, "loss": 0.1054, "step": 81090 }, { "epoch": 1.8909562530966746, "grad_norm": 1.3001865148544312, "learning_rate": 3.6994808182553004e-07, "loss": 0.1012, "step": 81100 }, { "epoch": 1.8911894144734924, "grad_norm": 1.3101791143417358, "learning_rate": 3.698703600074613e-07, "loss": 0.1037, "step": 81110 }, { "epoch": 1.8914225758503105, "grad_norm": 1.1153104305267334, "learning_rate": 3.697926381893925e-07, "loss": 0.0985, "step": 81120 }, { "epoch": 1.8916557372271283, "grad_norm": 1.2129544019699097, "learning_rate": 3.697149163713237e-07, "loss": 0.1032, "step": 81130 }, { "epoch": 1.8918888986039462, "grad_norm": 3.306297540664673, "learning_rate": 3.69637194553255e-07, "loss": 0.1128, "step": 81140 }, { "epoch": 1.8921220599807642, "grad_norm": 2.2857306003570557, "learning_rate": 3.6955947273518624e-07, "loss": 0.1057, "step": 81150 }, { "epoch": 1.8923552213575823, "grad_norm": 1.3229981660842896, "learning_rate": 3.694817509171174e-07, "loss": 0.12, "step": 81160 }, { "epoch": 1.8925883827344, "grad_norm": 1.1281930208206177, "learning_rate": 3.6940402909904866e-07, "loss": 0.1011, "step": 81170 }, { 
"epoch": 1.892821544111218, "grad_norm": 1.4738483428955078, "learning_rate": 3.693263072809799e-07, "loss": 0.0967, "step": 81180 }, { "epoch": 1.8930547054880358, "grad_norm": 1.043961763381958, "learning_rate": 3.6924858546291113e-07, "loss": 0.1014, "step": 81190 }, { "epoch": 1.8932878668648538, "grad_norm": 1.6917574405670166, "learning_rate": 3.6917086364484234e-07, "loss": 0.1035, "step": 81200 }, { "epoch": 1.893521028241672, "grad_norm": 1.060630440711975, "learning_rate": 3.690931418267736e-07, "loss": 0.0934, "step": 81210 }, { "epoch": 1.8937541896184897, "grad_norm": 1.6414052248001099, "learning_rate": 3.690154200087048e-07, "loss": 0.1141, "step": 81220 }, { "epoch": 1.8939873509953076, "grad_norm": 1.5953869819641113, "learning_rate": 3.689376981906361e-07, "loss": 0.1081, "step": 81230 }, { "epoch": 1.8942205123721254, "grad_norm": 1.4340614080429077, "learning_rate": 3.688599763725673e-07, "loss": 0.1109, "step": 81240 }, { "epoch": 1.8944536737489435, "grad_norm": 1.607771873474121, "learning_rate": 3.687822545544985e-07, "loss": 0.1062, "step": 81250 }, { "epoch": 1.8946868351257615, "grad_norm": 1.0129247903823853, "learning_rate": 3.6870453273642976e-07, "loss": 0.0915, "step": 81260 }, { "epoch": 1.8949199965025794, "grad_norm": 1.4639341831207275, "learning_rate": 3.68626810918361e-07, "loss": 0.1127, "step": 81270 }, { "epoch": 1.8951531578793972, "grad_norm": 1.3752089738845825, "learning_rate": 3.685490891002922e-07, "loss": 0.0997, "step": 81280 }, { "epoch": 1.8953863192562153, "grad_norm": 1.8030779361724854, "learning_rate": 3.6847136728222344e-07, "loss": 0.1128, "step": 81290 }, { "epoch": 1.895619480633033, "grad_norm": 2.343362808227539, "learning_rate": 3.683936454641547e-07, "loss": 0.1108, "step": 81300 }, { "epoch": 1.8958526420098512, "grad_norm": 1.967086672782898, "learning_rate": 3.6831592364608596e-07, "loss": 0.1129, "step": 81310 }, { "epoch": 1.896085803386669, "grad_norm": 2.7947466373443604, "learning_rate": 
3.682382018280171e-07, "loss": 0.1082, "step": 81320 }, { "epoch": 1.8963189647634868, "grad_norm": 2.650649309158325, "learning_rate": 3.681604800099484e-07, "loss": 0.1071, "step": 81330 }, { "epoch": 1.8965521261403049, "grad_norm": 1.4463132619857788, "learning_rate": 3.6808275819187964e-07, "loss": 0.0974, "step": 81340 }, { "epoch": 1.896785287517123, "grad_norm": 2.1645848751068115, "learning_rate": 3.6800503637381085e-07, "loss": 0.1157, "step": 81350 }, { "epoch": 1.8970184488939408, "grad_norm": 1.9171106815338135, "learning_rate": 3.6792731455574206e-07, "loss": 0.1121, "step": 81360 }, { "epoch": 1.8972516102707586, "grad_norm": 1.3602995872497559, "learning_rate": 3.678495927376733e-07, "loss": 0.1091, "step": 81370 }, { "epoch": 1.8974847716475765, "grad_norm": 1.349211573600769, "learning_rate": 3.6777187091960453e-07, "loss": 0.1111, "step": 81380 }, { "epoch": 1.8977179330243945, "grad_norm": 1.539085865020752, "learning_rate": 3.676941491015358e-07, "loss": 0.1017, "step": 81390 }, { "epoch": 1.8979510944012126, "grad_norm": 1.574440598487854, "learning_rate": 3.67616427283467e-07, "loss": 0.106, "step": 81400 }, { "epoch": 1.8981842557780304, "grad_norm": 3.5670459270477295, "learning_rate": 3.675387054653982e-07, "loss": 0.1048, "step": 81410 }, { "epoch": 1.8984174171548482, "grad_norm": 1.5582739114761353, "learning_rate": 3.6746098364732947e-07, "loss": 0.1031, "step": 81420 }, { "epoch": 1.898650578531666, "grad_norm": 0.9890890121459961, "learning_rate": 3.6738326182926074e-07, "loss": 0.107, "step": 81430 }, { "epoch": 1.8988837399084841, "grad_norm": 1.2411582469940186, "learning_rate": 3.673055400111919e-07, "loss": 0.1002, "step": 81440 }, { "epoch": 1.8991169012853022, "grad_norm": 2.364521026611328, "learning_rate": 3.6722781819312315e-07, "loss": 0.1017, "step": 81450 }, { "epoch": 1.89935006266212, "grad_norm": 1.50620698928833, "learning_rate": 3.671500963750544e-07, "loss": 0.1004, "step": 81460 }, { "epoch": 1.8995832240389379, 
"grad_norm": 1.2642216682434082, "learning_rate": 3.670723745569856e-07, "loss": 0.1018, "step": 81470 }, { "epoch": 1.899816385415756, "grad_norm": 1.684881329536438, "learning_rate": 3.6699465273891683e-07, "loss": 0.1063, "step": 81480 }, { "epoch": 1.9000495467925738, "grad_norm": 1.7037973403930664, "learning_rate": 3.669169309208481e-07, "loss": 0.1122, "step": 81490 }, { "epoch": 1.9002827081693918, "grad_norm": 1.5281599760055542, "learning_rate": 3.668392091027793e-07, "loss": 0.1172, "step": 81500 }, { "epoch": 1.9005158695462097, "grad_norm": 1.2644587755203247, "learning_rate": 3.6676148728471057e-07, "loss": 0.1054, "step": 81510 }, { "epoch": 1.9007490309230275, "grad_norm": 1.4458760023117065, "learning_rate": 3.666837654666418e-07, "loss": 0.0961, "step": 81520 }, { "epoch": 1.9009821922998456, "grad_norm": 1.9963104724884033, "learning_rate": 3.66606043648573e-07, "loss": 0.1002, "step": 81530 }, { "epoch": 1.9012153536766636, "grad_norm": 1.3113861083984375, "learning_rate": 3.6652832183050425e-07, "loss": 0.1062, "step": 81540 }, { "epoch": 1.9014485150534814, "grad_norm": 1.8380345106124878, "learning_rate": 3.664506000124355e-07, "loss": 0.0986, "step": 81550 }, { "epoch": 1.9016816764302993, "grad_norm": 1.4558228254318237, "learning_rate": 3.6637287819436667e-07, "loss": 0.1253, "step": 81560 }, { "epoch": 1.9019148378071171, "grad_norm": 2.2765889167785645, "learning_rate": 3.6629515637629793e-07, "loss": 0.1114, "step": 81570 }, { "epoch": 1.9021479991839352, "grad_norm": 2.6947836875915527, "learning_rate": 3.662174345582292e-07, "loss": 0.1086, "step": 81580 }, { "epoch": 1.9023811605607532, "grad_norm": 2.2330358028411865, "learning_rate": 3.661397127401604e-07, "loss": 0.0967, "step": 81590 }, { "epoch": 1.902614321937571, "grad_norm": 2.0227773189544678, "learning_rate": 3.660619909220916e-07, "loss": 0.1091, "step": 81600 }, { "epoch": 1.902847483314389, "grad_norm": 3.397533893585205, "learning_rate": 3.6598426910402287e-07, "loss": 
0.1071, "step": 81610 }, { "epoch": 1.9030806446912067, "grad_norm": 1.9836615324020386, "learning_rate": 3.659065472859541e-07, "loss": 0.0931, "step": 81620 }, { "epoch": 1.9033138060680248, "grad_norm": 1.6443105936050415, "learning_rate": 3.6582882546788534e-07, "loss": 0.1, "step": 81630 }, { "epoch": 1.9035469674448429, "grad_norm": 1.7760332822799683, "learning_rate": 3.6575110364981655e-07, "loss": 0.1032, "step": 81640 }, { "epoch": 1.9037801288216607, "grad_norm": 1.5889071226119995, "learning_rate": 3.6567338183174776e-07, "loss": 0.1087, "step": 81650 }, { "epoch": 1.9040132901984785, "grad_norm": 1.8806933164596558, "learning_rate": 3.65595660013679e-07, "loss": 0.1017, "step": 81660 }, { "epoch": 1.9042464515752966, "grad_norm": 1.3179339170455933, "learning_rate": 3.655179381956103e-07, "loss": 0.1056, "step": 81670 }, { "epoch": 1.9044796129521144, "grad_norm": 1.9672399759292603, "learning_rate": 3.6544021637754144e-07, "loss": 0.1209, "step": 81680 }, { "epoch": 1.9047127743289325, "grad_norm": 1.5533663034439087, "learning_rate": 3.653624945594727e-07, "loss": 0.104, "step": 81690 }, { "epoch": 1.9049459357057503, "grad_norm": 0.9699487686157227, "learning_rate": 3.6528477274140397e-07, "loss": 0.1028, "step": 81700 }, { "epoch": 1.9051790970825682, "grad_norm": 2.097489595413208, "learning_rate": 3.6520705092333523e-07, "loss": 0.1098, "step": 81710 }, { "epoch": 1.9054122584593862, "grad_norm": 2.268202304840088, "learning_rate": 3.651293291052664e-07, "loss": 0.1097, "step": 81720 }, { "epoch": 1.9056454198362043, "grad_norm": 0.9648366570472717, "learning_rate": 3.6505160728719765e-07, "loss": 0.1009, "step": 81730 }, { "epoch": 1.9058785812130221, "grad_norm": 2.5066709518432617, "learning_rate": 3.649738854691289e-07, "loss": 0.1054, "step": 81740 }, { "epoch": 1.90611174258984, "grad_norm": 1.5089435577392578, "learning_rate": 3.648961636510601e-07, "loss": 0.1146, "step": 81750 }, { "epoch": 1.9063449039666578, "grad_norm": 
2.8392434120178223, "learning_rate": 3.6481844183299133e-07, "loss": 0.1057, "step": 81760 }, { "epoch": 1.9065780653434758, "grad_norm": 1.401985764503479, "learning_rate": 3.6474849219672945e-07, "loss": 0.1, "step": 81770 }, { "epoch": 1.906811226720294, "grad_norm": 1.2755554914474487, "learning_rate": 3.6467077037866066e-07, "loss": 0.0959, "step": 81780 }, { "epoch": 1.9070443880971117, "grad_norm": 2.299203634262085, "learning_rate": 3.645930485605919e-07, "loss": 0.1047, "step": 81790 }, { "epoch": 1.9072775494739296, "grad_norm": 1.1751608848571777, "learning_rate": 3.645153267425232e-07, "loss": 0.1076, "step": 81800 }, { "epoch": 1.9075107108507474, "grad_norm": 1.6061197519302368, "learning_rate": 3.6443760492445434e-07, "loss": 0.0923, "step": 81810 }, { "epoch": 1.9077438722275655, "grad_norm": 1.4808977842330933, "learning_rate": 3.643598831063856e-07, "loss": 0.1038, "step": 81820 }, { "epoch": 1.9079770336043835, "grad_norm": 1.9443283081054688, "learning_rate": 3.6428216128831687e-07, "loss": 0.1097, "step": 81830 }, { "epoch": 1.9082101949812014, "grad_norm": 1.185926914215088, "learning_rate": 3.642044394702481e-07, "loss": 0.1183, "step": 81840 }, { "epoch": 1.9084433563580192, "grad_norm": 1.4706634283065796, "learning_rate": 3.641267176521793e-07, "loss": 0.1062, "step": 81850 }, { "epoch": 1.9086765177348373, "grad_norm": 1.6292165517807007, "learning_rate": 3.6404899583411055e-07, "loss": 0.1125, "step": 81860 }, { "epoch": 1.908909679111655, "grad_norm": 1.2572795152664185, "learning_rate": 3.6397127401604176e-07, "loss": 0.0927, "step": 81870 }, { "epoch": 1.9091428404884732, "grad_norm": 1.5561792850494385, "learning_rate": 3.63893552197973e-07, "loss": 0.111, "step": 81880 }, { "epoch": 1.909376001865291, "grad_norm": 1.8649920225143433, "learning_rate": 3.6381583037990423e-07, "loss": 0.1116, "step": 81890 }, { "epoch": 1.9096091632421088, "grad_norm": 1.6228710412979126, "learning_rate": 3.6373810856183544e-07, "loss": 0.1037, "step": 
81900 }, { "epoch": 1.909842324618927, "grad_norm": 1.3822989463806152, "learning_rate": 3.636603867437667e-07, "loss": 0.1083, "step": 81910 }, { "epoch": 1.910075485995745, "grad_norm": 2.213592052459717, "learning_rate": 3.6358266492569796e-07, "loss": 0.1151, "step": 81920 }, { "epoch": 1.9103086473725628, "grad_norm": 1.4761613607406616, "learning_rate": 3.635049431076291e-07, "loss": 0.0959, "step": 81930 }, { "epoch": 1.9105418087493806, "grad_norm": 3.793818712234497, "learning_rate": 3.634272212895604e-07, "loss": 0.0952, "step": 81940 }, { "epoch": 1.9107749701261985, "grad_norm": 1.0335291624069214, "learning_rate": 3.6334949947149164e-07, "loss": 0.1051, "step": 81950 }, { "epoch": 1.9110081315030165, "grad_norm": 1.47175133228302, "learning_rate": 3.6327177765342285e-07, "loss": 0.1031, "step": 81960 }, { "epoch": 1.9112412928798346, "grad_norm": 1.5038167238235474, "learning_rate": 3.6319405583535406e-07, "loss": 0.1098, "step": 81970 }, { "epoch": 1.9114744542566524, "grad_norm": 1.4526115655899048, "learning_rate": 3.631163340172853e-07, "loss": 0.0983, "step": 81980 }, { "epoch": 1.9117076156334702, "grad_norm": 1.0585929155349731, "learning_rate": 3.6303861219921653e-07, "loss": 0.1096, "step": 81990 }, { "epoch": 1.911940777010288, "grad_norm": 2.1789841651916504, "learning_rate": 3.629608903811478e-07, "loss": 0.105, "step": 82000 }, { "epoch": 1.9121739383871061, "grad_norm": 1.2105093002319336, "learning_rate": 3.62883168563079e-07, "loss": 0.1048, "step": 82010 }, { "epoch": 1.9124070997639242, "grad_norm": 2.0018293857574463, "learning_rate": 3.628054467450102e-07, "loss": 0.1057, "step": 82020 }, { "epoch": 1.912640261140742, "grad_norm": 1.270521640777588, "learning_rate": 3.627277249269415e-07, "loss": 0.1031, "step": 82030 }, { "epoch": 1.9128734225175599, "grad_norm": 1.28456449508667, "learning_rate": 3.6265000310887274e-07, "loss": 0.1108, "step": 82040 }, { "epoch": 1.913106583894378, "grad_norm": 2.5160601139068604, "learning_rate": 
3.625722812908039e-07, "loss": 0.1037, "step": 82050 }, { "epoch": 1.913339745271196, "grad_norm": 0.9989191889762878, "learning_rate": 3.6249455947273516e-07, "loss": 0.1031, "step": 82060 }, { "epoch": 1.9135729066480138, "grad_norm": 1.7063161134719849, "learning_rate": 3.624168376546664e-07, "loss": 0.115, "step": 82070 }, { "epoch": 1.9138060680248317, "grad_norm": 1.4144963026046753, "learning_rate": 3.6233911583659763e-07, "loss": 0.1049, "step": 82080 }, { "epoch": 1.9140392294016495, "grad_norm": 1.2838586568832397, "learning_rate": 3.6226139401852884e-07, "loss": 0.0932, "step": 82090 }, { "epoch": 1.9142723907784676, "grad_norm": 1.586885690689087, "learning_rate": 3.621836722004601e-07, "loss": 0.102, "step": 82100 }, { "epoch": 1.9145055521552856, "grad_norm": 2.3033337593078613, "learning_rate": 3.6210595038239136e-07, "loss": 0.1094, "step": 82110 }, { "epoch": 1.9147387135321035, "grad_norm": 1.374461054801941, "learning_rate": 3.6202822856432257e-07, "loss": 0.1092, "step": 82120 }, { "epoch": 1.9149718749089213, "grad_norm": 1.2436058521270752, "learning_rate": 3.619505067462538e-07, "loss": 0.1061, "step": 82130 }, { "epoch": 1.9152050362857391, "grad_norm": 1.6726888418197632, "learning_rate": 3.6187278492818504e-07, "loss": 0.0902, "step": 82140 }, { "epoch": 1.9154381976625572, "grad_norm": 1.4439691305160522, "learning_rate": 3.6179506311011625e-07, "loss": 0.1098, "step": 82150 }, { "epoch": 1.9156713590393752, "grad_norm": 1.6261709928512573, "learning_rate": 3.617173412920475e-07, "loss": 0.0999, "step": 82160 }, { "epoch": 1.915904520416193, "grad_norm": 1.701522946357727, "learning_rate": 3.616396194739787e-07, "loss": 0.1167, "step": 82170 }, { "epoch": 1.916137681793011, "grad_norm": 3.0749852657318115, "learning_rate": 3.6156189765590993e-07, "loss": 0.1063, "step": 82180 }, { "epoch": 1.9163708431698288, "grad_norm": 1.4353066682815552, "learning_rate": 3.614841758378412e-07, "loss": 0.0996, "step": 82190 }, { "epoch": 
1.9166040045466468, "grad_norm": 1.1648446321487427, "learning_rate": 3.6140645401977246e-07, "loss": 0.1078, "step": 82200 }, { "epoch": 1.9168371659234649, "grad_norm": 3.3792593479156494, "learning_rate": 3.613287322017036e-07, "loss": 0.1113, "step": 82210 }, { "epoch": 1.9170703273002827, "grad_norm": 1.5919067859649658, "learning_rate": 3.6125101038363487e-07, "loss": 0.1122, "step": 82220 }, { "epoch": 1.9173034886771005, "grad_norm": 3.7542622089385986, "learning_rate": 3.6117328856556614e-07, "loss": 0.1072, "step": 82230 }, { "epoch": 1.9175366500539186, "grad_norm": 2.157158613204956, "learning_rate": 3.6109556674749734e-07, "loss": 0.1026, "step": 82240 }, { "epoch": 1.9177698114307367, "grad_norm": 1.1701723337173462, "learning_rate": 3.6101784492942855e-07, "loss": 0.117, "step": 82250 }, { "epoch": 1.9180029728075545, "grad_norm": 1.7407159805297852, "learning_rate": 3.609401231113598e-07, "loss": 0.1188, "step": 82260 }, { "epoch": 1.9182361341843723, "grad_norm": 1.5198310613632202, "learning_rate": 3.60862401293291e-07, "loss": 0.1154, "step": 82270 }, { "epoch": 1.9184692955611902, "grad_norm": 0.9189335703849792, "learning_rate": 3.607846794752223e-07, "loss": 0.094, "step": 82280 }, { "epoch": 1.9187024569380082, "grad_norm": 1.5787787437438965, "learning_rate": 3.607069576571535e-07, "loss": 0.1088, "step": 82290 }, { "epoch": 1.9189356183148263, "grad_norm": 1.3749183416366577, "learning_rate": 3.606292358390847e-07, "loss": 0.0996, "step": 82300 }, { "epoch": 1.9191687796916441, "grad_norm": 1.5049476623535156, "learning_rate": 3.6055151402101597e-07, "loss": 0.1042, "step": 82310 }, { "epoch": 1.919401941068462, "grad_norm": 2.1691670417785645, "learning_rate": 3.6047379220294723e-07, "loss": 0.1118, "step": 82320 }, { "epoch": 1.9196351024452798, "grad_norm": 1.411718487739563, "learning_rate": 3.603960703848784e-07, "loss": 0.1061, "step": 82330 }, { "epoch": 1.9198682638220979, "grad_norm": 1.2799595594406128, "learning_rate": 
3.6031834856680965e-07, "loss": 0.1142, "step": 82340 }, { "epoch": 1.920101425198916, "grad_norm": 1.2031065225601196, "learning_rate": 3.602406267487409e-07, "loss": 0.1075, "step": 82350 }, { "epoch": 1.9203345865757337, "grad_norm": 1.6391956806182861, "learning_rate": 3.601629049306721e-07, "loss": 0.1089, "step": 82360 }, { "epoch": 1.9205677479525516, "grad_norm": 0.9991888999938965, "learning_rate": 3.6008518311260333e-07, "loss": 0.0984, "step": 82370 }, { "epoch": 1.9208009093293696, "grad_norm": 1.6615525484085083, "learning_rate": 3.600074612945346e-07, "loss": 0.1142, "step": 82380 }, { "epoch": 1.9210340707061875, "grad_norm": 1.4175037145614624, "learning_rate": 3.599297394764658e-07, "loss": 0.1033, "step": 82390 }, { "epoch": 1.9212672320830055, "grad_norm": 3.6334176063537598, "learning_rate": 3.5985201765839706e-07, "loss": 0.1065, "step": 82400 }, { "epoch": 1.9215003934598234, "grad_norm": 1.095902681350708, "learning_rate": 3.5977429584032827e-07, "loss": 0.0999, "step": 82410 }, { "epoch": 1.9217335548366412, "grad_norm": 1.7215774059295654, "learning_rate": 3.596965740222595e-07, "loss": 0.0997, "step": 82420 }, { "epoch": 1.9219667162134593, "grad_norm": 2.768855333328247, "learning_rate": 3.5961885220419074e-07, "loss": 0.1013, "step": 82430 }, { "epoch": 1.9221998775902773, "grad_norm": 1.3497563600540161, "learning_rate": 3.59541130386122e-07, "loss": 0.1141, "step": 82440 }, { "epoch": 1.9224330389670952, "grad_norm": 1.2973064184188843, "learning_rate": 3.5946340856805316e-07, "loss": 0.1006, "step": 82450 }, { "epoch": 1.922666200343913, "grad_norm": 2.7823424339294434, "learning_rate": 3.593856867499844e-07, "loss": 0.102, "step": 82460 }, { "epoch": 1.9228993617207308, "grad_norm": 1.7228955030441284, "learning_rate": 3.593079649319157e-07, "loss": 0.1046, "step": 82470 }, { "epoch": 1.923132523097549, "grad_norm": 3.0513079166412354, "learning_rate": 3.592302431138469e-07, "loss": 0.1076, "step": 82480 }, { "epoch": 
1.923365684474367, "grad_norm": 1.5245225429534912, "learning_rate": 3.591525212957781e-07, "loss": 0.0994, "step": 82490 }, { "epoch": 1.9235988458511848, "grad_norm": 1.3019702434539795, "learning_rate": 3.5907479947770937e-07, "loss": 0.0942, "step": 82500 }, { "epoch": 1.9238320072280026, "grad_norm": 1.3188501596450806, "learning_rate": 3.589970776596406e-07, "loss": 0.1008, "step": 82510 }, { "epoch": 1.9240651686048205, "grad_norm": 2.188943862915039, "learning_rate": 3.5891935584157184e-07, "loss": 0.1125, "step": 82520 }, { "epoch": 1.9242983299816385, "grad_norm": 1.4922605752944946, "learning_rate": 3.5884163402350305e-07, "loss": 0.116, "step": 82530 }, { "epoch": 1.9245314913584566, "grad_norm": 1.1754231452941895, "learning_rate": 3.5876391220543426e-07, "loss": 0.1051, "step": 82540 }, { "epoch": 1.9247646527352744, "grad_norm": 1.1907073259353638, "learning_rate": 3.586861903873655e-07, "loss": 0.1109, "step": 82550 }, { "epoch": 1.9249978141120923, "grad_norm": 2.3094170093536377, "learning_rate": 3.586084685692968e-07, "loss": 0.1243, "step": 82560 }, { "epoch": 1.9252309754889103, "grad_norm": 1.401459813117981, "learning_rate": 3.58530746751228e-07, "loss": 0.1061, "step": 82570 }, { "epoch": 1.9254641368657281, "grad_norm": 1.4363012313842773, "learning_rate": 3.584530249331592e-07, "loss": 0.102, "step": 82580 }, { "epoch": 1.9256972982425462, "grad_norm": 1.0877383947372437, "learning_rate": 3.5837530311509046e-07, "loss": 0.1023, "step": 82590 }, { "epoch": 1.925930459619364, "grad_norm": 1.4238080978393555, "learning_rate": 3.582975812970217e-07, "loss": 0.0997, "step": 82600 }, { "epoch": 1.9261636209961819, "grad_norm": 1.6242696046829224, "learning_rate": 3.582198594789529e-07, "loss": 0.0971, "step": 82610 }, { "epoch": 1.926396782373, "grad_norm": 1.3874931335449219, "learning_rate": 3.5814213766088414e-07, "loss": 0.1046, "step": 82620 }, { "epoch": 1.926629943749818, "grad_norm": 1.404914379119873, "learning_rate": 
3.580644158428154e-07, "loss": 0.1092, "step": 82630 }, { "epoch": 1.9268631051266358, "grad_norm": 1.6572773456573486, "learning_rate": 3.579866940247466e-07, "loss": 0.1161, "step": 82640 }, { "epoch": 1.9270962665034537, "grad_norm": 1.4547276496887207, "learning_rate": 3.579089722066778e-07, "loss": 0.101, "step": 82650 }, { "epoch": 1.9273294278802715, "grad_norm": 1.6636615991592407, "learning_rate": 3.578312503886091e-07, "loss": 0.101, "step": 82660 }, { "epoch": 1.9275625892570896, "grad_norm": 1.8353639841079712, "learning_rate": 3.577535285705403e-07, "loss": 0.1029, "step": 82670 }, { "epoch": 1.9277957506339076, "grad_norm": 1.6742037534713745, "learning_rate": 3.5767580675247156e-07, "loss": 0.105, "step": 82680 }, { "epoch": 1.9280289120107255, "grad_norm": 2.5830042362213135, "learning_rate": 3.5759808493440277e-07, "loss": 0.0974, "step": 82690 }, { "epoch": 1.9282620733875433, "grad_norm": 1.385205864906311, "learning_rate": 3.57520363116334e-07, "loss": 0.1152, "step": 82700 }, { "epoch": 1.9284952347643611, "grad_norm": 2.1838011741638184, "learning_rate": 3.5744264129826524e-07, "loss": 0.1034, "step": 82710 }, { "epoch": 1.9287283961411792, "grad_norm": 2.4330830574035645, "learning_rate": 3.573649194801965e-07, "loss": 0.1205, "step": 82720 }, { "epoch": 1.9289615575179972, "grad_norm": 1.6043716669082642, "learning_rate": 3.5728719766212765e-07, "loss": 0.1015, "step": 82730 }, { "epoch": 1.929194718894815, "grad_norm": 4.652280807495117, "learning_rate": 3.572094758440589e-07, "loss": 0.1085, "step": 82740 }, { "epoch": 1.929427880271633, "grad_norm": 1.1938416957855225, "learning_rate": 3.571317540259902e-07, "loss": 0.0925, "step": 82750 }, { "epoch": 1.929661041648451, "grad_norm": 2.9316141605377197, "learning_rate": 3.570540322079214e-07, "loss": 0.1093, "step": 82760 }, { "epoch": 1.9298942030252688, "grad_norm": 1.2462446689605713, "learning_rate": 3.569763103898526e-07, "loss": 0.1071, "step": 82770 }, { "epoch": 1.9301273644020869, 
"grad_norm": 1.4260432720184326, "learning_rate": 3.5689858857178386e-07, "loss": 0.1104, "step": 82780 }, { "epoch": 1.9303605257789047, "grad_norm": 1.7770100831985474, "learning_rate": 3.5682086675371507e-07, "loss": 0.0995, "step": 82790 }, { "epoch": 1.9305936871557225, "grad_norm": 1.1320141553878784, "learning_rate": 3.5674314493564633e-07, "loss": 0.1079, "step": 82800 }, { "epoch": 1.9308268485325406, "grad_norm": 1.084088683128357, "learning_rate": 3.5666542311757754e-07, "loss": 0.0951, "step": 82810 }, { "epoch": 1.9310600099093587, "grad_norm": 2.2577743530273438, "learning_rate": 3.5658770129950875e-07, "loss": 0.1029, "step": 82820 }, { "epoch": 1.9312931712861765, "grad_norm": 1.8790264129638672, "learning_rate": 3.5650997948144e-07, "loss": 0.1085, "step": 82830 }, { "epoch": 1.9315263326629943, "grad_norm": 1.1150476932525635, "learning_rate": 3.564322576633713e-07, "loss": 0.1073, "step": 82840 }, { "epoch": 1.9317594940398122, "grad_norm": 1.3821451663970947, "learning_rate": 3.5635453584530243e-07, "loss": 0.1036, "step": 82850 }, { "epoch": 1.9319926554166302, "grad_norm": 2.6356523036956787, "learning_rate": 3.5628458620904056e-07, "loss": 0.1076, "step": 82860 }, { "epoch": 1.9322258167934483, "grad_norm": 2.089263677597046, "learning_rate": 3.562068643909718e-07, "loss": 0.1098, "step": 82870 }, { "epoch": 1.9324589781702661, "grad_norm": 2.6290695667266846, "learning_rate": 3.5612914257290303e-07, "loss": 0.108, "step": 82880 }, { "epoch": 1.932692139547084, "grad_norm": 1.9861822128295898, "learning_rate": 3.560514207548343e-07, "loss": 0.0972, "step": 82890 }, { "epoch": 1.9329253009239018, "grad_norm": 2.5651004314422607, "learning_rate": 3.559736989367655e-07, "loss": 0.1026, "step": 82900 }, { "epoch": 1.9331584623007199, "grad_norm": 1.4386444091796875, "learning_rate": 3.558959771186967e-07, "loss": 0.0965, "step": 82910 }, { "epoch": 1.933391623677538, "grad_norm": 1.38084876537323, "learning_rate": 3.5581825530062797e-07, "loss": 
0.1061, "step": 82920 }, { "epoch": 1.9336247850543558, "grad_norm": 2.689469337463379, "learning_rate": 3.5574053348255923e-07, "loss": 0.1055, "step": 82930 }, { "epoch": 1.9338579464311736, "grad_norm": 1.1231330633163452, "learning_rate": 3.556628116644904e-07, "loss": 0.1147, "step": 82940 }, { "epoch": 1.9340911078079916, "grad_norm": 2.2497715950012207, "learning_rate": 3.5558508984642165e-07, "loss": 0.1073, "step": 82950 }, { "epoch": 1.9343242691848095, "grad_norm": 4.847393035888672, "learning_rate": 3.555073680283529e-07, "loss": 0.1188, "step": 82960 }, { "epoch": 1.9345574305616275, "grad_norm": 2.2991554737091064, "learning_rate": 3.554296462102842e-07, "loss": 0.1067, "step": 82970 }, { "epoch": 1.9347905919384454, "grad_norm": 1.1252869367599487, "learning_rate": 3.5535192439221533e-07, "loss": 0.0943, "step": 82980 }, { "epoch": 1.9350237533152632, "grad_norm": 1.681137204170227, "learning_rate": 3.552742025741466e-07, "loss": 0.1012, "step": 82990 }, { "epoch": 1.9352569146920813, "grad_norm": 1.958745002746582, "learning_rate": 3.5519648075607786e-07, "loss": 0.0877, "step": 83000 }, { "epoch": 1.9354900760688993, "grad_norm": 2.1826837062835693, "learning_rate": 3.5511875893800906e-07, "loss": 0.1015, "step": 83010 }, { "epoch": 1.9357232374457172, "grad_norm": 1.5886693000793457, "learning_rate": 3.5504103711994027e-07, "loss": 0.1006, "step": 83020 }, { "epoch": 1.935956398822535, "grad_norm": 1.4138338565826416, "learning_rate": 3.5496331530187154e-07, "loss": 0.1091, "step": 83030 }, { "epoch": 1.9361895601993528, "grad_norm": 2.6783173084259033, "learning_rate": 3.5488559348380274e-07, "loss": 0.0976, "step": 83040 }, { "epoch": 1.936422721576171, "grad_norm": 2.1345150470733643, "learning_rate": 3.54807871665734e-07, "loss": 0.1118, "step": 83050 }, { "epoch": 1.936655882952989, "grad_norm": 2.8259592056274414, "learning_rate": 3.547301498476652e-07, "loss": 0.1068, "step": 83060 }, { "epoch": 1.9368890443298068, "grad_norm": 
1.4529715776443481, "learning_rate": 3.546524280295964e-07, "loss": 0.1062, "step": 83070 }, { "epoch": 1.9371222057066246, "grad_norm": 2.190420150756836, "learning_rate": 3.545747062115277e-07, "loss": 0.1153, "step": 83080 }, { "epoch": 1.9373553670834425, "grad_norm": 2.2327382564544678, "learning_rate": 3.5449698439345895e-07, "loss": 0.113, "step": 83090 }, { "epoch": 1.9375885284602605, "grad_norm": 1.1491800546646118, "learning_rate": 3.544192625753901e-07, "loss": 0.0995, "step": 83100 }, { "epoch": 1.9378216898370786, "grad_norm": 2.1201014518737793, "learning_rate": 3.5434154075732137e-07, "loss": 0.1045, "step": 83110 }, { "epoch": 1.9380548512138964, "grad_norm": 1.7489937543869019, "learning_rate": 3.5426381893925263e-07, "loss": 0.1072, "step": 83120 }, { "epoch": 1.9382880125907143, "grad_norm": 1.9046289920806885, "learning_rate": 3.5418609712118384e-07, "loss": 0.0926, "step": 83130 }, { "epoch": 1.9385211739675323, "grad_norm": 1.9858224391937256, "learning_rate": 3.5410837530311505e-07, "loss": 0.0948, "step": 83140 }, { "epoch": 1.9387543353443502, "grad_norm": 2.344454765319824, "learning_rate": 3.540306534850463e-07, "loss": 0.106, "step": 83150 }, { "epoch": 1.9389874967211682, "grad_norm": 1.1857171058654785, "learning_rate": 3.539529316669775e-07, "loss": 0.1016, "step": 83160 }, { "epoch": 1.939220658097986, "grad_norm": 1.4179719686508179, "learning_rate": 3.538752098489088e-07, "loss": 0.1126, "step": 83170 }, { "epoch": 1.9394538194748039, "grad_norm": 1.769472360610962, "learning_rate": 3.5379748803084e-07, "loss": 0.102, "step": 83180 }, { "epoch": 1.939686980851622, "grad_norm": 2.2463889122009277, "learning_rate": 3.537197662127712e-07, "loss": 0.1182, "step": 83190 }, { "epoch": 1.93992014222844, "grad_norm": 1.7881509065628052, "learning_rate": 3.5364204439470246e-07, "loss": 0.1036, "step": 83200 }, { "epoch": 1.9401533036052578, "grad_norm": 1.364966630935669, "learning_rate": 3.535643225766337e-07, "loss": 0.1017, "step": 
83210 }, { "epoch": 1.9403864649820757, "grad_norm": 3.7226860523223877, "learning_rate": 3.534866007585649e-07, "loss": 0.1162, "step": 83220 }, { "epoch": 1.9406196263588935, "grad_norm": 1.368767261505127, "learning_rate": 3.5340887894049614e-07, "loss": 0.0985, "step": 83230 }, { "epoch": 1.9408527877357116, "grad_norm": 1.5908129215240479, "learning_rate": 3.533311571224274e-07, "loss": 0.113, "step": 83240 }, { "epoch": 1.9410859491125296, "grad_norm": 2.3628993034362793, "learning_rate": 3.532534353043586e-07, "loss": 0.1053, "step": 83250 }, { "epoch": 1.9413191104893475, "grad_norm": 1.0876154899597168, "learning_rate": 3.531757134862898e-07, "loss": 0.1109, "step": 83260 }, { "epoch": 1.9415522718661653, "grad_norm": 2.940321922302246, "learning_rate": 3.530979916682211e-07, "loss": 0.1041, "step": 83270 }, { "epoch": 1.9417854332429831, "grad_norm": 1.1106116771697998, "learning_rate": 3.530202698501523e-07, "loss": 0.1073, "step": 83280 }, { "epoch": 1.9420185946198012, "grad_norm": 2.026425838470459, "learning_rate": 3.5294254803208356e-07, "loss": 0.113, "step": 83290 }, { "epoch": 1.9422517559966193, "grad_norm": 1.5779441595077515, "learning_rate": 3.5286482621401477e-07, "loss": 0.104, "step": 83300 }, { "epoch": 1.942484917373437, "grad_norm": 2.6173603534698486, "learning_rate": 3.52787104395946e-07, "loss": 0.0995, "step": 83310 }, { "epoch": 1.942718078750255, "grad_norm": 2.505279779434204, "learning_rate": 3.5270938257787724e-07, "loss": 0.0965, "step": 83320 }, { "epoch": 1.942951240127073, "grad_norm": 2.019893169403076, "learning_rate": 3.526316607598085e-07, "loss": 0.1042, "step": 83330 }, { "epoch": 1.943184401503891, "grad_norm": 3.3615856170654297, "learning_rate": 3.5255393894173976e-07, "loss": 0.1036, "step": 83340 }, { "epoch": 1.9434175628807089, "grad_norm": 1.5109140872955322, "learning_rate": 3.524762171236709e-07, "loss": 0.1232, "step": 83350 }, { "epoch": 1.9436507242575267, "grad_norm": 1.7021420001983643, "learning_rate": 
3.523984953056022e-07, "loss": 0.0984, "step": 83360 }, { "epoch": 1.9438838856343446, "grad_norm": 1.520620346069336, "learning_rate": 3.5232077348753344e-07, "loss": 0.1016, "step": 83370 }, { "epoch": 1.9441170470111626, "grad_norm": 1.4811933040618896, "learning_rate": 3.5224305166946465e-07, "loss": 0.1086, "step": 83380 }, { "epoch": 1.9443502083879807, "grad_norm": 1.6691066026687622, "learning_rate": 3.5216532985139586e-07, "loss": 0.1053, "step": 83390 }, { "epoch": 1.9445833697647985, "grad_norm": 1.4068061113357544, "learning_rate": 3.520876080333271e-07, "loss": 0.1071, "step": 83400 }, { "epoch": 1.9448165311416163, "grad_norm": 1.0679508447647095, "learning_rate": 3.5200988621525833e-07, "loss": 0.0939, "step": 83410 }, { "epoch": 1.9450496925184342, "grad_norm": 1.5282596349716187, "learning_rate": 3.519321643971896e-07, "loss": 0.1062, "step": 83420 }, { "epoch": 1.9452828538952522, "grad_norm": 2.7951183319091797, "learning_rate": 3.518544425791208e-07, "loss": 0.109, "step": 83430 }, { "epoch": 1.9455160152720703, "grad_norm": 2.0139498710632324, "learning_rate": 3.51776720761052e-07, "loss": 0.1056, "step": 83440 }, { "epoch": 1.9457491766488881, "grad_norm": 1.9026027917861938, "learning_rate": 3.516989989429833e-07, "loss": 0.1158, "step": 83450 }, { "epoch": 1.945982338025706, "grad_norm": 1.2366225719451904, "learning_rate": 3.5162127712491454e-07, "loss": 0.1054, "step": 83460 }, { "epoch": 1.9462154994025238, "grad_norm": 2.820096969604492, "learning_rate": 3.515435553068457e-07, "loss": 0.1132, "step": 83470 }, { "epoch": 1.9464486607793419, "grad_norm": 1.0745129585266113, "learning_rate": 3.5146583348877696e-07, "loss": 0.0979, "step": 83480 }, { "epoch": 1.94668182215616, "grad_norm": 1.7007142305374146, "learning_rate": 3.513881116707082e-07, "loss": 0.0973, "step": 83490 }, { "epoch": 1.9469149835329778, "grad_norm": 1.912292718887329, "learning_rate": 3.5131038985263943e-07, "loss": 0.1073, "step": 83500 }, { "epoch": 
1.9471481449097956, "grad_norm": 1.4694632291793823, "learning_rate": 3.5123266803457064e-07, "loss": 0.1077, "step": 83510 }, { "epoch": 1.9473813062866137, "grad_norm": 1.2178006172180176, "learning_rate": 3.511549462165019e-07, "loss": 0.1132, "step": 83520 }, { "epoch": 1.9476144676634317, "grad_norm": 1.3686559200286865, "learning_rate": 3.510772243984331e-07, "loss": 0.1158, "step": 83530 }, { "epoch": 1.9478476290402496, "grad_norm": 2.0921664237976074, "learning_rate": 3.5099950258036437e-07, "loss": 0.1028, "step": 83540 }, { "epoch": 1.9480807904170674, "grad_norm": 0.9747083783149719, "learning_rate": 3.509217807622956e-07, "loss": 0.1035, "step": 83550 }, { "epoch": 1.9483139517938852, "grad_norm": 1.3904626369476318, "learning_rate": 3.508440589442268e-07, "loss": 0.0861, "step": 83560 }, { "epoch": 1.9485471131707033, "grad_norm": 1.5805814266204834, "learning_rate": 3.5076633712615805e-07, "loss": 0.1033, "step": 83570 }, { "epoch": 1.9487802745475213, "grad_norm": 3.32002592086792, "learning_rate": 3.506886153080893e-07, "loss": 0.087, "step": 83580 }, { "epoch": 1.9490134359243392, "grad_norm": 2.242435932159424, "learning_rate": 3.5061089349002047e-07, "loss": 0.1021, "step": 83590 }, { "epoch": 1.949246597301157, "grad_norm": 1.9000897407531738, "learning_rate": 3.5053317167195173e-07, "loss": 0.1098, "step": 83600 }, { "epoch": 1.9494797586779748, "grad_norm": 1.0020676851272583, "learning_rate": 3.50455449853883e-07, "loss": 0.0934, "step": 83610 }, { "epoch": 1.949712920054793, "grad_norm": 1.1423896551132202, "learning_rate": 3.503777280358142e-07, "loss": 0.104, "step": 83620 }, { "epoch": 1.949946081431611, "grad_norm": 1.8268802165985107, "learning_rate": 3.503000062177454e-07, "loss": 0.1138, "step": 83630 }, { "epoch": 1.9501792428084288, "grad_norm": 1.3867439031600952, "learning_rate": 3.502222843996767e-07, "loss": 0.1075, "step": 83640 }, { "epoch": 1.9504124041852466, "grad_norm": 2.598621368408203, "learning_rate": 
3.501445625816079e-07, "loss": 0.113, "step": 83650 }, { "epoch": 1.9506455655620647, "grad_norm": 2.871053695678711, "learning_rate": 3.5006684076353915e-07, "loss": 0.1154, "step": 83660 }, { "epoch": 1.9508787269388825, "grad_norm": 2.449063301086426, "learning_rate": 3.4998911894547035e-07, "loss": 0.1053, "step": 83670 }, { "epoch": 1.9511118883157006, "grad_norm": 1.7097511291503906, "learning_rate": 3.4991139712740156e-07, "loss": 0.0974, "step": 83680 }, { "epoch": 1.9513450496925184, "grad_norm": 2.104156732559204, "learning_rate": 3.498336753093328e-07, "loss": 0.1189, "step": 83690 }, { "epoch": 1.9515782110693363, "grad_norm": 1.1289565563201904, "learning_rate": 3.497559534912641e-07, "loss": 0.103, "step": 83700 }, { "epoch": 1.9518113724461543, "grad_norm": 1.4960392713546753, "learning_rate": 3.4967823167319524e-07, "loss": 0.0956, "step": 83710 }, { "epoch": 1.9520445338229724, "grad_norm": 1.8165984153747559, "learning_rate": 3.496005098551265e-07, "loss": 0.1074, "step": 83720 }, { "epoch": 1.9522776951997902, "grad_norm": 1.4745482206344604, "learning_rate": 3.4952278803705777e-07, "loss": 0.1092, "step": 83730 }, { "epoch": 1.952510856576608, "grad_norm": 2.8642406463623047, "learning_rate": 3.49445066218989e-07, "loss": 0.1067, "step": 83740 }, { "epoch": 1.952744017953426, "grad_norm": 2.516481399536133, "learning_rate": 3.493673444009202e-07, "loss": 0.1015, "step": 83750 }, { "epoch": 1.952977179330244, "grad_norm": 2.2225587368011475, "learning_rate": 3.4928962258285145e-07, "loss": 0.1072, "step": 83760 }, { "epoch": 1.953210340707062, "grad_norm": 1.2156569957733154, "learning_rate": 3.492119007647827e-07, "loss": 0.1057, "step": 83770 }, { "epoch": 1.9534435020838798, "grad_norm": 2.494629144668579, "learning_rate": 3.491341789467139e-07, "loss": 0.0952, "step": 83780 }, { "epoch": 1.9536766634606977, "grad_norm": 2.1077115535736084, "learning_rate": 3.4905645712864513e-07, "loss": 0.1042, "step": 83790 }, { "epoch": 1.9539098248375155, 
"grad_norm": 1.680027723312378, "learning_rate": 3.489787353105764e-07, "loss": 0.0944, "step": 83800 }, { "epoch": 1.9541429862143336, "grad_norm": 1.1955145597457886, "learning_rate": 3.489010134925076e-07, "loss": 0.1021, "step": 83810 }, { "epoch": 1.9543761475911516, "grad_norm": 1.412593126296997, "learning_rate": 3.4882329167443886e-07, "loss": 0.1103, "step": 83820 }, { "epoch": 1.9546093089679695, "grad_norm": 2.7805817127227783, "learning_rate": 3.4874556985637007e-07, "loss": 0.1139, "step": 83830 }, { "epoch": 1.9548424703447873, "grad_norm": 1.5278676748275757, "learning_rate": 3.486678480383013e-07, "loss": 0.1083, "step": 83840 }, { "epoch": 1.9550756317216054, "grad_norm": 1.8761028051376343, "learning_rate": 3.4859012622023254e-07, "loss": 0.1066, "step": 83850 }, { "epoch": 1.9553087930984232, "grad_norm": 1.3320428133010864, "learning_rate": 3.485124044021638e-07, "loss": 0.1039, "step": 83860 }, { "epoch": 1.9555419544752413, "grad_norm": 2.344813108444214, "learning_rate": 3.4843468258409496e-07, "loss": 0.1042, "step": 83870 }, { "epoch": 1.955775115852059, "grad_norm": 1.2727323770523071, "learning_rate": 3.483569607660262e-07, "loss": 0.1015, "step": 83880 }, { "epoch": 1.956008277228877, "grad_norm": 1.775550365447998, "learning_rate": 3.482792389479575e-07, "loss": 0.108, "step": 83890 }, { "epoch": 1.956241438605695, "grad_norm": 2.6299285888671875, "learning_rate": 3.482015171298887e-07, "loss": 0.0993, "step": 83900 }, { "epoch": 1.956474599982513, "grad_norm": 2.065030097961426, "learning_rate": 3.481237953118199e-07, "loss": 0.1009, "step": 83910 }, { "epoch": 1.9567077613593309, "grad_norm": 1.9437308311462402, "learning_rate": 3.4804607349375117e-07, "loss": 0.0945, "step": 83920 }, { "epoch": 1.9569409227361487, "grad_norm": 1.4028993844985962, "learning_rate": 3.479683516756824e-07, "loss": 0.1104, "step": 83930 }, { "epoch": 1.9571740841129666, "grad_norm": 1.319812297821045, "learning_rate": 3.4789062985761364e-07, "loss": 
0.1018, "step": 83940 }, { "epoch": 1.9574072454897846, "grad_norm": 2.295483112335205, "learning_rate": 3.4781290803954485e-07, "loss": 0.0943, "step": 83950 }, { "epoch": 1.9576404068666027, "grad_norm": 3.0436856746673584, "learning_rate": 3.4773518622147606e-07, "loss": 0.1152, "step": 83960 }, { "epoch": 1.9578735682434205, "grad_norm": 2.3340890407562256, "learning_rate": 3.476574644034073e-07, "loss": 0.1013, "step": 83970 }, { "epoch": 1.9581067296202384, "grad_norm": 1.8902146816253662, "learning_rate": 3.475797425853386e-07, "loss": 0.1132, "step": 83980 }, { "epoch": 1.9583398909970562, "grad_norm": 1.247426152229309, "learning_rate": 3.4750202076726974e-07, "loss": 0.1022, "step": 83990 }, { "epoch": 1.9585730523738742, "grad_norm": 2.0861871242523193, "learning_rate": 3.47424298949201e-07, "loss": 0.101, "step": 84000 }, { "epoch": 1.9588062137506923, "grad_norm": 1.2921807765960693, "learning_rate": 3.4734657713113226e-07, "loss": 0.1117, "step": 84010 }, { "epoch": 1.9590393751275101, "grad_norm": 1.6314713954925537, "learning_rate": 3.4726885531306347e-07, "loss": 0.1202, "step": 84020 }, { "epoch": 1.959272536504328, "grad_norm": 1.2898041009902954, "learning_rate": 3.471911334949947e-07, "loss": 0.1014, "step": 84030 }, { "epoch": 1.959505697881146, "grad_norm": 2.223280191421509, "learning_rate": 3.4711341167692594e-07, "loss": 0.1083, "step": 84040 }, { "epoch": 1.9597388592579639, "grad_norm": 1.194029688835144, "learning_rate": 3.4703568985885715e-07, "loss": 0.1075, "step": 84050 }, { "epoch": 1.959972020634782, "grad_norm": 1.6776169538497925, "learning_rate": 3.469579680407884e-07, "loss": 0.1103, "step": 84060 }, { "epoch": 1.9602051820115998, "grad_norm": 1.4149222373962402, "learning_rate": 3.468802462227196e-07, "loss": 0.0952, "step": 84070 }, { "epoch": 1.9604383433884176, "grad_norm": 1.7507658004760742, "learning_rate": 3.4680252440465083e-07, "loss": 0.1146, "step": 84080 }, { "epoch": 1.9606715047652357, "grad_norm": 
3.8298792839050293, "learning_rate": 3.467248025865821e-07, "loss": 0.0996, "step": 84090 }, { "epoch": 1.9609046661420537, "grad_norm": 1.6022874116897583, "learning_rate": 3.4664708076851336e-07, "loss": 0.101, "step": 84100 }, { "epoch": 1.9611378275188716, "grad_norm": 1.5384142398834229, "learning_rate": 3.465693589504445e-07, "loss": 0.1052, "step": 84110 }, { "epoch": 1.9613709888956894, "grad_norm": 2.032630443572998, "learning_rate": 3.464916371323758e-07, "loss": 0.1049, "step": 84120 }, { "epoch": 1.9616041502725072, "grad_norm": 1.325505018234253, "learning_rate": 3.4641391531430704e-07, "loss": 0.1045, "step": 84130 }, { "epoch": 1.9618373116493253, "grad_norm": 1.3997972011566162, "learning_rate": 3.4633619349623825e-07, "loss": 0.0931, "step": 84140 }, { "epoch": 1.9620704730261433, "grad_norm": 1.7463418245315552, "learning_rate": 3.4625847167816946e-07, "loss": 0.1047, "step": 84150 }, { "epoch": 1.9623036344029612, "grad_norm": 2.587393045425415, "learning_rate": 3.461807498601007e-07, "loss": 0.1174, "step": 84160 }, { "epoch": 1.962536795779779, "grad_norm": 1.1898106336593628, "learning_rate": 3.4610302804203193e-07, "loss": 0.1022, "step": 84170 }, { "epoch": 1.9627699571565969, "grad_norm": 1.8647491931915283, "learning_rate": 3.460253062239632e-07, "loss": 0.1054, "step": 84180 }, { "epoch": 1.963003118533415, "grad_norm": 2.687105178833008, "learning_rate": 3.459475844058944e-07, "loss": 0.1129, "step": 84190 }, { "epoch": 1.963236279910233, "grad_norm": 1.4522607326507568, "learning_rate": 3.458698625878256e-07, "loss": 0.1067, "step": 84200 }, { "epoch": 1.9634694412870508, "grad_norm": 1.6024726629257202, "learning_rate": 3.4579214076975687e-07, "loss": 0.1047, "step": 84210 }, { "epoch": 1.9637026026638686, "grad_norm": 2.1668241024017334, "learning_rate": 3.4571441895168813e-07, "loss": 0.0899, "step": 84220 }, { "epoch": 1.9639357640406867, "grad_norm": 3.7912282943725586, "learning_rate": 3.4563669713361934e-07, "loss": 0.0963, 
"step": 84230 }, { "epoch": 1.9641689254175045, "grad_norm": 3.74881911277771, "learning_rate": 3.4555897531555055e-07, "loss": 0.1057, "step": 84240 }, { "epoch": 1.9644020867943226, "grad_norm": 1.596024751663208, "learning_rate": 3.454812534974818e-07, "loss": 0.093, "step": 84250 }, { "epoch": 1.9646352481711404, "grad_norm": 1.1667242050170898, "learning_rate": 3.454035316794131e-07, "loss": 0.1081, "step": 84260 }, { "epoch": 1.9648684095479583, "grad_norm": 1.60555899143219, "learning_rate": 3.4532580986134423e-07, "loss": 0.109, "step": 84270 }, { "epoch": 1.9651015709247763, "grad_norm": 2.075122833251953, "learning_rate": 3.452480880432755e-07, "loss": 0.0978, "step": 84280 }, { "epoch": 1.9653347323015944, "grad_norm": 1.6792172193527222, "learning_rate": 3.4517036622520675e-07, "loss": 0.0997, "step": 84290 }, { "epoch": 1.9655678936784122, "grad_norm": 2.3379905223846436, "learning_rate": 3.4509264440713796e-07, "loss": 0.1074, "step": 84300 }, { "epoch": 1.96580105505523, "grad_norm": 3.0261728763580322, "learning_rate": 3.4501492258906917e-07, "loss": 0.097, "step": 84310 }, { "epoch": 1.966034216432048, "grad_norm": 2.2481746673583984, "learning_rate": 3.4493720077100044e-07, "loss": 0.1056, "step": 84320 }, { "epoch": 1.966267377808866, "grad_norm": 1.287794828414917, "learning_rate": 3.4485947895293164e-07, "loss": 0.1106, "step": 84330 }, { "epoch": 1.966500539185684, "grad_norm": 2.6130712032318115, "learning_rate": 3.447817571348629e-07, "loss": 0.1137, "step": 84340 }, { "epoch": 1.9667337005625019, "grad_norm": 1.1956020593643188, "learning_rate": 3.447040353167941e-07, "loss": 0.1009, "step": 84350 }, { "epoch": 1.9669668619393197, "grad_norm": 1.4411957263946533, "learning_rate": 3.446263134987253e-07, "loss": 0.1058, "step": 84360 }, { "epoch": 1.9672000233161375, "grad_norm": 1.2199536561965942, "learning_rate": 3.445485916806566e-07, "loss": 0.1017, "step": 84370 }, { "epoch": 1.9674331846929556, "grad_norm": 2.920077085494995, 
"learning_rate": 3.4447086986258785e-07, "loss": 0.1091, "step": 84380 }, { "epoch": 1.9676663460697736, "grad_norm": 1.586058497428894, "learning_rate": 3.44393148044519e-07, "loss": 0.1142, "step": 84390 }, { "epoch": 1.9678995074465915, "grad_norm": 3.0471689701080322, "learning_rate": 3.4431542622645027e-07, "loss": 0.1127, "step": 84400 }, { "epoch": 1.9681326688234093, "grad_norm": 1.976297378540039, "learning_rate": 3.4423770440838153e-07, "loss": 0.0973, "step": 84410 }, { "epoch": 1.9683658302002274, "grad_norm": 1.7329912185668945, "learning_rate": 3.4415998259031274e-07, "loss": 0.1101, "step": 84420 }, { "epoch": 1.9685989915770452, "grad_norm": 1.2307885885238647, "learning_rate": 3.4408226077224395e-07, "loss": 0.0995, "step": 84430 }, { "epoch": 1.9688321529538633, "grad_norm": 1.0204020738601685, "learning_rate": 3.440045389541752e-07, "loss": 0.101, "step": 84440 }, { "epoch": 1.969065314330681, "grad_norm": 1.3820116519927979, "learning_rate": 3.439268171361064e-07, "loss": 0.1103, "step": 84450 }, { "epoch": 1.969298475707499, "grad_norm": 1.4674922227859497, "learning_rate": 3.438490953180377e-07, "loss": 0.1131, "step": 84460 }, { "epoch": 1.969531637084317, "grad_norm": 1.6570804119110107, "learning_rate": 3.437713734999689e-07, "loss": 0.0986, "step": 84470 }, { "epoch": 1.969764798461135, "grad_norm": 1.9644285440444946, "learning_rate": 3.436936516819001e-07, "loss": 0.1046, "step": 84480 }, { "epoch": 1.969997959837953, "grad_norm": 1.4706209897994995, "learning_rate": 3.4361592986383136e-07, "loss": 0.0934, "step": 84490 }, { "epoch": 1.9702311212147707, "grad_norm": 1.8720849752426147, "learning_rate": 3.435382080457626e-07, "loss": 0.1117, "step": 84500 }, { "epoch": 1.9704642825915886, "grad_norm": 1.9960222244262695, "learning_rate": 3.434604862276938e-07, "loss": 0.1168, "step": 84510 }, { "epoch": 1.9706974439684066, "grad_norm": 1.137034296989441, "learning_rate": 3.4338276440962504e-07, "loss": 0.0985, "step": 84520 }, { "epoch": 
1.9709306053452247, "grad_norm": 1.469295859336853, "learning_rate": 3.433050425915563e-07, "loss": 0.1013, "step": 84530 }, { "epoch": 1.9711637667220425, "grad_norm": 1.091758131980896, "learning_rate": 3.432273207734875e-07, "loss": 0.1004, "step": 84540 }, { "epoch": 1.9713969280988604, "grad_norm": 1.0589208602905273, "learning_rate": 3.431495989554187e-07, "loss": 0.1071, "step": 84550 }, { "epoch": 1.9716300894756782, "grad_norm": 1.675071358680725, "learning_rate": 3.4307187713735e-07, "loss": 0.1053, "step": 84560 }, { "epoch": 1.9718632508524963, "grad_norm": 1.3520636558532715, "learning_rate": 3.429941553192812e-07, "loss": 0.113, "step": 84570 }, { "epoch": 1.9720964122293143, "grad_norm": 1.234153151512146, "learning_rate": 3.4291643350121246e-07, "loss": 0.1093, "step": 84580 }, { "epoch": 1.9723295736061321, "grad_norm": 1.610704779624939, "learning_rate": 3.4283871168314367e-07, "loss": 0.1171, "step": 84590 }, { "epoch": 1.97256273498295, "grad_norm": 1.1255238056182861, "learning_rate": 3.427609898650749e-07, "loss": 0.097, "step": 84600 }, { "epoch": 1.972795896359768, "grad_norm": 1.8592594861984253, "learning_rate": 3.4268326804700614e-07, "loss": 0.1099, "step": 84610 }, { "epoch": 1.973029057736586, "grad_norm": 2.9198620319366455, "learning_rate": 3.426055462289374e-07, "loss": 0.1093, "step": 84620 }, { "epoch": 1.973262219113404, "grad_norm": 1.2864761352539062, "learning_rate": 3.4252782441086856e-07, "loss": 0.1116, "step": 84630 }, { "epoch": 1.9734953804902218, "grad_norm": 1.4911251068115234, "learning_rate": 3.424501025927998e-07, "loss": 0.1018, "step": 84640 }, { "epoch": 1.9737285418670396, "grad_norm": 2.0610618591308594, "learning_rate": 3.423723807747311e-07, "loss": 0.1072, "step": 84650 }, { "epoch": 1.9739617032438577, "grad_norm": 1.6588506698608398, "learning_rate": 3.4229465895666234e-07, "loss": 0.0942, "step": 84660 }, { "epoch": 1.9741948646206757, "grad_norm": 1.5535603761672974, "learning_rate": 
3.422169371385935e-07, "loss": 0.0981, "step": 84670 }, { "epoch": 1.9744280259974936, "grad_norm": 1.269498348236084, "learning_rate": 3.4213921532052476e-07, "loss": 0.0992, "step": 84680 }, { "epoch": 1.9746611873743114, "grad_norm": 1.8163295984268188, "learning_rate": 3.42061493502456e-07, "loss": 0.1051, "step": 84690 }, { "epoch": 1.9748943487511292, "grad_norm": 1.6273714303970337, "learning_rate": 3.4198377168438723e-07, "loss": 0.1085, "step": 84700 }, { "epoch": 1.9751275101279473, "grad_norm": 1.8732753992080688, "learning_rate": 3.4190604986631844e-07, "loss": 0.0935, "step": 84710 }, { "epoch": 1.9753606715047654, "grad_norm": 1.624518871307373, "learning_rate": 3.418283280482497e-07, "loss": 0.1081, "step": 84720 }, { "epoch": 1.9755938328815832, "grad_norm": 1.963407278060913, "learning_rate": 3.417506062301809e-07, "loss": 0.1008, "step": 84730 }, { "epoch": 1.975826994258401, "grad_norm": 1.742977499961853, "learning_rate": 3.416728844121122e-07, "loss": 0.0981, "step": 84740 }, { "epoch": 1.9760601556352189, "grad_norm": 3.718451738357544, "learning_rate": 3.415951625940434e-07, "loss": 0.107, "step": 84750 }, { "epoch": 1.976293317012037, "grad_norm": 1.5972024202346802, "learning_rate": 3.415174407759746e-07, "loss": 0.1084, "step": 84760 }, { "epoch": 1.976526478388855, "grad_norm": 2.072467803955078, "learning_rate": 3.4143971895790586e-07, "loss": 0.1181, "step": 84770 }, { "epoch": 1.9767596397656728, "grad_norm": 1.0302175283432007, "learning_rate": 3.413619971398371e-07, "loss": 0.1135, "step": 84780 }, { "epoch": 1.9769928011424907, "grad_norm": 1.3574473857879639, "learning_rate": 3.412842753217683e-07, "loss": 0.108, "step": 84790 }, { "epoch": 1.9772259625193087, "grad_norm": 1.3726664781570435, "learning_rate": 3.4120655350369954e-07, "loss": 0.1075, "step": 84800 }, { "epoch": 1.9774591238961268, "grad_norm": 1.7090946435928345, "learning_rate": 3.411288316856308e-07, "loss": 0.0999, "step": 84810 }, { "epoch": 1.9776922852729446, 
"grad_norm": 1.175571322441101, "learning_rate": 3.41051109867562e-07, "loss": 0.0961, "step": 84820 }, { "epoch": 1.9779254466497624, "grad_norm": 2.784749984741211, "learning_rate": 3.409733880494932e-07, "loss": 0.1021, "step": 84830 }, { "epoch": 1.9781586080265803, "grad_norm": 1.7827317714691162, "learning_rate": 3.408956662314245e-07, "loss": 0.1175, "step": 84840 }, { "epoch": 1.9783917694033983, "grad_norm": 1.5464975833892822, "learning_rate": 3.408179444133557e-07, "loss": 0.1076, "step": 84850 }, { "epoch": 1.9786249307802164, "grad_norm": 2.10771107673645, "learning_rate": 3.4074022259528695e-07, "loss": 0.0967, "step": 84860 }, { "epoch": 1.9788580921570342, "grad_norm": 1.5098507404327393, "learning_rate": 3.4066250077721816e-07, "loss": 0.1043, "step": 84870 }, { "epoch": 1.979091253533852, "grad_norm": 3.7799084186553955, "learning_rate": 3.4058477895914937e-07, "loss": 0.1094, "step": 84880 }, { "epoch": 1.97932441491067, "grad_norm": 1.2660309076309204, "learning_rate": 3.4050705714108063e-07, "loss": 0.1075, "step": 84890 }, { "epoch": 1.979557576287488, "grad_norm": 1.4157451391220093, "learning_rate": 3.404293353230119e-07, "loss": 0.107, "step": 84900 }, { "epoch": 1.979790737664306, "grad_norm": 1.4895784854888916, "learning_rate": 3.4035161350494305e-07, "loss": 0.1132, "step": 84910 }, { "epoch": 1.9800238990411239, "grad_norm": 2.029604911804199, "learning_rate": 3.402738916868743e-07, "loss": 0.1072, "step": 84920 }, { "epoch": 1.9802570604179417, "grad_norm": 4.325341701507568, "learning_rate": 3.401961698688056e-07, "loss": 0.1058, "step": 84930 }, { "epoch": 1.9804902217947598, "grad_norm": 2.7581071853637695, "learning_rate": 3.401184480507368e-07, "loss": 0.1053, "step": 84940 }, { "epoch": 1.9807233831715776, "grad_norm": 1.4629175662994385, "learning_rate": 3.40040726232668e-07, "loss": 0.114, "step": 84950 }, { "epoch": 1.9809565445483956, "grad_norm": 2.1693289279937744, "learning_rate": 3.3996300441459925e-07, "loss": 0.0992, 
"step": 84960 }, { "epoch": 1.9811897059252135, "grad_norm": 1.481929063796997, "learning_rate": 3.3988528259653046e-07, "loss": 0.1064, "step": 84970 }, { "epoch": 1.9814228673020313, "grad_norm": 1.3673374652862549, "learning_rate": 3.398075607784617e-07, "loss": 0.114, "step": 84980 }, { "epoch": 1.9816560286788494, "grad_norm": 2.028317928314209, "learning_rate": 3.3972983896039293e-07, "loss": 0.1076, "step": 84990 }, { "epoch": 1.9818891900556674, "grad_norm": 1.1475788354873657, "learning_rate": 3.3965211714232414e-07, "loss": 0.1103, "step": 85000 }, { "epoch": 1.9821223514324853, "grad_norm": 1.3649530410766602, "learning_rate": 3.395743953242554e-07, "loss": 0.0963, "step": 85010 }, { "epoch": 1.982355512809303, "grad_norm": 2.2746341228485107, "learning_rate": 3.3949667350618667e-07, "loss": 0.1021, "step": 85020 }, { "epoch": 1.982588674186121, "grad_norm": 2.427182197570801, "learning_rate": 3.394189516881178e-07, "loss": 0.101, "step": 85030 }, { "epoch": 1.982821835562939, "grad_norm": 2.262101888656616, "learning_rate": 3.393412298700491e-07, "loss": 0.1056, "step": 85040 }, { "epoch": 1.983054996939757, "grad_norm": 2.9513065814971924, "learning_rate": 3.3926350805198035e-07, "loss": 0.0989, "step": 85050 }, { "epoch": 1.983288158316575, "grad_norm": 1.1427559852600098, "learning_rate": 3.391857862339116e-07, "loss": 0.1002, "step": 85060 }, { "epoch": 1.9835213196933927, "grad_norm": 1.8022540807724, "learning_rate": 3.3910806441584277e-07, "loss": 0.0908, "step": 85070 }, { "epoch": 1.9837544810702106, "grad_norm": 1.7789720296859741, "learning_rate": 3.3903034259777403e-07, "loss": 0.1059, "step": 85080 }, { "epoch": 1.9839876424470286, "grad_norm": 3.162290096282959, "learning_rate": 3.389526207797053e-07, "loss": 0.1172, "step": 85090 }, { "epoch": 1.9842208038238467, "grad_norm": 1.409331202507019, "learning_rate": 3.388748989616365e-07, "loss": 0.1028, "step": 85100 }, { "epoch": 1.9844539652006645, "grad_norm": 1.3284879922866821, 
"learning_rate": 3.387971771435677e-07, "loss": 0.1017, "step": 85110 }, { "epoch": 1.9846871265774824, "grad_norm": 2.243300437927246, "learning_rate": 3.3871945532549897e-07, "loss": 0.1103, "step": 85120 }, { "epoch": 1.9849202879543004, "grad_norm": 1.0638850927352905, "learning_rate": 3.386417335074302e-07, "loss": 0.102, "step": 85130 }, { "epoch": 1.9851534493311183, "grad_norm": 1.2016663551330566, "learning_rate": 3.3856401168936144e-07, "loss": 0.1007, "step": 85140 }, { "epoch": 1.9853866107079363, "grad_norm": 1.3073137998580933, "learning_rate": 3.3848628987129265e-07, "loss": 0.0968, "step": 85150 }, { "epoch": 1.9856197720847542, "grad_norm": 2.1094348430633545, "learning_rate": 3.3840856805322386e-07, "loss": 0.1094, "step": 85160 }, { "epoch": 1.985852933461572, "grad_norm": 1.2396782636642456, "learning_rate": 3.383308462351551e-07, "loss": 0.1054, "step": 85170 }, { "epoch": 1.98608609483839, "grad_norm": 1.9140377044677734, "learning_rate": 3.382531244170864e-07, "loss": 0.1004, "step": 85180 }, { "epoch": 1.986319256215208, "grad_norm": 1.34507155418396, "learning_rate": 3.3817540259901754e-07, "loss": 0.1092, "step": 85190 }, { "epoch": 1.986552417592026, "grad_norm": 1.185144305229187, "learning_rate": 3.380976807809488e-07, "loss": 0.1077, "step": 85200 }, { "epoch": 1.9867855789688438, "grad_norm": 3.383486747741699, "learning_rate": 3.3801995896288007e-07, "loss": 0.1056, "step": 85210 }, { "epoch": 1.9870187403456616, "grad_norm": 2.171154737472534, "learning_rate": 3.379422371448113e-07, "loss": 0.1006, "step": 85220 }, { "epoch": 1.9872519017224797, "grad_norm": 2.856701135635376, "learning_rate": 3.378645153267425e-07, "loss": 0.1013, "step": 85230 }, { "epoch": 1.9874850630992977, "grad_norm": 1.3254663944244385, "learning_rate": 3.3778679350867375e-07, "loss": 0.1079, "step": 85240 }, { "epoch": 1.9877182244761156, "grad_norm": 1.6280303001403809, "learning_rate": 3.3770907169060496e-07, "loss": 0.1135, "step": 85250 }, { "epoch": 
1.9879513858529334, "grad_norm": 1.479339599609375, "learning_rate": 3.376313498725362e-07, "loss": 0.1001, "step": 85260 }, { "epoch": 1.9881845472297512, "grad_norm": 1.9002609252929688, "learning_rate": 3.3755362805446743e-07, "loss": 0.1011, "step": 85270 }, { "epoch": 1.9884177086065693, "grad_norm": 1.9949843883514404, "learning_rate": 3.3747590623639864e-07, "loss": 0.1035, "step": 85280 }, { "epoch": 1.9886508699833874, "grad_norm": 1.856567621231079, "learning_rate": 3.373981844183299e-07, "loss": 0.1041, "step": 85290 }, { "epoch": 1.9888840313602052, "grad_norm": 2.426711320877075, "learning_rate": 3.3732046260026116e-07, "loss": 0.1029, "step": 85300 }, { "epoch": 1.989117192737023, "grad_norm": 1.324890375137329, "learning_rate": 3.372427407821923e-07, "loss": 0.0963, "step": 85310 }, { "epoch": 1.989350354113841, "grad_norm": 1.181848168373108, "learning_rate": 3.371650189641236e-07, "loss": 0.1036, "step": 85320 }, { "epoch": 1.989583515490659, "grad_norm": 1.5015029907226562, "learning_rate": 3.3708729714605484e-07, "loss": 0.1211, "step": 85330 }, { "epoch": 1.989816676867477, "grad_norm": 1.7029085159301758, "learning_rate": 3.3700957532798605e-07, "loss": 0.1084, "step": 85340 }, { "epoch": 1.9900498382442948, "grad_norm": 1.3374937772750854, "learning_rate": 3.3693185350991726e-07, "loss": 0.1022, "step": 85350 }, { "epoch": 1.9902829996211127, "grad_norm": 2.733013153076172, "learning_rate": 3.368541316918485e-07, "loss": 0.1074, "step": 85360 }, { "epoch": 1.9905161609979307, "grad_norm": 0.9046163558959961, "learning_rate": 3.3677640987377973e-07, "loss": 0.0988, "step": 85370 }, { "epoch": 1.9907493223747488, "grad_norm": 1.5947091579437256, "learning_rate": 3.36698688055711e-07, "loss": 0.101, "step": 85380 }, { "epoch": 1.9909824837515666, "grad_norm": 1.4815102815628052, "learning_rate": 3.366209662376422e-07, "loss": 0.1068, "step": 85390 }, { "epoch": 1.9912156451283844, "grad_norm": 2.338477849960327, "learning_rate": 
3.365432444195734e-07, "loss": 0.1029, "step": 85400 }, { "epoch": 1.9914488065052023, "grad_norm": 1.1484955549240112, "learning_rate": 3.364655226015047e-07, "loss": 0.1008, "step": 85410 }, { "epoch": 1.9916819678820203, "grad_norm": 0.9244183897972107, "learning_rate": 3.3638780078343594e-07, "loss": 0.1104, "step": 85420 }, { "epoch": 1.9919151292588384, "grad_norm": 2.2460310459136963, "learning_rate": 3.363100789653671e-07, "loss": 0.1049, "step": 85430 }, { "epoch": 1.9921482906356562, "grad_norm": 3.339663028717041, "learning_rate": 3.3623235714729836e-07, "loss": 0.1085, "step": 85440 }, { "epoch": 1.992381452012474, "grad_norm": 1.5250481367111206, "learning_rate": 3.361546353292296e-07, "loss": 0.0943, "step": 85450 }, { "epoch": 1.992614613389292, "grad_norm": 1.950549602508545, "learning_rate": 3.360769135111609e-07, "loss": 0.1246, "step": 85460 }, { "epoch": 1.99284777476611, "grad_norm": 2.3117613792419434, "learning_rate": 3.3599919169309204e-07, "loss": 0.1086, "step": 85470 }, { "epoch": 1.993080936142928, "grad_norm": 2.372051954269409, "learning_rate": 3.359214698750233e-07, "loss": 0.1053, "step": 85480 }, { "epoch": 1.9933140975197459, "grad_norm": 1.3377716541290283, "learning_rate": 3.3584374805695456e-07, "loss": 0.0968, "step": 85490 }, { "epoch": 1.9935472588965637, "grad_norm": 1.4469786882400513, "learning_rate": 3.3576602623888577e-07, "loss": 0.1148, "step": 85500 }, { "epoch": 1.9937804202733818, "grad_norm": 1.419437050819397, "learning_rate": 3.35688304420817e-07, "loss": 0.1054, "step": 85510 }, { "epoch": 1.9940135816501996, "grad_norm": 2.1384358406066895, "learning_rate": 3.3561058260274824e-07, "loss": 0.1049, "step": 85520 }, { "epoch": 1.9942467430270177, "grad_norm": 2.243596315383911, "learning_rate": 3.3553286078467945e-07, "loss": 0.1069, "step": 85530 }, { "epoch": 1.9944799044038355, "grad_norm": 1.049455165863037, "learning_rate": 3.354551389666107e-07, "loss": 0.0971, "step": 85540 }, { "epoch": 1.9947130657806533, 
"grad_norm": 1.3905194997787476, "learning_rate": 3.353774171485419e-07, "loss": 0.1031, "step": 85550 }, { "epoch": 1.9949462271574714, "grad_norm": 1.0409414768218994, "learning_rate": 3.3529969533047313e-07, "loss": 0.1018, "step": 85560 }, { "epoch": 1.9951793885342894, "grad_norm": 1.311694860458374, "learning_rate": 3.352219735124044e-07, "loss": 0.1137, "step": 85570 }, { "epoch": 1.9954125499111073, "grad_norm": 2.052943468093872, "learning_rate": 3.3514425169433565e-07, "loss": 0.1026, "step": 85580 }, { "epoch": 1.9956457112879251, "grad_norm": 1.0360783338546753, "learning_rate": 3.350665298762668e-07, "loss": 0.0979, "step": 85590 }, { "epoch": 1.995878872664743, "grad_norm": 2.5251219272613525, "learning_rate": 3.3498880805819807e-07, "loss": 0.1028, "step": 85600 }, { "epoch": 1.996112034041561, "grad_norm": 1.2814661264419556, "learning_rate": 3.3491108624012934e-07, "loss": 0.0994, "step": 85610 }, { "epoch": 1.996345195418379, "grad_norm": 2.229102849960327, "learning_rate": 3.3483336442206054e-07, "loss": 0.104, "step": 85620 }, { "epoch": 1.996578356795197, "grad_norm": 1.3042054176330566, "learning_rate": 3.3475564260399175e-07, "loss": 0.0953, "step": 85630 }, { "epoch": 1.9968115181720147, "grad_norm": 2.4541821479797363, "learning_rate": 3.34677920785923e-07, "loss": 0.1031, "step": 85640 }, { "epoch": 1.9970446795488326, "grad_norm": 1.3147038221359253, "learning_rate": 3.346001989678542e-07, "loss": 0.1118, "step": 85650 }, { "epoch": 1.9972778409256506, "grad_norm": 1.3711247444152832, "learning_rate": 3.345224771497855e-07, "loss": 0.1054, "step": 85660 }, { "epoch": 1.9975110023024687, "grad_norm": 1.6162632703781128, "learning_rate": 3.344447553317167e-07, "loss": 0.1111, "step": 85670 }, { "epoch": 1.9977441636792865, "grad_norm": 1.6097700595855713, "learning_rate": 3.343670335136479e-07, "loss": 0.1043, "step": 85680 }, { "epoch": 1.9979773250561044, "grad_norm": 1.5415867567062378, "learning_rate": 3.3428931169557917e-07, "loss": 
0.0982, "step": 85690 }, { "epoch": 1.9982104864329224, "grad_norm": 1.691131353378296, "learning_rate": 3.3421158987751043e-07, "loss": 0.1024, "step": 85700 }, { "epoch": 1.9984436478097403, "grad_norm": 3.53484845161438, "learning_rate": 3.341338680594416e-07, "loss": 0.1065, "step": 85710 }, { "epoch": 1.9986768091865583, "grad_norm": 2.167238712310791, "learning_rate": 3.3405614624137285e-07, "loss": 0.1111, "step": 85720 }, { "epoch": 1.9989099705633762, "grad_norm": 2.087697982788086, "learning_rate": 3.339784244233041e-07, "loss": 0.1051, "step": 85730 }, { "epoch": 1.999143131940194, "grad_norm": 2.6670141220092773, "learning_rate": 3.339007026052353e-07, "loss": 0.1031, "step": 85740 }, { "epoch": 1.999376293317012, "grad_norm": 1.1294676065444946, "learning_rate": 3.338229807871666e-07, "loss": 0.1118, "step": 85750 }, { "epoch": 1.9996094546938301, "grad_norm": 3.4243531227111816, "learning_rate": 3.337452589690978e-07, "loss": 0.1225, "step": 85760 }, { "epoch": 1.999842616070648, "grad_norm": 1.6660847663879395, "learning_rate": 3.33667537151029e-07, "loss": 0.0925, "step": 85770 }, { "epoch": 2.0000932645507272, "grad_norm": 2.4559993743896484, "learning_rate": 3.3358981533296026e-07, "loss": 0.1036, "step": 85780 }, { "epoch": 2.000326425927545, "grad_norm": 1.425004243850708, "learning_rate": 3.335120935148915e-07, "loss": 0.1063, "step": 85790 }, { "epoch": 2.000559587304363, "grad_norm": 1.4679527282714844, "learning_rate": 3.334343716968227e-07, "loss": 0.1039, "step": 85800 }, { "epoch": 2.000792748681181, "grad_norm": 1.5332884788513184, "learning_rate": 3.3335664987875394e-07, "loss": 0.1075, "step": 85810 }, { "epoch": 2.001025910057999, "grad_norm": 1.46601402759552, "learning_rate": 3.332789280606852e-07, "loss": 0.1086, "step": 85820 }, { "epoch": 2.001259071434817, "grad_norm": 1.2337614297866821, "learning_rate": 3.332012062426164e-07, "loss": 0.1049, "step": 85830 }, { "epoch": 2.0014922328116347, "grad_norm": 1.3842517137527466, 
"learning_rate": 3.331234844245476e-07, "loss": 0.1066, "step": 85840 }, { "epoch": 2.0017253941884525, "grad_norm": 1.693461537361145, "learning_rate": 3.330457626064789e-07, "loss": 0.0968, "step": 85850 }, { "epoch": 2.001958555565271, "grad_norm": 1.4266060590744019, "learning_rate": 3.329680407884101e-07, "loss": 0.1082, "step": 85860 }, { "epoch": 2.0021917169420886, "grad_norm": 2.4704856872558594, "learning_rate": 3.3289031897034136e-07, "loss": 0.1019, "step": 85870 }, { "epoch": 2.0024248783189065, "grad_norm": 1.0772258043289185, "learning_rate": 3.3281259715227257e-07, "loss": 0.1092, "step": 85880 }, { "epoch": 2.0026580396957243, "grad_norm": 2.2694039344787598, "learning_rate": 3.327348753342038e-07, "loss": 0.1011, "step": 85890 }, { "epoch": 2.002891201072542, "grad_norm": 2.0222322940826416, "learning_rate": 3.3265715351613504e-07, "loss": 0.0962, "step": 85900 }, { "epoch": 2.0031243624493604, "grad_norm": 1.8253066539764404, "learning_rate": 3.325794316980663e-07, "loss": 0.1135, "step": 85910 }, { "epoch": 2.0033575238261783, "grad_norm": 1.4216620922088623, "learning_rate": 3.325017098799975e-07, "loss": 0.1047, "step": 85920 }, { "epoch": 2.003590685202996, "grad_norm": 2.293649673461914, "learning_rate": 3.324239880619287e-07, "loss": 0.11, "step": 85930 }, { "epoch": 2.003823846579814, "grad_norm": 2.3256492614746094, "learning_rate": 3.3234626624386e-07, "loss": 0.1069, "step": 85940 }, { "epoch": 2.0040570079566318, "grad_norm": 1.1787787675857544, "learning_rate": 3.3226854442579124e-07, "loss": 0.1105, "step": 85950 }, { "epoch": 2.00429016933345, "grad_norm": 2.7474284172058105, "learning_rate": 3.321908226077224e-07, "loss": 0.1034, "step": 85960 }, { "epoch": 2.004523330710268, "grad_norm": 1.648973822593689, "learning_rate": 3.3211310078965366e-07, "loss": 0.0924, "step": 85970 }, { "epoch": 2.0047564920870857, "grad_norm": 1.3479132652282715, "learning_rate": 3.320353789715849e-07, "loss": 0.1024, "step": 85980 }, { "epoch": 
2.0049896534639036, "grad_norm": 0.9706793427467346, "learning_rate": 3.3195765715351613e-07, "loss": 0.1019, "step": 85990 }, { "epoch": 2.005222814840722, "grad_norm": 1.8650754690170288, "learning_rate": 3.3187993533544734e-07, "loss": 0.1112, "step": 86000 }, { "epoch": 2.0054559762175397, "grad_norm": 1.2558451890945435, "learning_rate": 3.318022135173786e-07, "loss": 0.0923, "step": 86010 }, { "epoch": 2.0056891375943575, "grad_norm": 3.8072543144226074, "learning_rate": 3.317244916993098e-07, "loss": 0.1023, "step": 86020 }, { "epoch": 2.0059222989711754, "grad_norm": 2.5044026374816895, "learning_rate": 3.316467698812411e-07, "loss": 0.1157, "step": 86030 }, { "epoch": 2.006155460347993, "grad_norm": 1.8952914476394653, "learning_rate": 3.315690480631723e-07, "loss": 0.0947, "step": 86040 }, { "epoch": 2.0063886217248115, "grad_norm": 1.7168241739273071, "learning_rate": 3.314913262451035e-07, "loss": 0.0964, "step": 86050 }, { "epoch": 2.0066217831016293, "grad_norm": 2.527761697769165, "learning_rate": 3.3141360442703476e-07, "loss": 0.1076, "step": 86060 }, { "epoch": 2.006854944478447, "grad_norm": 1.7597379684448242, "learning_rate": 3.31335882608966e-07, "loss": 0.1047, "step": 86070 }, { "epoch": 2.007088105855265, "grad_norm": 1.7302687168121338, "learning_rate": 3.312581607908972e-07, "loss": 0.1086, "step": 86080 }, { "epoch": 2.007321267232083, "grad_norm": 1.6206623315811157, "learning_rate": 3.3118043897282844e-07, "loss": 0.1101, "step": 86090 }, { "epoch": 2.007554428608901, "grad_norm": 1.1336370706558228, "learning_rate": 3.311027171547597e-07, "loss": 0.1031, "step": 86100 }, { "epoch": 2.007787589985719, "grad_norm": 1.4896786212921143, "learning_rate": 3.310249953366909e-07, "loss": 0.0937, "step": 86110 }, { "epoch": 2.0080207513625368, "grad_norm": 1.2542448043823242, "learning_rate": 3.309472735186221e-07, "loss": 0.0987, "step": 86120 }, { "epoch": 2.0082539127393546, "grad_norm": 1.8406026363372803, "learning_rate": 
3.308695517005534e-07, "loss": 0.1069, "step": 86130 }, { "epoch": 2.0084870741161724, "grad_norm": 1.051518201828003, "learning_rate": 3.307918298824846e-07, "loss": 0.1056, "step": 86140 }, { "epoch": 2.0087202354929907, "grad_norm": 1.4066063165664673, "learning_rate": 3.3071410806441585e-07, "loss": 0.1001, "step": 86150 }, { "epoch": 2.0089533968698086, "grad_norm": 1.3503303527832031, "learning_rate": 3.3063638624634706e-07, "loss": 0.1018, "step": 86160 }, { "epoch": 2.0091865582466264, "grad_norm": 1.5138969421386719, "learning_rate": 3.3055866442827827e-07, "loss": 0.0984, "step": 86170 }, { "epoch": 2.0094197196234442, "grad_norm": 1.1838792562484741, "learning_rate": 3.3048094261020953e-07, "loss": 0.1217, "step": 86180 }, { "epoch": 2.0096528810002625, "grad_norm": 1.1975888013839722, "learning_rate": 3.304032207921408e-07, "loss": 0.0968, "step": 86190 }, { "epoch": 2.0098860423770804, "grad_norm": 1.4723917245864868, "learning_rate": 3.3032549897407195e-07, "loss": 0.1045, "step": 86200 }, { "epoch": 2.010119203753898, "grad_norm": 1.7602338790893555, "learning_rate": 3.302477771560032e-07, "loss": 0.1094, "step": 86210 }, { "epoch": 2.010352365130716, "grad_norm": 1.8031245470046997, "learning_rate": 3.301700553379345e-07, "loss": 0.1046, "step": 86220 }, { "epoch": 2.010585526507534, "grad_norm": 1.0636261701583862, "learning_rate": 3.300923335198657e-07, "loss": 0.1128, "step": 86230 }, { "epoch": 2.010818687884352, "grad_norm": 1.5143346786499023, "learning_rate": 3.300146117017969e-07, "loss": 0.1006, "step": 86240 }, { "epoch": 2.01105184926117, "grad_norm": 2.618086099624634, "learning_rate": 3.2993688988372815e-07, "loss": 0.1028, "step": 86250 }, { "epoch": 2.011285010637988, "grad_norm": 2.4546542167663574, "learning_rate": 3.2985916806565936e-07, "loss": 0.1016, "step": 86260 }, { "epoch": 2.0115181720148056, "grad_norm": 1.3984540700912476, "learning_rate": 3.297814462475906e-07, "loss": 0.1064, "step": 86270 }, { "epoch": 
2.0117513333916235, "grad_norm": 1.1792528629302979, "learning_rate": 3.2970372442952183e-07, "loss": 0.1152, "step": 86280 }, { "epoch": 2.0119844947684418, "grad_norm": 2.872360944747925, "learning_rate": 3.2962600261145304e-07, "loss": 0.115, "step": 86290 }, { "epoch": 2.0122176561452596, "grad_norm": 1.369743824005127, "learning_rate": 3.295482807933843e-07, "loss": 0.0963, "step": 86300 }, { "epoch": 2.0124508175220774, "grad_norm": 2.010070562362671, "learning_rate": 3.2947055897531557e-07, "loss": 0.1084, "step": 86310 }, { "epoch": 2.0126839788988953, "grad_norm": 1.9541345834732056, "learning_rate": 3.293928371572467e-07, "loss": 0.0927, "step": 86320 }, { "epoch": 2.012917140275713, "grad_norm": 1.9914071559906006, "learning_rate": 3.29315115339178e-07, "loss": 0.1095, "step": 86330 }, { "epoch": 2.0131503016525314, "grad_norm": 3.1852235794067383, "learning_rate": 3.2923739352110925e-07, "loss": 0.11, "step": 86340 }, { "epoch": 2.0133834630293492, "grad_norm": 2.3393733501434326, "learning_rate": 3.291596717030405e-07, "loss": 0.0865, "step": 86350 }, { "epoch": 2.013616624406167, "grad_norm": 1.2456814050674438, "learning_rate": 3.2908194988497167e-07, "loss": 0.095, "step": 86360 }, { "epoch": 2.013849785782985, "grad_norm": 2.349712610244751, "learning_rate": 3.2900422806690293e-07, "loss": 0.1047, "step": 86370 }, { "epoch": 2.014082947159803, "grad_norm": 1.9851884841918945, "learning_rate": 3.289265062488342e-07, "loss": 0.1048, "step": 86380 }, { "epoch": 2.014316108536621, "grad_norm": 1.860904574394226, "learning_rate": 3.288487844307654e-07, "loss": 0.0974, "step": 86390 }, { "epoch": 2.014549269913439, "grad_norm": 1.3646328449249268, "learning_rate": 3.287710626126966e-07, "loss": 0.1017, "step": 86400 }, { "epoch": 2.0147824312902567, "grad_norm": 1.1167469024658203, "learning_rate": 3.2869334079462787e-07, "loss": 0.1031, "step": 86410 }, { "epoch": 2.0150155926670745, "grad_norm": 1.5803648233413696, "learning_rate": 
3.286156189765591e-07, "loss": 0.1136, "step": 86420 }, { "epoch": 2.015248754043893, "grad_norm": 1.4862644672393799, "learning_rate": 3.2853789715849034e-07, "loss": 0.0954, "step": 86430 }, { "epoch": 2.0154819154207106, "grad_norm": 2.7633960247039795, "learning_rate": 3.2846017534042155e-07, "loss": 0.0997, "step": 86440 }, { "epoch": 2.0157150767975285, "grad_norm": 3.3077664375305176, "learning_rate": 3.2838245352235276e-07, "loss": 0.1044, "step": 86450 }, { "epoch": 2.0159482381743463, "grad_norm": 2.1077568531036377, "learning_rate": 3.28304731704284e-07, "loss": 0.1024, "step": 86460 }, { "epoch": 2.016181399551164, "grad_norm": 2.34248948097229, "learning_rate": 3.282270098862153e-07, "loss": 0.1095, "step": 86470 }, { "epoch": 2.0164145609279824, "grad_norm": 1.505020022392273, "learning_rate": 3.2814928806814644e-07, "loss": 0.0975, "step": 86480 }, { "epoch": 2.0166477223048003, "grad_norm": 1.509212851524353, "learning_rate": 3.280715662500777e-07, "loss": 0.0975, "step": 86490 }, { "epoch": 2.016880883681618, "grad_norm": 1.930509328842163, "learning_rate": 3.2799384443200897e-07, "loss": 0.1153, "step": 86500 }, { "epoch": 2.017114045058436, "grad_norm": 1.5165467262268066, "learning_rate": 3.279161226139402e-07, "loss": 0.0982, "step": 86510 }, { "epoch": 2.017347206435254, "grad_norm": 1.1724703311920166, "learning_rate": 3.278384007958714e-07, "loss": 0.1056, "step": 86520 }, { "epoch": 2.017580367812072, "grad_norm": 2.041748523712158, "learning_rate": 3.2776067897780265e-07, "loss": 0.0996, "step": 86530 }, { "epoch": 2.01781352918889, "grad_norm": 1.1488932371139526, "learning_rate": 3.2768295715973386e-07, "loss": 0.0998, "step": 86540 }, { "epoch": 2.0180466905657077, "grad_norm": 1.2788532972335815, "learning_rate": 3.276052353416651e-07, "loss": 0.0945, "step": 86550 }, { "epoch": 2.0182798519425256, "grad_norm": 1.3538110256195068, "learning_rate": 3.2752751352359633e-07, "loss": 0.1088, "step": 86560 }, { "epoch": 2.018513013319344, 
"grad_norm": 1.4767934083938599, "learning_rate": 3.2744979170552754e-07, "loss": 0.0983, "step": 86570 }, { "epoch": 2.0187461746961617, "grad_norm": 2.419625997543335, "learning_rate": 3.273720698874588e-07, "loss": 0.1076, "step": 86580 }, { "epoch": 2.0189793360729795, "grad_norm": 2.4129817485809326, "learning_rate": 3.2729434806939006e-07, "loss": 0.1232, "step": 86590 }, { "epoch": 2.0192124974497974, "grad_norm": 1.798659086227417, "learning_rate": 3.272166262513212e-07, "loss": 0.1029, "step": 86600 }, { "epoch": 2.019445658826615, "grad_norm": 1.2932162284851074, "learning_rate": 3.271389044332525e-07, "loss": 0.1001, "step": 86610 }, { "epoch": 2.0196788202034335, "grad_norm": 1.5208662748336792, "learning_rate": 3.2706118261518374e-07, "loss": 0.1102, "step": 86620 }, { "epoch": 2.0199119815802513, "grad_norm": 1.5883393287658691, "learning_rate": 3.269912329789218e-07, "loss": 0.1102, "step": 86630 }, { "epoch": 2.020145142957069, "grad_norm": 3.723271131515503, "learning_rate": 3.269135111608531e-07, "loss": 0.1075, "step": 86640 }, { "epoch": 2.020378304333887, "grad_norm": 1.3827012777328491, "learning_rate": 3.268357893427843e-07, "loss": 0.1018, "step": 86650 }, { "epoch": 2.020611465710705, "grad_norm": 1.390418529510498, "learning_rate": 3.267580675247155e-07, "loss": 0.1132, "step": 86660 }, { "epoch": 2.020844627087523, "grad_norm": 1.2230823040008545, "learning_rate": 3.2668034570664676e-07, "loss": 0.1111, "step": 86670 }, { "epoch": 2.021077788464341, "grad_norm": 2.6441545486450195, "learning_rate": 3.26602623888578e-07, "loss": 0.1156, "step": 86680 }, { "epoch": 2.0213109498411588, "grad_norm": 2.466446876525879, "learning_rate": 3.265249020705092e-07, "loss": 0.097, "step": 86690 }, { "epoch": 2.0215441112179766, "grad_norm": 2.1624321937561035, "learning_rate": 3.2644718025244044e-07, "loss": 0.1105, "step": 86700 }, { "epoch": 2.021777272594795, "grad_norm": 1.961228847503662, "learning_rate": 3.263694584343717e-07, "loss": 0.0979, 
"step": 86710 }, { "epoch": 2.0220104339716127, "grad_norm": 1.4191899299621582, "learning_rate": 3.2629173661630296e-07, "loss": 0.0998, "step": 86720 }, { "epoch": 2.0222435953484306, "grad_norm": 1.5431019067764282, "learning_rate": 3.262140147982341e-07, "loss": 0.0993, "step": 86730 }, { "epoch": 2.0224767567252484, "grad_norm": 1.749741554260254, "learning_rate": 3.261362929801654e-07, "loss": 0.0962, "step": 86740 }, { "epoch": 2.0227099181020662, "grad_norm": 1.288257360458374, "learning_rate": 3.2605857116209664e-07, "loss": 0.111, "step": 86750 }, { "epoch": 2.0229430794788845, "grad_norm": 1.942943811416626, "learning_rate": 3.2598084934402785e-07, "loss": 0.1086, "step": 86760 }, { "epoch": 2.0231762408557024, "grad_norm": 1.6567312479019165, "learning_rate": 3.2590312752595906e-07, "loss": 0.1116, "step": 86770 }, { "epoch": 2.02340940223252, "grad_norm": 1.460821509361267, "learning_rate": 3.258254057078903e-07, "loss": 0.1007, "step": 86780 }, { "epoch": 2.023642563609338, "grad_norm": 1.5101817846298218, "learning_rate": 3.2574768388982153e-07, "loss": 0.1088, "step": 86790 }, { "epoch": 2.023875724986156, "grad_norm": 1.5819823741912842, "learning_rate": 3.256699620717528e-07, "loss": 0.1068, "step": 86800 }, { "epoch": 2.024108886362974, "grad_norm": 1.573563814163208, "learning_rate": 3.25592240253684e-07, "loss": 0.0996, "step": 86810 }, { "epoch": 2.024342047739792, "grad_norm": 2.2100515365600586, "learning_rate": 3.255145184356152e-07, "loss": 0.1017, "step": 86820 }, { "epoch": 2.02457520911661, "grad_norm": 1.1903996467590332, "learning_rate": 3.254367966175465e-07, "loss": 0.1015, "step": 86830 }, { "epoch": 2.0248083704934277, "grad_norm": 2.164163827896118, "learning_rate": 3.2535907479947774e-07, "loss": 0.1083, "step": 86840 }, { "epoch": 2.0250415318702455, "grad_norm": 1.6615192890167236, "learning_rate": 3.252813529814089e-07, "loss": 0.0986, "step": 86850 }, { "epoch": 2.0252746932470638, "grad_norm": 1.1612167358398438, 
"learning_rate": 3.2520363116334016e-07, "loss": 0.1045, "step": 86860 }, { "epoch": 2.0255078546238816, "grad_norm": 1.9275680780410767, "learning_rate": 3.251259093452714e-07, "loss": 0.1052, "step": 86870 }, { "epoch": 2.0257410160006994, "grad_norm": 1.3130624294281006, "learning_rate": 3.2504818752720263e-07, "loss": 0.1037, "step": 86880 }, { "epoch": 2.0259741773775173, "grad_norm": 1.047440528869629, "learning_rate": 3.2497046570913384e-07, "loss": 0.0856, "step": 86890 }, { "epoch": 2.0262073387543356, "grad_norm": 1.1341753005981445, "learning_rate": 3.248927438910651e-07, "loss": 0.1051, "step": 86900 }, { "epoch": 2.0264405001311534, "grad_norm": 1.7971898317337036, "learning_rate": 3.248150220729963e-07, "loss": 0.1027, "step": 86910 }, { "epoch": 2.0266736615079712, "grad_norm": 1.6135993003845215, "learning_rate": 3.2473730025492757e-07, "loss": 0.0976, "step": 86920 }, { "epoch": 2.026906822884789, "grad_norm": 2.358020544052124, "learning_rate": 3.246595784368588e-07, "loss": 0.109, "step": 86930 }, { "epoch": 2.027139984261607, "grad_norm": 1.0836868286132812, "learning_rate": 3.2458185661879e-07, "loss": 0.1027, "step": 86940 }, { "epoch": 2.027373145638425, "grad_norm": 1.9820969104766846, "learning_rate": 3.2450413480072125e-07, "loss": 0.1063, "step": 86950 }, { "epoch": 2.027606307015243, "grad_norm": 1.136049509048462, "learning_rate": 3.244264129826525e-07, "loss": 0.1116, "step": 86960 }, { "epoch": 2.027839468392061, "grad_norm": 1.1420862674713135, "learning_rate": 3.2434869116458367e-07, "loss": 0.1051, "step": 86970 }, { "epoch": 2.0280726297688787, "grad_norm": 1.1752195358276367, "learning_rate": 3.2427096934651493e-07, "loss": 0.1028, "step": 86980 }, { "epoch": 2.0283057911456965, "grad_norm": 2.4693989753723145, "learning_rate": 3.241932475284462e-07, "loss": 0.1115, "step": 86990 }, { "epoch": 2.028538952522515, "grad_norm": 1.027078628540039, "learning_rate": 3.241155257103774e-07, "loss": 0.098, "step": 87000 }, { "epoch": 
2.0287721138993327, "grad_norm": 1.4231175184249878, "learning_rate": 3.240378038923086e-07, "loss": 0.1138, "step": 87010 }, { "epoch": 2.0290052752761505, "grad_norm": 1.6039655208587646, "learning_rate": 3.239600820742399e-07, "loss": 0.1107, "step": 87020 }, { "epoch": 2.0292384366529683, "grad_norm": 3.2212002277374268, "learning_rate": 3.238823602561711e-07, "loss": 0.1043, "step": 87030 }, { "epoch": 2.029471598029786, "grad_norm": 2.127096652984619, "learning_rate": 3.2380463843810235e-07, "loss": 0.1127, "step": 87040 }, { "epoch": 2.0297047594066044, "grad_norm": 1.8460252285003662, "learning_rate": 3.2372691662003355e-07, "loss": 0.1095, "step": 87050 }, { "epoch": 2.0299379207834223, "grad_norm": 1.2555115222930908, "learning_rate": 3.2364919480196476e-07, "loss": 0.1043, "step": 87060 }, { "epoch": 2.03017108216024, "grad_norm": 1.3096492290496826, "learning_rate": 3.23571472983896e-07, "loss": 0.0996, "step": 87070 }, { "epoch": 2.030404243537058, "grad_norm": 1.4100407361984253, "learning_rate": 3.234937511658273e-07, "loss": 0.1157, "step": 87080 }, { "epoch": 2.0306374049138762, "grad_norm": 1.2635730504989624, "learning_rate": 3.2341602934775844e-07, "loss": 0.1042, "step": 87090 }, { "epoch": 2.030870566290694, "grad_norm": 1.2377886772155762, "learning_rate": 3.233383075296897e-07, "loss": 0.1028, "step": 87100 }, { "epoch": 2.031103727667512, "grad_norm": 1.3210489749908447, "learning_rate": 3.2326058571162097e-07, "loss": 0.1077, "step": 87110 }, { "epoch": 2.0313368890443297, "grad_norm": 1.0627968311309814, "learning_rate": 3.2318286389355223e-07, "loss": 0.0927, "step": 87120 }, { "epoch": 2.0315700504211476, "grad_norm": 1.0640861988067627, "learning_rate": 3.231051420754834e-07, "loss": 0.0999, "step": 87130 }, { "epoch": 2.031803211797966, "grad_norm": 1.3378115892410278, "learning_rate": 3.2302742025741465e-07, "loss": 0.1053, "step": 87140 }, { "epoch": 2.0320363731747837, "grad_norm": 2.3998568058013916, "learning_rate": 
3.229496984393459e-07, "loss": 0.1124, "step": 87150 }, { "epoch": 2.0322695345516015, "grad_norm": 1.1091200113296509, "learning_rate": 3.228719766212771e-07, "loss": 0.108, "step": 87160 }, { "epoch": 2.0325026959284194, "grad_norm": 1.2465839385986328, "learning_rate": 3.2279425480320833e-07, "loss": 0.1049, "step": 87170 }, { "epoch": 2.032735857305237, "grad_norm": 2.897016763687134, "learning_rate": 3.227165329851396e-07, "loss": 0.1099, "step": 87180 }, { "epoch": 2.0329690186820555, "grad_norm": 1.3863197565078735, "learning_rate": 3.226388111670708e-07, "loss": 0.097, "step": 87190 }, { "epoch": 2.0332021800588733, "grad_norm": 3.3675179481506348, "learning_rate": 3.2256108934900206e-07, "loss": 0.1064, "step": 87200 }, { "epoch": 2.033435341435691, "grad_norm": 2.353041410446167, "learning_rate": 3.2248336753093327e-07, "loss": 0.1085, "step": 87210 }, { "epoch": 2.033668502812509, "grad_norm": 1.0974727869033813, "learning_rate": 3.224056457128645e-07, "loss": 0.1036, "step": 87220 }, { "epoch": 2.033901664189327, "grad_norm": 3.1549158096313477, "learning_rate": 3.2232792389479574e-07, "loss": 0.1199, "step": 87230 }, { "epoch": 2.034134825566145, "grad_norm": 2.0085885524749756, "learning_rate": 3.22250202076727e-07, "loss": 0.1144, "step": 87240 }, { "epoch": 2.034367986942963, "grad_norm": 1.216330647468567, "learning_rate": 3.2217248025865816e-07, "loss": 0.0922, "step": 87250 }, { "epoch": 2.034601148319781, "grad_norm": 2.0960938930511475, "learning_rate": 3.220947584405894e-07, "loss": 0.0998, "step": 87260 }, { "epoch": 2.0348343096965986, "grad_norm": 2.6578564643859863, "learning_rate": 3.220170366225207e-07, "loss": 0.0967, "step": 87270 }, { "epoch": 2.035067471073417, "grad_norm": 1.3073256015777588, "learning_rate": 3.219393148044519e-07, "loss": 0.0983, "step": 87280 }, { "epoch": 2.0353006324502347, "grad_norm": 2.8497331142425537, "learning_rate": 3.218615929863831e-07, "loss": 0.1061, "step": 87290 }, { "epoch": 2.0355337938270526, 
"grad_norm": 1.9045809507369995, "learning_rate": 3.2178387116831437e-07, "loss": 0.0992, "step": 87300 }, { "epoch": 2.0357669552038704, "grad_norm": 1.4591060876846313, "learning_rate": 3.217061493502456e-07, "loss": 0.0973, "step": 87310 }, { "epoch": 2.0360001165806882, "grad_norm": 1.4123936891555786, "learning_rate": 3.2162842753217684e-07, "loss": 0.1014, "step": 87320 }, { "epoch": 2.0362332779575065, "grad_norm": 1.2252689599990845, "learning_rate": 3.2155070571410805e-07, "loss": 0.1031, "step": 87330 }, { "epoch": 2.0364664393343244, "grad_norm": 2.049736976623535, "learning_rate": 3.2147298389603926e-07, "loss": 0.1032, "step": 87340 }, { "epoch": 2.036699600711142, "grad_norm": 1.8504347801208496, "learning_rate": 3.213952620779705e-07, "loss": 0.0919, "step": 87350 }, { "epoch": 2.03693276208796, "grad_norm": 2.8955392837524414, "learning_rate": 3.213175402599018e-07, "loss": 0.1212, "step": 87360 }, { "epoch": 2.037165923464778, "grad_norm": 1.615769386291504, "learning_rate": 3.2123981844183294e-07, "loss": 0.1109, "step": 87370 }, { "epoch": 2.037399084841596, "grad_norm": 3.227752923965454, "learning_rate": 3.211620966237642e-07, "loss": 0.1145, "step": 87380 }, { "epoch": 2.037632246218414, "grad_norm": 1.440700888633728, "learning_rate": 3.2108437480569546e-07, "loss": 0.098, "step": 87390 }, { "epoch": 2.037865407595232, "grad_norm": 2.52463960647583, "learning_rate": 3.2100665298762667e-07, "loss": 0.1066, "step": 87400 }, { "epoch": 2.0380985689720497, "grad_norm": 3.1895387172698975, "learning_rate": 3.209289311695579e-07, "loss": 0.1117, "step": 87410 }, { "epoch": 2.0383317303488675, "grad_norm": 1.8039116859436035, "learning_rate": 3.2085120935148914e-07, "loss": 0.1017, "step": 87420 }, { "epoch": 2.038564891725686, "grad_norm": 1.620628833770752, "learning_rate": 3.2077348753342035e-07, "loss": 0.098, "step": 87430 }, { "epoch": 2.0387980531025036, "grad_norm": 3.111741065979004, "learning_rate": 3.206957657153516e-07, "loss": 0.0963, 
"step": 87440 }, { "epoch": 2.0390312144793215, "grad_norm": 1.474483609199524, "learning_rate": 3.206180438972828e-07, "loss": 0.0974, "step": 87450 }, { "epoch": 2.0392643758561393, "grad_norm": 2.2794995307922363, "learning_rate": 3.2054032207921403e-07, "loss": 0.1186, "step": 87460 }, { "epoch": 2.0394975372329576, "grad_norm": 1.347537875175476, "learning_rate": 3.204626002611453e-07, "loss": 0.1062, "step": 87470 }, { "epoch": 2.0397306986097754, "grad_norm": 1.1762975454330444, "learning_rate": 3.2038487844307656e-07, "loss": 0.1111, "step": 87480 }, { "epoch": 2.0399638599865932, "grad_norm": 1.9293370246887207, "learning_rate": 3.203071566250077e-07, "loss": 0.0997, "step": 87490 }, { "epoch": 2.040197021363411, "grad_norm": 1.9819600582122803, "learning_rate": 3.20229434806939e-07, "loss": 0.0935, "step": 87500 }, { "epoch": 2.040430182740229, "grad_norm": 2.021986484527588, "learning_rate": 3.2015171298887024e-07, "loss": 0.1059, "step": 87510 }, { "epoch": 2.040663344117047, "grad_norm": 1.4871306419372559, "learning_rate": 3.2007399117080145e-07, "loss": 0.099, "step": 87520 }, { "epoch": 2.040896505493865, "grad_norm": 2.9759624004364014, "learning_rate": 3.1999626935273266e-07, "loss": 0.1124, "step": 87530 }, { "epoch": 2.041129666870683, "grad_norm": 2.763648509979248, "learning_rate": 3.199185475346639e-07, "loss": 0.1143, "step": 87540 }, { "epoch": 2.0413628282475007, "grad_norm": 2.830824136734009, "learning_rate": 3.1984082571659513e-07, "loss": 0.1044, "step": 87550 }, { "epoch": 2.0415959896243185, "grad_norm": 1.6686785221099854, "learning_rate": 3.197631038985264e-07, "loss": 0.0938, "step": 87560 }, { "epoch": 2.041829151001137, "grad_norm": 1.146440863609314, "learning_rate": 3.196853820804576e-07, "loss": 0.0913, "step": 87570 }, { "epoch": 2.0420623123779547, "grad_norm": 2.492893695831299, "learning_rate": 3.1960766026238886e-07, "loss": 0.1048, "step": 87580 }, { "epoch": 2.0422954737547725, "grad_norm": 2.901163339614868, 
"learning_rate": 3.1952993844432007e-07, "loss": 0.0972, "step": 87590 }, { "epoch": 2.0425286351315903, "grad_norm": 1.6820529699325562, "learning_rate": 3.1945221662625133e-07, "loss": 0.094, "step": 87600 }, { "epoch": 2.042761796508408, "grad_norm": 1.4549058675765991, "learning_rate": 3.1937449480818254e-07, "loss": 0.0953, "step": 87610 }, { "epoch": 2.0429949578852264, "grad_norm": 1.4747343063354492, "learning_rate": 3.1929677299011375e-07, "loss": 0.1005, "step": 87620 }, { "epoch": 2.0432281192620443, "grad_norm": 1.196021318435669, "learning_rate": 3.19219051172045e-07, "loss": 0.1057, "step": 87630 }, { "epoch": 2.043461280638862, "grad_norm": 2.243454694747925, "learning_rate": 3.191413293539763e-07, "loss": 0.114, "step": 87640 }, { "epoch": 2.04369444201568, "grad_norm": 1.359390377998352, "learning_rate": 3.1906360753590743e-07, "loss": 0.1034, "step": 87650 }, { "epoch": 2.0439276033924982, "grad_norm": 3.4667599201202393, "learning_rate": 3.189858857178387e-07, "loss": 0.1193, "step": 87660 }, { "epoch": 2.044160764769316, "grad_norm": 1.550918698310852, "learning_rate": 3.1890816389976995e-07, "loss": 0.1058, "step": 87670 }, { "epoch": 2.044393926146134, "grad_norm": 4.418781757354736, "learning_rate": 3.1883044208170116e-07, "loss": 0.1098, "step": 87680 }, { "epoch": 2.0446270875229517, "grad_norm": 1.9851669073104858, "learning_rate": 3.1875272026363237e-07, "loss": 0.1072, "step": 87690 }, { "epoch": 2.0448602488997696, "grad_norm": 1.30459725856781, "learning_rate": 3.1867499844556364e-07, "loss": 0.1011, "step": 87700 }, { "epoch": 2.045093410276588, "grad_norm": 1.6625454425811768, "learning_rate": 3.1859727662749484e-07, "loss": 0.1001, "step": 87710 }, { "epoch": 2.0453265716534057, "grad_norm": 1.4800019264221191, "learning_rate": 3.185195548094261e-07, "loss": 0.1151, "step": 87720 }, { "epoch": 2.0455597330302235, "grad_norm": 3.73917293548584, "learning_rate": 3.184418329913573e-07, "loss": 0.1153, "step": 87730 }, { "epoch": 
2.0457928944070414, "grad_norm": 2.039107322692871, "learning_rate": 3.183641111732885e-07, "loss": 0.1117, "step": 87740 }, { "epoch": 2.046026055783859, "grad_norm": 2.5481183528900146, "learning_rate": 3.182863893552198e-07, "loss": 0.1028, "step": 87750 }, { "epoch": 2.0462592171606775, "grad_norm": 1.3435531854629517, "learning_rate": 3.1820866753715105e-07, "loss": 0.1074, "step": 87760 }, { "epoch": 2.0464923785374953, "grad_norm": 1.8783187866210938, "learning_rate": 3.181309457190822e-07, "loss": 0.1009, "step": 87770 }, { "epoch": 2.046725539914313, "grad_norm": 1.4177197217941284, "learning_rate": 3.1805322390101347e-07, "loss": 0.1026, "step": 87780 }, { "epoch": 2.046958701291131, "grad_norm": 1.1473991870880127, "learning_rate": 3.1797550208294473e-07, "loss": 0.1016, "step": 87790 }, { "epoch": 2.047191862667949, "grad_norm": 1.2080744504928589, "learning_rate": 3.1789778026487594e-07, "loss": 0.1027, "step": 87800 }, { "epoch": 2.047425024044767, "grad_norm": 2.0618791580200195, "learning_rate": 3.1782005844680715e-07, "loss": 0.1131, "step": 87810 }, { "epoch": 2.047658185421585, "grad_norm": 1.1851797103881836, "learning_rate": 3.177423366287384e-07, "loss": 0.1117, "step": 87820 }, { "epoch": 2.047891346798403, "grad_norm": 1.2478293180465698, "learning_rate": 3.176646148106696e-07, "loss": 0.111, "step": 87830 }, { "epoch": 2.0481245081752206, "grad_norm": 2.056211233139038, "learning_rate": 3.175868929926009e-07, "loss": 0.111, "step": 87840 }, { "epoch": 2.048357669552039, "grad_norm": 1.3231501579284668, "learning_rate": 3.175091711745321e-07, "loss": 0.1064, "step": 87850 }, { "epoch": 2.0485908309288567, "grad_norm": 1.5414791107177734, "learning_rate": 3.174314493564633e-07, "loss": 0.095, "step": 87860 }, { "epoch": 2.0488239923056746, "grad_norm": 1.352490782737732, "learning_rate": 3.1735372753839456e-07, "loss": 0.0908, "step": 87870 }, { "epoch": 2.0490571536824924, "grad_norm": 1.4633221626281738, "learning_rate": 
3.172760057203258e-07, "loss": 0.1012, "step": 87880 }, { "epoch": 2.0492903150593103, "grad_norm": 1.278401494026184, "learning_rate": 3.17198283902257e-07, "loss": 0.1072, "step": 87890 }, { "epoch": 2.0495234764361285, "grad_norm": 4.129056930541992, "learning_rate": 3.1712056208418824e-07, "loss": 0.1125, "step": 87900 }, { "epoch": 2.0497566378129464, "grad_norm": 1.4320061206817627, "learning_rate": 3.170428402661195e-07, "loss": 0.1214, "step": 87910 }, { "epoch": 2.049989799189764, "grad_norm": 1.7561551332473755, "learning_rate": 3.169651184480507e-07, "loss": 0.1028, "step": 87920 }, { "epoch": 2.050222960566582, "grad_norm": 1.367938756942749, "learning_rate": 3.168873966299819e-07, "loss": 0.0935, "step": 87930 }, { "epoch": 2.0504561219434, "grad_norm": 2.3876802921295166, "learning_rate": 3.168096748119132e-07, "loss": 0.1068, "step": 87940 }, { "epoch": 2.050689283320218, "grad_norm": 1.1041755676269531, "learning_rate": 3.167319529938444e-07, "loss": 0.1193, "step": 87950 }, { "epoch": 2.050922444697036, "grad_norm": 1.7695820331573486, "learning_rate": 3.1665423117577566e-07, "loss": 0.1007, "step": 87960 }, { "epoch": 2.051155606073854, "grad_norm": 1.8280483484268188, "learning_rate": 3.1657650935770687e-07, "loss": 0.102, "step": 87970 }, { "epoch": 2.0513887674506717, "grad_norm": 2.0162577629089355, "learning_rate": 3.164987875396381e-07, "loss": 0.1052, "step": 87980 }, { "epoch": 2.0516219288274895, "grad_norm": 2.6623988151550293, "learning_rate": 3.1642106572156934e-07, "loss": 0.099, "step": 87990 }, { "epoch": 2.051855090204308, "grad_norm": 1.66448175907135, "learning_rate": 3.163433439035006e-07, "loss": 0.1073, "step": 88000 }, { "epoch": 2.0520882515811256, "grad_norm": 1.9383721351623535, "learning_rate": 3.1626562208543176e-07, "loss": 0.1234, "step": 88010 }, { "epoch": 2.0523214129579435, "grad_norm": 2.0021169185638428, "learning_rate": 3.16187900267363e-07, "loss": 0.0983, "step": 88020 }, { "epoch": 2.0525545743347613, 
"grad_norm": 1.4301872253417969, "learning_rate": 3.161101784492943e-07, "loss": 0.1033, "step": 88030 }, { "epoch": 2.0527877357115796, "grad_norm": 1.8827078342437744, "learning_rate": 3.1603245663122554e-07, "loss": 0.1026, "step": 88040 }, { "epoch": 2.0530208970883974, "grad_norm": 1.2662886381149292, "learning_rate": 3.159547348131567e-07, "loss": 0.1068, "step": 88050 }, { "epoch": 2.0532540584652152, "grad_norm": 1.7623318433761597, "learning_rate": 3.1587701299508796e-07, "loss": 0.1126, "step": 88060 }, { "epoch": 2.053487219842033, "grad_norm": 1.7117528915405273, "learning_rate": 3.157992911770192e-07, "loss": 0.1149, "step": 88070 }, { "epoch": 2.053720381218851, "grad_norm": 2.479910135269165, "learning_rate": 3.1572156935895043e-07, "loss": 0.1086, "step": 88080 }, { "epoch": 2.053953542595669, "grad_norm": 1.299502968788147, "learning_rate": 3.1564384754088164e-07, "loss": 0.0896, "step": 88090 }, { "epoch": 2.054186703972487, "grad_norm": 1.164758324623108, "learning_rate": 3.155661257228129e-07, "loss": 0.109, "step": 88100 }, { "epoch": 2.054419865349305, "grad_norm": 1.1025205850601196, "learning_rate": 3.154884039047441e-07, "loss": 0.1163, "step": 88110 }, { "epoch": 2.0546530267261227, "grad_norm": 1.2262537479400635, "learning_rate": 3.154106820866754e-07, "loss": 0.0944, "step": 88120 }, { "epoch": 2.0548861881029405, "grad_norm": 1.6534944772720337, "learning_rate": 3.153329602686066e-07, "loss": 0.1081, "step": 88130 }, { "epoch": 2.055119349479759, "grad_norm": 1.7830262184143066, "learning_rate": 3.152552384505378e-07, "loss": 0.1106, "step": 88140 }, { "epoch": 2.0553525108565767, "grad_norm": 1.3108782768249512, "learning_rate": 3.1517751663246906e-07, "loss": 0.0918, "step": 88150 }, { "epoch": 2.0555856722333945, "grad_norm": 1.7316933870315552, "learning_rate": 3.150997948144003e-07, "loss": 0.0976, "step": 88160 }, { "epoch": 2.0558188336102123, "grad_norm": 2.2447006702423096, "learning_rate": 3.150220729963315e-07, "loss": 
0.1093, "step": 88170 }, { "epoch": 2.0560519949870306, "grad_norm": 1.916540503501892, "learning_rate": 3.1494435117826274e-07, "loss": 0.0982, "step": 88180 }, { "epoch": 2.0562851563638485, "grad_norm": 2.300387144088745, "learning_rate": 3.14866629360194e-07, "loss": 0.1122, "step": 88190 }, { "epoch": 2.0565183177406663, "grad_norm": 1.2753055095672607, "learning_rate": 3.147889075421252e-07, "loss": 0.1082, "step": 88200 }, { "epoch": 2.056751479117484, "grad_norm": 1.2800120115280151, "learning_rate": 3.147111857240564e-07, "loss": 0.1134, "step": 88210 }, { "epoch": 2.056984640494302, "grad_norm": 1.6353247165679932, "learning_rate": 3.146334639059877e-07, "loss": 0.1009, "step": 88220 }, { "epoch": 2.0572178018711202, "grad_norm": 2.752708673477173, "learning_rate": 3.145557420879189e-07, "loss": 0.1083, "step": 88230 }, { "epoch": 2.057450963247938, "grad_norm": 2.219677686691284, "learning_rate": 3.1447802026985015e-07, "loss": 0.1074, "step": 88240 }, { "epoch": 2.057684124624756, "grad_norm": 1.9525461196899414, "learning_rate": 3.1440029845178136e-07, "loss": 0.1081, "step": 88250 }, { "epoch": 2.0579172860015738, "grad_norm": 1.317163348197937, "learning_rate": 3.1432257663371257e-07, "loss": 0.0987, "step": 88260 }, { "epoch": 2.0581504473783916, "grad_norm": 1.9381887912750244, "learning_rate": 3.1424485481564383e-07, "loss": 0.1018, "step": 88270 }, { "epoch": 2.05838360875521, "grad_norm": 2.2456459999084473, "learning_rate": 3.141671329975751e-07, "loss": 0.1014, "step": 88280 }, { "epoch": 2.0586167701320277, "grad_norm": 1.3305177688598633, "learning_rate": 3.1408941117950625e-07, "loss": 0.1145, "step": 88290 }, { "epoch": 2.0588499315088455, "grad_norm": 1.3336331844329834, "learning_rate": 3.140116893614375e-07, "loss": 0.1008, "step": 88300 }, { "epoch": 2.0590830928856634, "grad_norm": 1.6380366086959839, "learning_rate": 3.139339675433688e-07, "loss": 0.101, "step": 88310 }, { "epoch": 2.059316254262481, "grad_norm": 3.3727104663848877, 
"learning_rate": 3.138562457253e-07, "loss": 0.1104, "step": 88320 }, { "epoch": 2.0595494156392995, "grad_norm": 2.5927064418792725, "learning_rate": 3.137785239072312e-07, "loss": 0.106, "step": 88330 }, { "epoch": 2.0597825770161173, "grad_norm": 1.3462852239608765, "learning_rate": 3.1370080208916245e-07, "loss": 0.1078, "step": 88340 }, { "epoch": 2.060015738392935, "grad_norm": 1.1547356843948364, "learning_rate": 3.1362308027109366e-07, "loss": 0.1074, "step": 88350 }, { "epoch": 2.060248899769753, "grad_norm": 1.4831645488739014, "learning_rate": 3.135453584530249e-07, "loss": 0.0888, "step": 88360 }, { "epoch": 2.0604820611465713, "grad_norm": 2.0898797512054443, "learning_rate": 3.1346763663495613e-07, "loss": 0.0992, "step": 88370 }, { "epoch": 2.060715222523389, "grad_norm": 2.1085152626037598, "learning_rate": 3.1338991481688734e-07, "loss": 0.0996, "step": 88380 }, { "epoch": 2.060948383900207, "grad_norm": 1.2587223052978516, "learning_rate": 3.133121929988186e-07, "loss": 0.1168, "step": 88390 }, { "epoch": 2.061181545277025, "grad_norm": 1.543247938156128, "learning_rate": 3.1323447118074987e-07, "loss": 0.1019, "step": 88400 }, { "epoch": 2.0614147066538426, "grad_norm": 1.576136589050293, "learning_rate": 3.1315674936268113e-07, "loss": 0.1083, "step": 88410 }, { "epoch": 2.061647868030661, "grad_norm": 1.278022050857544, "learning_rate": 3.130790275446123e-07, "loss": 0.0994, "step": 88420 }, { "epoch": 2.0618810294074787, "grad_norm": 1.5034770965576172, "learning_rate": 3.1300130572654355e-07, "loss": 0.1028, "step": 88430 }, { "epoch": 2.0621141907842966, "grad_norm": 1.088557481765747, "learning_rate": 3.129235839084748e-07, "loss": 0.1005, "step": 88440 }, { "epoch": 2.0623473521611144, "grad_norm": 2.7585344314575195, "learning_rate": 3.12845862090406e-07, "loss": 0.1077, "step": 88450 }, { "epoch": 2.0625805135379323, "grad_norm": 1.8933953046798706, "learning_rate": 3.1276814027233723e-07, "loss": 0.1115, "step": 88460 }, { "epoch": 
2.0628136749147505, "grad_norm": 1.0326001644134521, "learning_rate": 3.126904184542685e-07, "loss": 0.109, "step": 88470 }, { "epoch": 2.0630468362915684, "grad_norm": 1.6773476600646973, "learning_rate": 3.126126966361997e-07, "loss": 0.1091, "step": 88480 }, { "epoch": 2.063279997668386, "grad_norm": 1.391352891921997, "learning_rate": 3.1253497481813096e-07, "loss": 0.1129, "step": 88490 }, { "epoch": 2.063513159045204, "grad_norm": 1.8094602823257446, "learning_rate": 3.1245725300006217e-07, "loss": 0.0988, "step": 88500 }, { "epoch": 2.063746320422022, "grad_norm": 1.821581244468689, "learning_rate": 3.123795311819934e-07, "loss": 0.1049, "step": 88510 }, { "epoch": 2.06397948179884, "grad_norm": 1.4094434976577759, "learning_rate": 3.1230180936392464e-07, "loss": 0.1088, "step": 88520 }, { "epoch": 2.064212643175658, "grad_norm": 1.7976458072662354, "learning_rate": 3.122240875458559e-07, "loss": 0.1107, "step": 88530 }, { "epoch": 2.064445804552476, "grad_norm": 2.1590681076049805, "learning_rate": 3.1214636572778706e-07, "loss": 0.1019, "step": 88540 }, { "epoch": 2.0646789659292937, "grad_norm": 2.608246326446533, "learning_rate": 3.120686439097183e-07, "loss": 0.1034, "step": 88550 }, { "epoch": 2.064912127306112, "grad_norm": 1.1269338130950928, "learning_rate": 3.119909220916496e-07, "loss": 0.0943, "step": 88560 }, { "epoch": 2.06514528868293, "grad_norm": 1.2070918083190918, "learning_rate": 3.119132002735808e-07, "loss": 0.097, "step": 88570 }, { "epoch": 2.0653784500597476, "grad_norm": 1.0932546854019165, "learning_rate": 3.11835478455512e-07, "loss": 0.113, "step": 88580 }, { "epoch": 2.0656116114365655, "grad_norm": 1.2125422954559326, "learning_rate": 3.1175775663744327e-07, "loss": 0.1056, "step": 88590 }, { "epoch": 2.0658447728133833, "grad_norm": 4.013375282287598, "learning_rate": 3.116800348193745e-07, "loss": 0.1028, "step": 88600 }, { "epoch": 2.0660779341902016, "grad_norm": 1.1967737674713135, "learning_rate": 3.1160231300130574e-07, 
"loss": 0.1077, "step": 88610 }, { "epoch": 2.0663110955670194, "grad_norm": 1.1645381450653076, "learning_rate": 3.1152459118323695e-07, "loss": 0.1043, "step": 88620 }, { "epoch": 2.0665442569438373, "grad_norm": 1.871955394744873, "learning_rate": 3.1144686936516816e-07, "loss": 0.1013, "step": 88630 }, { "epoch": 2.066777418320655, "grad_norm": 3.0294315814971924, "learning_rate": 3.113691475470994e-07, "loss": 0.1096, "step": 88640 }, { "epoch": 2.067010579697473, "grad_norm": 2.4200732707977295, "learning_rate": 3.112914257290307e-07, "loss": 0.1066, "step": 88650 }, { "epoch": 2.067243741074291, "grad_norm": 1.587921380996704, "learning_rate": 3.1121370391096184e-07, "loss": 0.1029, "step": 88660 }, { "epoch": 2.067476902451109, "grad_norm": 1.799625039100647, "learning_rate": 3.111359820928931e-07, "loss": 0.1054, "step": 88670 }, { "epoch": 2.067710063827927, "grad_norm": 1.3619191646575928, "learning_rate": 3.1105826027482436e-07, "loss": 0.1115, "step": 88680 }, { "epoch": 2.0679432252047447, "grad_norm": 1.8296732902526855, "learning_rate": 3.1098053845675557e-07, "loss": 0.108, "step": 88690 }, { "epoch": 2.0681763865815626, "grad_norm": 2.653916835784912, "learning_rate": 3.109028166386868e-07, "loss": 0.1121, "step": 88700 }, { "epoch": 2.068409547958381, "grad_norm": 1.5499927997589111, "learning_rate": 3.1082509482061804e-07, "loss": 0.1017, "step": 88710 }, { "epoch": 2.0686427093351987, "grad_norm": 3.356346368789673, "learning_rate": 3.1074737300254925e-07, "loss": 0.0966, "step": 88720 }, { "epoch": 2.0688758707120165, "grad_norm": 1.5380396842956543, "learning_rate": 3.106696511844805e-07, "loss": 0.106, "step": 88730 }, { "epoch": 2.0691090320888343, "grad_norm": 4.450819492340088, "learning_rate": 3.105919293664117e-07, "loss": 0.1075, "step": 88740 }, { "epoch": 2.0693421934656526, "grad_norm": 1.6316378116607666, "learning_rate": 3.1051420754834293e-07, "loss": 0.1096, "step": 88750 }, { "epoch": 2.0695753548424705, "grad_norm": 
2.6127655506134033, "learning_rate": 3.104364857302742e-07, "loss": 0.1043, "step": 88760 }, { "epoch": 2.0698085162192883, "grad_norm": 2.579206705093384, "learning_rate": 3.1035876391220546e-07, "loss": 0.0857, "step": 88770 }, { "epoch": 2.070041677596106, "grad_norm": 1.0434799194335938, "learning_rate": 3.102810420941366e-07, "loss": 0.1121, "step": 88780 }, { "epoch": 2.070274838972924, "grad_norm": 3.5988385677337646, "learning_rate": 3.102033202760679e-07, "loss": 0.0971, "step": 88790 }, { "epoch": 2.0705080003497423, "grad_norm": 1.6577088832855225, "learning_rate": 3.1012559845799914e-07, "loss": 0.1141, "step": 88800 }, { "epoch": 2.07074116172656, "grad_norm": 1.2583017349243164, "learning_rate": 3.100478766399304e-07, "loss": 0.1018, "step": 88810 }, { "epoch": 2.070974323103378, "grad_norm": 1.6769933700561523, "learning_rate": 3.0997015482186156e-07, "loss": 0.0994, "step": 88820 }, { "epoch": 2.0712074844801958, "grad_norm": 2.100327253341675, "learning_rate": 3.098924330037928e-07, "loss": 0.1081, "step": 88830 }, { "epoch": 2.0714406458570136, "grad_norm": 1.6482069492340088, "learning_rate": 3.098147111857241e-07, "loss": 0.1102, "step": 88840 }, { "epoch": 2.071673807233832, "grad_norm": 1.4046121835708618, "learning_rate": 3.097369893676553e-07, "loss": 0.1032, "step": 88850 }, { "epoch": 2.0719069686106497, "grad_norm": 1.437243103981018, "learning_rate": 3.096592675495865e-07, "loss": 0.1121, "step": 88860 }, { "epoch": 2.0721401299874675, "grad_norm": 1.3992341756820679, "learning_rate": 3.0958154573151776e-07, "loss": 0.0981, "step": 88870 }, { "epoch": 2.0723732913642854, "grad_norm": 1.76912522315979, "learning_rate": 3.0950382391344897e-07, "loss": 0.0998, "step": 88880 }, { "epoch": 2.0726064527411032, "grad_norm": 1.261281967163086, "learning_rate": 3.0942610209538023e-07, "loss": 0.0979, "step": 88890 }, { "epoch": 2.0728396141179215, "grad_norm": 2.0883138179779053, "learning_rate": 3.0934838027731144e-07, "loss": 0.0973, "step": 
88900 }, { "epoch": 2.0730727754947393, "grad_norm": 1.605448842048645, "learning_rate": 3.0927065845924265e-07, "loss": 0.1065, "step": 88910 }, { "epoch": 2.073305936871557, "grad_norm": 1.3937733173370361, "learning_rate": 3.091929366411739e-07, "loss": 0.1066, "step": 88920 }, { "epoch": 2.073539098248375, "grad_norm": 1.4545083045959473, "learning_rate": 3.091152148231052e-07, "loss": 0.1045, "step": 88930 }, { "epoch": 2.0737722596251933, "grad_norm": 1.0756431818008423, "learning_rate": 3.0903749300503633e-07, "loss": 0.0951, "step": 88940 }, { "epoch": 2.074005421002011, "grad_norm": 2.3245737552642822, "learning_rate": 3.089597711869676e-07, "loss": 0.1063, "step": 88950 }, { "epoch": 2.074238582378829, "grad_norm": 1.3906419277191162, "learning_rate": 3.0888204936889885e-07, "loss": 0.1041, "step": 88960 }, { "epoch": 2.074471743755647, "grad_norm": 2.749196767807007, "learning_rate": 3.0880432755083006e-07, "loss": 0.1067, "step": 88970 }, { "epoch": 2.0747049051324646, "grad_norm": 3.6591577529907227, "learning_rate": 3.0872660573276127e-07, "loss": 0.1111, "step": 88980 }, { "epoch": 2.074938066509283, "grad_norm": 1.5724701881408691, "learning_rate": 3.0864888391469254e-07, "loss": 0.1071, "step": 88990 }, { "epoch": 2.0751712278861008, "grad_norm": 1.1742439270019531, "learning_rate": 3.0857116209662374e-07, "loss": 0.1016, "step": 89000 }, { "epoch": 2.0754043892629186, "grad_norm": 1.5427743196487427, "learning_rate": 3.08493440278555e-07, "loss": 0.1052, "step": 89010 }, { "epoch": 2.0756375506397364, "grad_norm": 1.455855369567871, "learning_rate": 3.084157184604862e-07, "loss": 0.0904, "step": 89020 }, { "epoch": 2.0758707120165543, "grad_norm": 2.07336163520813, "learning_rate": 3.083379966424174e-07, "loss": 0.0967, "step": 89030 }, { "epoch": 2.0761038733933725, "grad_norm": 2.4702839851379395, "learning_rate": 3.082602748243487e-07, "loss": 0.0905, "step": 89040 }, { "epoch": 2.0763370347701904, "grad_norm": 1.376688838005066, 
"learning_rate": 3.0818255300627995e-07, "loss": 0.0921, "step": 89050 }, { "epoch": 2.076570196147008, "grad_norm": 2.83034086227417, "learning_rate": 3.081048311882111e-07, "loss": 0.1013, "step": 89060 }, { "epoch": 2.076803357523826, "grad_norm": 1.0929840803146362, "learning_rate": 3.0802710937014237e-07, "loss": 0.0928, "step": 89070 }, { "epoch": 2.077036518900644, "grad_norm": 1.5319044589996338, "learning_rate": 3.0794938755207363e-07, "loss": 0.1092, "step": 89080 }, { "epoch": 2.077269680277462, "grad_norm": 1.2343462705612183, "learning_rate": 3.0787166573400484e-07, "loss": 0.0975, "step": 89090 }, { "epoch": 2.07750284165428, "grad_norm": 1.1998238563537598, "learning_rate": 3.0779394391593605e-07, "loss": 0.0881, "step": 89100 }, { "epoch": 2.077736003031098, "grad_norm": 1.7045215368270874, "learning_rate": 3.077162220978673e-07, "loss": 0.0997, "step": 89110 }, { "epoch": 2.0779691644079157, "grad_norm": 1.6829819679260254, "learning_rate": 3.076385002797985e-07, "loss": 0.1123, "step": 89120 }, { "epoch": 2.078202325784734, "grad_norm": 1.8969942331314087, "learning_rate": 3.075607784617298e-07, "loss": 0.1047, "step": 89130 }, { "epoch": 2.078435487161552, "grad_norm": 1.6500228643417358, "learning_rate": 3.07483056643661e-07, "loss": 0.1036, "step": 89140 }, { "epoch": 2.0786686485383696, "grad_norm": 2.524833917617798, "learning_rate": 3.074053348255922e-07, "loss": 0.1025, "step": 89150 }, { "epoch": 2.0789018099151875, "grad_norm": 2.406586170196533, "learning_rate": 3.0732761300752346e-07, "loss": 0.1076, "step": 89160 }, { "epoch": 2.0791349712920053, "grad_norm": 1.6528692245483398, "learning_rate": 3.072498911894547e-07, "loss": 0.1122, "step": 89170 }, { "epoch": 2.0793681326688236, "grad_norm": 2.1976568698883057, "learning_rate": 3.071721693713859e-07, "loss": 0.1006, "step": 89180 }, { "epoch": 2.0796012940456414, "grad_norm": 2.142672300338745, "learning_rate": 3.0709444755331714e-07, "loss": 0.1136, "step": 89190 }, { "epoch": 
2.0798344554224593, "grad_norm": 0.956517219543457, "learning_rate": 3.070167257352484e-07, "loss": 0.1095, "step": 89200 }, { "epoch": 2.080067616799277, "grad_norm": 1.2304487228393555, "learning_rate": 3.069390039171796e-07, "loss": 0.0957, "step": 89210 }, { "epoch": 2.080300778176095, "grad_norm": 1.9331417083740234, "learning_rate": 3.068612820991108e-07, "loss": 0.1075, "step": 89220 }, { "epoch": 2.080533939552913, "grad_norm": 2.369215965270996, "learning_rate": 3.067835602810421e-07, "loss": 0.1174, "step": 89230 }, { "epoch": 2.080767100929731, "grad_norm": 1.7159875631332397, "learning_rate": 3.067058384629733e-07, "loss": 0.1074, "step": 89240 }, { "epoch": 2.081000262306549, "grad_norm": 3.505229949951172, "learning_rate": 3.0662811664490456e-07, "loss": 0.0973, "step": 89250 }, { "epoch": 2.0812334236833667, "grad_norm": 1.3070406913757324, "learning_rate": 3.0655039482683577e-07, "loss": 0.0967, "step": 89260 }, { "epoch": 2.081466585060185, "grad_norm": 1.1523772478103638, "learning_rate": 3.0647267300876703e-07, "loss": 0.1033, "step": 89270 }, { "epoch": 2.081699746437003, "grad_norm": 1.3308054208755493, "learning_rate": 3.0639495119069824e-07, "loss": 0.1212, "step": 89280 }, { "epoch": 2.0819329078138207, "grad_norm": 1.8026151657104492, "learning_rate": 3.063172293726295e-07, "loss": 0.0998, "step": 89290 }, { "epoch": 2.0821660691906385, "grad_norm": 2.2762601375579834, "learning_rate": 3.062395075545607e-07, "loss": 0.1182, "step": 89300 }, { "epoch": 2.0823992305674563, "grad_norm": 3.029355764389038, "learning_rate": 3.061617857364919e-07, "loss": 0.1023, "step": 89310 }, { "epoch": 2.0826323919442746, "grad_norm": 1.5711514949798584, "learning_rate": 3.060840639184232e-07, "loss": 0.1063, "step": 89320 }, { "epoch": 2.0828655533210925, "grad_norm": 1.659615397453308, "learning_rate": 3.0600634210035444e-07, "loss": 0.0998, "step": 89330 }, { "epoch": 2.0830987146979103, "grad_norm": 1.0822041034698486, "learning_rate": 
3.059286202822856e-07, "loss": 0.0965, "step": 89340 }, { "epoch": 2.083331876074728, "grad_norm": 1.5536963939666748, "learning_rate": 3.0585089846421686e-07, "loss": 0.103, "step": 89350 }, { "epoch": 2.083565037451546, "grad_norm": 1.918137788772583, "learning_rate": 3.057731766461481e-07, "loss": 0.1042, "step": 89360 }, { "epoch": 2.0837981988283643, "grad_norm": 1.7452470064163208, "learning_rate": 3.0569545482807933e-07, "loss": 0.1054, "step": 89370 }, { "epoch": 2.084031360205182, "grad_norm": 1.652292013168335, "learning_rate": 3.0561773301001054e-07, "loss": 0.1047, "step": 89380 }, { "epoch": 2.084264521582, "grad_norm": 1.8174362182617188, "learning_rate": 3.055400111919418e-07, "loss": 0.12, "step": 89390 }, { "epoch": 2.0844976829588178, "grad_norm": 1.2507646083831787, "learning_rate": 3.05462289373873e-07, "loss": 0.1054, "step": 89400 }, { "epoch": 2.0847308443356356, "grad_norm": 1.1505106687545776, "learning_rate": 3.053845675558043e-07, "loss": 0.1065, "step": 89410 }, { "epoch": 2.084964005712454, "grad_norm": 2.490539312362671, "learning_rate": 3.053068457377355e-07, "loss": 0.1026, "step": 89420 }, { "epoch": 2.0851971670892717, "grad_norm": 3.2908921241760254, "learning_rate": 3.052291239196667e-07, "loss": 0.1137, "step": 89430 }, { "epoch": 2.0854303284660896, "grad_norm": 2.232793092727661, "learning_rate": 3.0515140210159796e-07, "loss": 0.0994, "step": 89440 }, { "epoch": 2.0856634898429074, "grad_norm": 0.9848642349243164, "learning_rate": 3.050736802835292e-07, "loss": 0.1062, "step": 89450 }, { "epoch": 2.0858966512197252, "grad_norm": 1.6334419250488281, "learning_rate": 3.049959584654604e-07, "loss": 0.0973, "step": 89460 }, { "epoch": 2.0861298125965435, "grad_norm": 1.0044299364089966, "learning_rate": 3.0491823664739164e-07, "loss": 0.0953, "step": 89470 }, { "epoch": 2.0863629739733613, "grad_norm": 1.3816146850585938, "learning_rate": 3.048405148293229e-07, "loss": 0.1023, "step": 89480 }, { "epoch": 2.086596135350179, 
"grad_norm": 1.6824185848236084, "learning_rate": 3.047627930112541e-07, "loss": 0.1003, "step": 89490 }, { "epoch": 2.086829296726997, "grad_norm": 3.0350725650787354, "learning_rate": 3.046850711931853e-07, "loss": 0.1015, "step": 89500 }, { "epoch": 2.0870624581038153, "grad_norm": 1.5436625480651855, "learning_rate": 3.046073493751166e-07, "loss": 0.104, "step": 89510 }, { "epoch": 2.087295619480633, "grad_norm": 1.4430499076843262, "learning_rate": 3.045296275570478e-07, "loss": 0.1078, "step": 89520 }, { "epoch": 2.087528780857451, "grad_norm": 2.338787794113159, "learning_rate": 3.0445190573897905e-07, "loss": 0.108, "step": 89530 }, { "epoch": 2.087761942234269, "grad_norm": 1.6757025718688965, "learning_rate": 3.0437418392091026e-07, "loss": 0.0926, "step": 89540 }, { "epoch": 2.0879951036110866, "grad_norm": 2.82121205329895, "learning_rate": 3.0429646210284147e-07, "loss": 0.0982, "step": 89550 }, { "epoch": 2.088228264987905, "grad_norm": 1.3381214141845703, "learning_rate": 3.0421874028477273e-07, "loss": 0.0964, "step": 89560 }, { "epoch": 2.0884614263647228, "grad_norm": 1.793511986732483, "learning_rate": 3.04141018466704e-07, "loss": 0.0915, "step": 89570 }, { "epoch": 2.0886945877415406, "grad_norm": 1.1475518941879272, "learning_rate": 3.0406329664863515e-07, "loss": 0.1052, "step": 89580 }, { "epoch": 2.0889277491183584, "grad_norm": 2.9258224964141846, "learning_rate": 3.039855748305664e-07, "loss": 0.1085, "step": 89590 }, { "epoch": 2.0891609104951763, "grad_norm": 1.6678074598312378, "learning_rate": 3.0390785301249767e-07, "loss": 0.0997, "step": 89600 }, { "epoch": 2.0893940718719946, "grad_norm": 1.4583275318145752, "learning_rate": 3.038301311944289e-07, "loss": 0.1058, "step": 89610 }, { "epoch": 2.0896272332488124, "grad_norm": 2.023381471633911, "learning_rate": 3.037524093763601e-07, "loss": 0.1068, "step": 89620 }, { "epoch": 2.0898603946256302, "grad_norm": 1.8668700456619263, "learning_rate": 3.0367468755829135e-07, "loss": 
0.1059, "step": 89630 }, { "epoch": 2.090093556002448, "grad_norm": 1.8487629890441895, "learning_rate": 3.0359696574022256e-07, "loss": 0.103, "step": 89640 }, { "epoch": 2.0903267173792663, "grad_norm": 1.90544855594635, "learning_rate": 3.035192439221538e-07, "loss": 0.1028, "step": 89650 }, { "epoch": 2.090559878756084, "grad_norm": 1.8843133449554443, "learning_rate": 3.0344152210408503e-07, "loss": 0.1031, "step": 89660 }, { "epoch": 2.090793040132902, "grad_norm": 1.5280060768127441, "learning_rate": 3.0336380028601624e-07, "loss": 0.102, "step": 89670 }, { "epoch": 2.09102620150972, "grad_norm": 1.6377954483032227, "learning_rate": 3.032860784679475e-07, "loss": 0.1157, "step": 89680 }, { "epoch": 2.0912593628865377, "grad_norm": 1.666441559791565, "learning_rate": 3.0320835664987877e-07, "loss": 0.098, "step": 89690 }, { "epoch": 2.091492524263356, "grad_norm": 5.008193016052246, "learning_rate": 3.031306348318099e-07, "loss": 0.1075, "step": 89700 }, { "epoch": 2.091725685640174, "grad_norm": 0.9819686412811279, "learning_rate": 3.030529130137412e-07, "loss": 0.1038, "step": 89710 }, { "epoch": 2.0919588470169916, "grad_norm": 1.6048942804336548, "learning_rate": 3.0297519119567245e-07, "loss": 0.1114, "step": 89720 }, { "epoch": 2.0921920083938095, "grad_norm": 1.570779800415039, "learning_rate": 3.028974693776037e-07, "loss": 0.0977, "step": 89730 }, { "epoch": 2.0924251697706273, "grad_norm": 2.098839521408081, "learning_rate": 3.0281974755953487e-07, "loss": 0.1076, "step": 89740 }, { "epoch": 2.0926583311474456, "grad_norm": 1.7394226789474487, "learning_rate": 3.0274202574146613e-07, "loss": 0.1024, "step": 89750 }, { "epoch": 2.0928914925242634, "grad_norm": 1.5317641496658325, "learning_rate": 3.026643039233974e-07, "loss": 0.1, "step": 89760 }, { "epoch": 2.0931246539010813, "grad_norm": 1.7361515760421753, "learning_rate": 3.025865821053286e-07, "loss": 0.0966, "step": 89770 }, { "epoch": 2.093357815277899, "grad_norm": 1.276424765586853, 
"learning_rate": 3.025088602872598e-07, "loss": 0.1093, "step": 89780 }, { "epoch": 2.093590976654717, "grad_norm": 3.7633910179138184, "learning_rate": 3.0243113846919107e-07, "loss": 0.0995, "step": 89790 }, { "epoch": 2.093824138031535, "grad_norm": 1.1329536437988281, "learning_rate": 3.023534166511223e-07, "loss": 0.101, "step": 89800 }, { "epoch": 2.094057299408353, "grad_norm": 2.639923572540283, "learning_rate": 3.0227569483305354e-07, "loss": 0.0971, "step": 89810 }, { "epoch": 2.094290460785171, "grad_norm": 2.970797300338745, "learning_rate": 3.0219797301498475e-07, "loss": 0.1008, "step": 89820 }, { "epoch": 2.0945236221619887, "grad_norm": 1.2564172744750977, "learning_rate": 3.0212025119691596e-07, "loss": 0.1047, "step": 89830 }, { "epoch": 2.094756783538807, "grad_norm": 1.7306839227676392, "learning_rate": 3.020425293788472e-07, "loss": 0.1033, "step": 89840 }, { "epoch": 2.094989944915625, "grad_norm": 2.090402126312256, "learning_rate": 3.019648075607785e-07, "loss": 0.1011, "step": 89850 }, { "epoch": 2.0952231062924427, "grad_norm": 1.5813919305801392, "learning_rate": 3.0188708574270964e-07, "loss": 0.1042, "step": 89860 }, { "epoch": 2.0954562676692605, "grad_norm": 1.4556118249893188, "learning_rate": 3.018093639246409e-07, "loss": 0.1049, "step": 89870 }, { "epoch": 2.0956894290460784, "grad_norm": 1.5106465816497803, "learning_rate": 3.0173164210657217e-07, "loss": 0.0973, "step": 89880 }, { "epoch": 2.0959225904228966, "grad_norm": 2.0035953521728516, "learning_rate": 3.016539202885034e-07, "loss": 0.0995, "step": 89890 }, { "epoch": 2.0961557517997145, "grad_norm": 1.6575884819030762, "learning_rate": 3.015761984704346e-07, "loss": 0.1048, "step": 89900 }, { "epoch": 2.0963889131765323, "grad_norm": 1.2088392972946167, "learning_rate": 3.0149847665236585e-07, "loss": 0.1037, "step": 89910 }, { "epoch": 2.09662207455335, "grad_norm": 2.101712226867676, "learning_rate": 3.0142075483429706e-07, "loss": 0.1057, "step": 89920 }, { "epoch": 
2.096855235930168, "grad_norm": 1.1950706243515015, "learning_rate": 3.013430330162283e-07, "loss": 0.0915, "step": 89930 }, { "epoch": 2.0970883973069863, "grad_norm": 1.362237572669983, "learning_rate": 3.0126531119815953e-07, "loss": 0.1143, "step": 89940 }, { "epoch": 2.097321558683804, "grad_norm": 3.4581403732299805, "learning_rate": 3.0118758938009074e-07, "loss": 0.1035, "step": 89950 }, { "epoch": 2.097554720060622, "grad_norm": 1.212756633758545, "learning_rate": 3.01109867562022e-07, "loss": 0.0983, "step": 89960 }, { "epoch": 2.0977878814374398, "grad_norm": 2.14471173286438, "learning_rate": 3.0103214574395326e-07, "loss": 0.0984, "step": 89970 }, { "epoch": 2.0980210428142576, "grad_norm": 1.9480311870574951, "learning_rate": 3.009544239258844e-07, "loss": 0.1061, "step": 89980 }, { "epoch": 2.098254204191076, "grad_norm": 2.110234498977661, "learning_rate": 3.008767021078157e-07, "loss": 0.1075, "step": 89990 }, { "epoch": 2.0984873655678937, "grad_norm": 1.976959466934204, "learning_rate": 3.0079898028974694e-07, "loss": 0.1121, "step": 90000 }, { "epoch": 2.0984873655678937, "eval_accuracy": 0.9474462790135951, "eval_f1": 0.9624354370670821, "eval_loss": 0.13703207671642303, "eval_runtime": 5852.1193, "eval_samples_per_second": 312.693, "eval_steps_per_second": 39.087, "step": 90000 }, { "epoch": 2.0987205269447116, "grad_norm": 1.2056165933609009, "learning_rate": 3.0072125847167815e-07, "loss": 0.1149, "step": 90010 }, { "epoch": 2.0989536883215294, "grad_norm": 1.5346128940582275, "learning_rate": 3.0064353665360936e-07, "loss": 0.1022, "step": 90020 }, { "epoch": 2.0991868496983477, "grad_norm": 2.0860726833343506, "learning_rate": 3.005735870173475e-07, "loss": 0.1056, "step": 90030 }, { "epoch": 2.0994200110751655, "grad_norm": 1.1674572229385376, "learning_rate": 3.004958651992787e-07, "loss": 0.099, "step": 90040 }, { "epoch": 2.0996531724519834, "grad_norm": 1.3226288557052612, "learning_rate": 3.0041814338120996e-07, "loss": 0.1049, 
"step": 90050 }, { "epoch": 2.099886333828801, "grad_norm": 1.1717498302459717, "learning_rate": 3.003404215631412e-07, "loss": 0.1057, "step": 90060 }, { "epoch": 2.100119495205619, "grad_norm": 1.1541348695755005, "learning_rate": 3.002626997450724e-07, "loss": 0.1205, "step": 90070 }, { "epoch": 2.1003526565824373, "grad_norm": 1.3739267587661743, "learning_rate": 3.0018497792700364e-07, "loss": 0.1099, "step": 90080 }, { "epoch": 2.100585817959255, "grad_norm": 1.618378758430481, "learning_rate": 3.001072561089349e-07, "loss": 0.1226, "step": 90090 }, { "epoch": 2.100818979336073, "grad_norm": 1.2460930347442627, "learning_rate": 3.0002953429086616e-07, "loss": 0.1026, "step": 90100 }, { "epoch": 2.101052140712891, "grad_norm": 1.6389445066452026, "learning_rate": 2.999518124727973e-07, "loss": 0.1026, "step": 90110 }, { "epoch": 2.1012853020897087, "grad_norm": 3.279780387878418, "learning_rate": 2.998740906547286e-07, "loss": 0.115, "step": 90120 }, { "epoch": 2.101518463466527, "grad_norm": 1.6829243898391724, "learning_rate": 2.9979636883665984e-07, "loss": 0.1008, "step": 90130 }, { "epoch": 2.1017516248433448, "grad_norm": 1.3368040323257446, "learning_rate": 2.9971864701859105e-07, "loss": 0.0979, "step": 90140 }, { "epoch": 2.1019847862201626, "grad_norm": 1.2564003467559814, "learning_rate": 2.9964092520052226e-07, "loss": 0.0972, "step": 90150 }, { "epoch": 2.1022179475969804, "grad_norm": 1.2476176023483276, "learning_rate": 2.995632033824535e-07, "loss": 0.0989, "step": 90160 }, { "epoch": 2.1024511089737983, "grad_norm": 1.9904536008834839, "learning_rate": 2.9948548156438473e-07, "loss": 0.0994, "step": 90170 }, { "epoch": 2.1026842703506166, "grad_norm": 1.3659411668777466, "learning_rate": 2.99407759746316e-07, "loss": 0.0972, "step": 90180 }, { "epoch": 2.1029174317274344, "grad_norm": 1.5701409578323364, "learning_rate": 2.993300379282472e-07, "loss": 0.0927, "step": 90190 }, { "epoch": 2.1031505931042522, "grad_norm": 2.6488702297210693, 
"learning_rate": 2.992523161101784e-07, "loss": 0.1111, "step": 90200 }, { "epoch": 2.10338375448107, "grad_norm": 1.3917187452316284, "learning_rate": 2.991745942921097e-07, "loss": 0.1139, "step": 90210 }, { "epoch": 2.1036169158578883, "grad_norm": 2.277284622192383, "learning_rate": 2.9909687247404094e-07, "loss": 0.1173, "step": 90220 }, { "epoch": 2.103850077234706, "grad_norm": 2.424978733062744, "learning_rate": 2.990191506559721e-07, "loss": 0.094, "step": 90230 }, { "epoch": 2.104083238611524, "grad_norm": 1.6862062215805054, "learning_rate": 2.9894142883790336e-07, "loss": 0.1058, "step": 90240 }, { "epoch": 2.104316399988342, "grad_norm": 3.2280399799346924, "learning_rate": 2.988637070198346e-07, "loss": 0.1065, "step": 90250 }, { "epoch": 2.1045495613651597, "grad_norm": 1.8455419540405273, "learning_rate": 2.9878598520176583e-07, "loss": 0.0974, "step": 90260 }, { "epoch": 2.104782722741978, "grad_norm": 2.551913261413574, "learning_rate": 2.9870826338369704e-07, "loss": 0.1148, "step": 90270 }, { "epoch": 2.105015884118796, "grad_norm": 2.060554265975952, "learning_rate": 2.986305415656283e-07, "loss": 0.1078, "step": 90280 }, { "epoch": 2.1052490454956136, "grad_norm": 1.7543985843658447, "learning_rate": 2.985528197475595e-07, "loss": 0.0966, "step": 90290 }, { "epoch": 2.1054822068724315, "grad_norm": 1.4978692531585693, "learning_rate": 2.9847509792949077e-07, "loss": 0.094, "step": 90300 }, { "epoch": 2.1057153682492493, "grad_norm": 1.828139305114746, "learning_rate": 2.98397376111422e-07, "loss": 0.1106, "step": 90310 }, { "epoch": 2.1059485296260676, "grad_norm": 1.7215384244918823, "learning_rate": 2.983196542933532e-07, "loss": 0.1122, "step": 90320 }, { "epoch": 2.1061816910028854, "grad_norm": 1.1718864440917969, "learning_rate": 2.9824193247528445e-07, "loss": 0.0946, "step": 90330 }, { "epoch": 2.1064148523797033, "grad_norm": 1.4802184104919434, "learning_rate": 2.981642106572157e-07, "loss": 0.1077, "step": 90340 }, { "epoch": 
2.106648013756521, "grad_norm": 1.476861834526062, "learning_rate": 2.9808648883914687e-07, "loss": 0.0991, "step": 90350 }, { "epoch": 2.1068811751333394, "grad_norm": 1.7016587257385254, "learning_rate": 2.9800876702107813e-07, "loss": 0.0979, "step": 90360 }, { "epoch": 2.1071143365101572, "grad_norm": 1.554193139076233, "learning_rate": 2.979310452030094e-07, "loss": 0.1088, "step": 90370 }, { "epoch": 2.107347497886975, "grad_norm": 1.5750337839126587, "learning_rate": 2.978533233849406e-07, "loss": 0.1153, "step": 90380 }, { "epoch": 2.107580659263793, "grad_norm": 1.6729494333267212, "learning_rate": 2.977756015668718e-07, "loss": 0.106, "step": 90390 }, { "epoch": 2.1078138206406107, "grad_norm": 1.70180344581604, "learning_rate": 2.9769787974880307e-07, "loss": 0.1059, "step": 90400 }, { "epoch": 2.108046982017429, "grad_norm": 2.5878942012786865, "learning_rate": 2.976201579307343e-07, "loss": 0.1006, "step": 90410 }, { "epoch": 2.108280143394247, "grad_norm": 1.8211064338684082, "learning_rate": 2.9754243611266554e-07, "loss": 0.0976, "step": 90420 }, { "epoch": 2.1085133047710647, "grad_norm": 1.3398919105529785, "learning_rate": 2.9746471429459675e-07, "loss": 0.0929, "step": 90430 }, { "epoch": 2.1087464661478825, "grad_norm": 1.568200945854187, "learning_rate": 2.9738699247652796e-07, "loss": 0.0928, "step": 90440 }, { "epoch": 2.1089796275247004, "grad_norm": 1.4958479404449463, "learning_rate": 2.973092706584592e-07, "loss": 0.1137, "step": 90450 }, { "epoch": 2.1092127889015186, "grad_norm": 1.5347892045974731, "learning_rate": 2.972315488403905e-07, "loss": 0.1067, "step": 90460 }, { "epoch": 2.1094459502783365, "grad_norm": 1.7976372241973877, "learning_rate": 2.9715382702232164e-07, "loss": 0.1075, "step": 90470 }, { "epoch": 2.1096791116551543, "grad_norm": 1.1599856615066528, "learning_rate": 2.970761052042529e-07, "loss": 0.096, "step": 90480 }, { "epoch": 2.109912273031972, "grad_norm": 1.7637184858322144, "learning_rate": 
2.9699838338618417e-07, "loss": 0.1111, "step": 90490 }, { "epoch": 2.11014543440879, "grad_norm": 1.7538740634918213, "learning_rate": 2.9692066156811543e-07, "loss": 0.1116, "step": 90500 }, { "epoch": 2.1103785957856083, "grad_norm": 1.4229559898376465, "learning_rate": 2.968429397500466e-07, "loss": 0.1007, "step": 90510 }, { "epoch": 2.110611757162426, "grad_norm": 1.0800625085830688, "learning_rate": 2.9676521793197785e-07, "loss": 0.1066, "step": 90520 }, { "epoch": 2.110844918539244, "grad_norm": 3.8029651641845703, "learning_rate": 2.966874961139091e-07, "loss": 0.1066, "step": 90530 }, { "epoch": 2.111078079916062, "grad_norm": 1.5789066553115845, "learning_rate": 2.966097742958403e-07, "loss": 0.0975, "step": 90540 }, { "epoch": 2.1113112412928796, "grad_norm": 2.0696566104888916, "learning_rate": 2.9653205247777153e-07, "loss": 0.1086, "step": 90550 }, { "epoch": 2.111544402669698, "grad_norm": 1.25355863571167, "learning_rate": 2.964543306597028e-07, "loss": 0.0983, "step": 90560 }, { "epoch": 2.1117775640465157, "grad_norm": 1.3901888132095337, "learning_rate": 2.96376608841634e-07, "loss": 0.0967, "step": 90570 }, { "epoch": 2.1120107254233336, "grad_norm": 2.1525609493255615, "learning_rate": 2.9629888702356526e-07, "loss": 0.1079, "step": 90580 }, { "epoch": 2.1122438868001514, "grad_norm": 1.2626069784164429, "learning_rate": 2.9622116520549647e-07, "loss": 0.1086, "step": 90590 }, { "epoch": 2.1124770481769697, "grad_norm": 1.3998761177062988, "learning_rate": 2.961434433874277e-07, "loss": 0.0982, "step": 90600 }, { "epoch": 2.1127102095537875, "grad_norm": 2.0269572734832764, "learning_rate": 2.9606572156935894e-07, "loss": 0.1098, "step": 90610 }, { "epoch": 2.1129433709306054, "grad_norm": 2.593668222427368, "learning_rate": 2.959879997512902e-07, "loss": 0.1006, "step": 90620 }, { "epoch": 2.113176532307423, "grad_norm": 3.369847297668457, "learning_rate": 2.9591027793322136e-07, "loss": 0.1093, "step": 90630 }, { "epoch": 2.113409693684241, 
"grad_norm": 2.397808790206909, "learning_rate": 2.958325561151526e-07, "loss": 0.0995, "step": 90640 }, { "epoch": 2.1136428550610593, "grad_norm": 1.2315860986709595, "learning_rate": 2.957548342970839e-07, "loss": 0.1019, "step": 90650 }, { "epoch": 2.113876016437877, "grad_norm": 1.5302263498306274, "learning_rate": 2.956771124790151e-07, "loss": 0.1021, "step": 90660 }, { "epoch": 2.114109177814695, "grad_norm": 1.7642505168914795, "learning_rate": 2.955993906609463e-07, "loss": 0.1036, "step": 90670 }, { "epoch": 2.114342339191513, "grad_norm": 1.7175843715667725, "learning_rate": 2.9552166884287757e-07, "loss": 0.1029, "step": 90680 }, { "epoch": 2.1145755005683307, "grad_norm": 1.484773874282837, "learning_rate": 2.954439470248088e-07, "loss": 0.1065, "step": 90690 }, { "epoch": 2.114808661945149, "grad_norm": 1.7745846509933472, "learning_rate": 2.9536622520674004e-07, "loss": 0.0921, "step": 90700 }, { "epoch": 2.1150418233219668, "grad_norm": 2.46895170211792, "learning_rate": 2.9528850338867125e-07, "loss": 0.1078, "step": 90710 }, { "epoch": 2.1152749846987846, "grad_norm": 1.5070672035217285, "learning_rate": 2.9521078157060246e-07, "loss": 0.0993, "step": 90720 }, { "epoch": 2.1155081460756024, "grad_norm": 1.5430101156234741, "learning_rate": 2.951330597525337e-07, "loss": 0.105, "step": 90730 }, { "epoch": 2.1157413074524207, "grad_norm": 1.6243000030517578, "learning_rate": 2.95055337934465e-07, "loss": 0.1075, "step": 90740 }, { "epoch": 2.1159744688292386, "grad_norm": 2.702188730239868, "learning_rate": 2.9497761611639614e-07, "loss": 0.1083, "step": 90750 }, { "epoch": 2.1162076302060564, "grad_norm": 1.4180437326431274, "learning_rate": 2.948998942983274e-07, "loss": 0.1052, "step": 90760 }, { "epoch": 2.1164407915828742, "grad_norm": 1.2660160064697266, "learning_rate": 2.9482217248025866e-07, "loss": 0.1045, "step": 90770 }, { "epoch": 2.116673952959692, "grad_norm": 1.7358412742614746, "learning_rate": 2.9474445066218987e-07, "loss": 
0.1139, "step": 90780 }, { "epoch": 2.1169071143365104, "grad_norm": 2.2528228759765625, "learning_rate": 2.946667288441211e-07, "loss": 0.1183, "step": 90790 }, { "epoch": 2.117140275713328, "grad_norm": 3.246886968612671, "learning_rate": 2.9458900702605234e-07, "loss": 0.1089, "step": 90800 }, { "epoch": 2.117373437090146, "grad_norm": 2.0617828369140625, "learning_rate": 2.9451128520798355e-07, "loss": 0.1032, "step": 90810 }, { "epoch": 2.117606598466964, "grad_norm": 1.849209189414978, "learning_rate": 2.944335633899148e-07, "loss": 0.1133, "step": 90820 }, { "epoch": 2.1178397598437817, "grad_norm": 1.170933723449707, "learning_rate": 2.94355841571846e-07, "loss": 0.0955, "step": 90830 }, { "epoch": 2.1180729212206, "grad_norm": 1.1943809986114502, "learning_rate": 2.9427811975377723e-07, "loss": 0.1034, "step": 90840 }, { "epoch": 2.118306082597418, "grad_norm": 1.5301352739334106, "learning_rate": 2.942003979357085e-07, "loss": 0.1078, "step": 90850 }, { "epoch": 2.1185392439742357, "grad_norm": 1.729740023612976, "learning_rate": 2.9412267611763976e-07, "loss": 0.1077, "step": 90860 }, { "epoch": 2.1187724053510535, "grad_norm": 1.319132924079895, "learning_rate": 2.940449542995709e-07, "loss": 0.1024, "step": 90870 }, { "epoch": 2.1190055667278713, "grad_norm": 3.0522775650024414, "learning_rate": 2.939672324815022e-07, "loss": 0.1079, "step": 90880 }, { "epoch": 2.1192387281046896, "grad_norm": 1.1890711784362793, "learning_rate": 2.9388951066343344e-07, "loss": 0.1101, "step": 90890 }, { "epoch": 2.1194718894815074, "grad_norm": 1.770139455795288, "learning_rate": 2.9381178884536465e-07, "loss": 0.1065, "step": 90900 }, { "epoch": 2.1197050508583253, "grad_norm": 1.2357040643692017, "learning_rate": 2.9373406702729585e-07, "loss": 0.1128, "step": 90910 }, { "epoch": 2.119938212235143, "grad_norm": 1.0928325653076172, "learning_rate": 2.936563452092271e-07, "loss": 0.107, "step": 90920 }, { "epoch": 2.120171373611961, "grad_norm": 1.2939910888671875, 
"learning_rate": 2.935786233911584e-07, "loss": 0.0995, "step": 90930 }, { "epoch": 2.1204045349887792, "grad_norm": 1.332281231880188, "learning_rate": 2.935009015730896e-07, "loss": 0.103, "step": 90940 }, { "epoch": 2.120637696365597, "grad_norm": 2.551358938217163, "learning_rate": 2.934231797550208e-07, "loss": 0.1123, "step": 90950 }, { "epoch": 2.120870857742415, "grad_norm": 1.613110065460205, "learning_rate": 2.9334545793695206e-07, "loss": 0.1108, "step": 90960 }, { "epoch": 2.1211040191192327, "grad_norm": 1.1960132122039795, "learning_rate": 2.9326773611888327e-07, "loss": 0.1041, "step": 90970 }, { "epoch": 2.121337180496051, "grad_norm": 1.534055471420288, "learning_rate": 2.9319001430081453e-07, "loss": 0.1023, "step": 90980 }, { "epoch": 2.121570341872869, "grad_norm": 1.8599814176559448, "learning_rate": 2.9311229248274574e-07, "loss": 0.0877, "step": 90990 }, { "epoch": 2.1218035032496867, "grad_norm": 3.1046183109283447, "learning_rate": 2.9303457066467695e-07, "loss": 0.1115, "step": 91000 }, { "epoch": 2.1220366646265045, "grad_norm": 2.456573486328125, "learning_rate": 2.929568488466082e-07, "loss": 0.0959, "step": 91010 }, { "epoch": 2.1222698260033224, "grad_norm": 1.5925549268722534, "learning_rate": 2.928791270285395e-07, "loss": 0.1085, "step": 91020 }, { "epoch": 2.1225029873801406, "grad_norm": 2.631896734237671, "learning_rate": 2.928014052104707e-07, "loss": 0.1028, "step": 91030 }, { "epoch": 2.1227361487569585, "grad_norm": 1.3484936952590942, "learning_rate": 2.927236833924019e-07, "loss": 0.1007, "step": 91040 }, { "epoch": 2.1229693101337763, "grad_norm": 3.033108711242676, "learning_rate": 2.9264596157433315e-07, "loss": 0.0945, "step": 91050 }, { "epoch": 2.123202471510594, "grad_norm": 1.0975762605667114, "learning_rate": 2.9256823975626436e-07, "loss": 0.094, "step": 91060 }, { "epoch": 2.123435632887412, "grad_norm": 1.7116209268569946, "learning_rate": 2.924905179381956e-07, "loss": 0.1004, "step": 91070 }, { "epoch": 
2.1236687942642303, "grad_norm": 1.9792832136154175, "learning_rate": 2.9241279612012684e-07, "loss": 0.0965, "step": 91080 }, { "epoch": 2.123901955641048, "grad_norm": 1.8809279203414917, "learning_rate": 2.9233507430205804e-07, "loss": 0.1022, "step": 91090 }, { "epoch": 2.124135117017866, "grad_norm": 1.2764872312545776, "learning_rate": 2.922573524839893e-07, "loss": 0.0942, "step": 91100 }, { "epoch": 2.124368278394684, "grad_norm": 2.0144288539886475, "learning_rate": 2.9217963066592057e-07, "loss": 0.0976, "step": 91110 }, { "epoch": 2.124601439771502, "grad_norm": 2.969611406326294, "learning_rate": 2.921019088478517e-07, "loss": 0.1076, "step": 91120 }, { "epoch": 2.12483460114832, "grad_norm": 1.156899333000183, "learning_rate": 2.92024187029783e-07, "loss": 0.1029, "step": 91130 }, { "epoch": 2.1250677625251377, "grad_norm": 1.225796103477478, "learning_rate": 2.9194646521171425e-07, "loss": 0.0968, "step": 91140 }, { "epoch": 2.1253009239019556, "grad_norm": 1.4675841331481934, "learning_rate": 2.9186874339364546e-07, "loss": 0.1038, "step": 91150 }, { "epoch": 2.1255340852787734, "grad_norm": 3.525702714920044, "learning_rate": 2.9179102157557667e-07, "loss": 0.1123, "step": 91160 }, { "epoch": 2.1257672466555917, "grad_norm": 1.910338282585144, "learning_rate": 2.9171329975750793e-07, "loss": 0.1153, "step": 91170 }, { "epoch": 2.1260004080324095, "grad_norm": 2.2609736919403076, "learning_rate": 2.9163557793943914e-07, "loss": 0.1035, "step": 91180 }, { "epoch": 2.1262335694092274, "grad_norm": 1.4056339263916016, "learning_rate": 2.915578561213704e-07, "loss": 0.1023, "step": 91190 }, { "epoch": 2.126466730786045, "grad_norm": 3.074922561645508, "learning_rate": 2.914801343033016e-07, "loss": 0.1136, "step": 91200 }, { "epoch": 2.126699892162863, "grad_norm": 1.368812084197998, "learning_rate": 2.914024124852328e-07, "loss": 0.1025, "step": 91210 }, { "epoch": 2.1269330535396813, "grad_norm": 1.162697672843933, "learning_rate": 
2.913246906671641e-07, "loss": 0.1037, "step": 91220 }, { "epoch": 2.127166214916499, "grad_norm": 2.022327423095703, "learning_rate": 2.9124696884909534e-07, "loss": 0.093, "step": 91230 }, { "epoch": 2.127399376293317, "grad_norm": 2.223123788833618, "learning_rate": 2.911692470310265e-07, "loss": 0.1009, "step": 91240 }, { "epoch": 2.127632537670135, "grad_norm": 1.9287827014923096, "learning_rate": 2.9109152521295776e-07, "loss": 0.1077, "step": 91250 }, { "epoch": 2.1278656990469527, "grad_norm": 1.1718549728393555, "learning_rate": 2.91013803394889e-07, "loss": 0.0966, "step": 91260 }, { "epoch": 2.128098860423771, "grad_norm": 2.7617506980895996, "learning_rate": 2.9093608157682023e-07, "loss": 0.1157, "step": 91270 }, { "epoch": 2.128332021800589, "grad_norm": 1.377798080444336, "learning_rate": 2.9085835975875144e-07, "loss": 0.0986, "step": 91280 }, { "epoch": 2.1285651831774066, "grad_norm": 1.591047763824463, "learning_rate": 2.907806379406827e-07, "loss": 0.1028, "step": 91290 }, { "epoch": 2.1287983445542245, "grad_norm": 1.9384773969650269, "learning_rate": 2.907029161226139e-07, "loss": 0.0973, "step": 91300 }, { "epoch": 2.1290315059310423, "grad_norm": 3.269984483718872, "learning_rate": 2.906251943045452e-07, "loss": 0.1091, "step": 91310 }, { "epoch": 2.1292646673078606, "grad_norm": 3.6469898223876953, "learning_rate": 2.905474724864764e-07, "loss": 0.1119, "step": 91320 }, { "epoch": 2.1294978286846784, "grad_norm": 1.2805204391479492, "learning_rate": 2.904697506684076e-07, "loss": 0.1126, "step": 91330 }, { "epoch": 2.1297309900614962, "grad_norm": 2.891263484954834, "learning_rate": 2.9039202885033886e-07, "loss": 0.1097, "step": 91340 }, { "epoch": 2.129964151438314, "grad_norm": 1.6315410137176514, "learning_rate": 2.903143070322701e-07, "loss": 0.1111, "step": 91350 }, { "epoch": 2.1301973128151324, "grad_norm": 1.56320321559906, "learning_rate": 2.902365852142013e-07, "loss": 0.1132, "step": 91360 }, { "epoch": 2.13043047419195, 
"grad_norm": 1.370783805847168, "learning_rate": 2.9015886339613254e-07, "loss": 0.1005, "step": 91370 }, { "epoch": 2.130663635568768, "grad_norm": 1.5951873064041138, "learning_rate": 2.900811415780638e-07, "loss": 0.1119, "step": 91380 }, { "epoch": 2.130896796945586, "grad_norm": 3.1986501216888428, "learning_rate": 2.9000341975999506e-07, "loss": 0.1019, "step": 91390 }, { "epoch": 2.1311299583224037, "grad_norm": 1.5012162923812866, "learning_rate": 2.899256979419262e-07, "loss": 0.1155, "step": 91400 }, { "epoch": 2.131363119699222, "grad_norm": 2.201870918273926, "learning_rate": 2.898479761238575e-07, "loss": 0.1052, "step": 91410 }, { "epoch": 2.13159628107604, "grad_norm": 1.6553550958633423, "learning_rate": 2.8977025430578874e-07, "loss": 0.0935, "step": 91420 }, { "epoch": 2.1318294424528577, "grad_norm": 1.5200258493423462, "learning_rate": 2.8969253248771995e-07, "loss": 0.0972, "step": 91430 }, { "epoch": 2.1320626038296755, "grad_norm": 2.8214521408081055, "learning_rate": 2.8961481066965116e-07, "loss": 0.1182, "step": 91440 }, { "epoch": 2.1322957652064938, "grad_norm": 2.846405029296875, "learning_rate": 2.895370888515824e-07, "loss": 0.1072, "step": 91450 }, { "epoch": 2.1325289265833116, "grad_norm": 2.2240960597991943, "learning_rate": 2.8945936703351363e-07, "loss": 0.1023, "step": 91460 }, { "epoch": 2.1327620879601294, "grad_norm": 2.155486822128296, "learning_rate": 2.893816452154449e-07, "loss": 0.1027, "step": 91470 }, { "epoch": 2.1329952493369473, "grad_norm": 1.258327841758728, "learning_rate": 2.893039233973761e-07, "loss": 0.0966, "step": 91480 }, { "epoch": 2.133228410713765, "grad_norm": 1.5104323625564575, "learning_rate": 2.892262015793073e-07, "loss": 0.099, "step": 91490 }, { "epoch": 2.1334615720905834, "grad_norm": 2.5949647426605225, "learning_rate": 2.891484797612386e-07, "loss": 0.1024, "step": 91500 }, { "epoch": 2.1336947334674012, "grad_norm": 2.3950464725494385, "learning_rate": 2.8907075794316984e-07, "loss": 
0.0972, "step": 91510 }, { "epoch": 2.133927894844219, "grad_norm": 1.8264541625976562, "learning_rate": 2.88993036125101e-07, "loss": 0.1004, "step": 91520 }, { "epoch": 2.134161056221037, "grad_norm": 1.6790729761123657, "learning_rate": 2.8891531430703226e-07, "loss": 0.103, "step": 91530 }, { "epoch": 2.1343942175978547, "grad_norm": 2.3383684158325195, "learning_rate": 2.888375924889635e-07, "loss": 0.1114, "step": 91540 }, { "epoch": 2.134627378974673, "grad_norm": 2.420978307723999, "learning_rate": 2.8875987067089473e-07, "loss": 0.1031, "step": 91550 }, { "epoch": 2.134860540351491, "grad_norm": 1.5626130104064941, "learning_rate": 2.8868214885282594e-07, "loss": 0.0979, "step": 91560 }, { "epoch": 2.1350937017283087, "grad_norm": 1.3555608987808228, "learning_rate": 2.886044270347572e-07, "loss": 0.1021, "step": 91570 }, { "epoch": 2.1353268631051265, "grad_norm": 2.0587892532348633, "learning_rate": 2.885267052166884e-07, "loss": 0.0881, "step": 91580 }, { "epoch": 2.1355600244819444, "grad_norm": 1.5355175733566284, "learning_rate": 2.8844898339861967e-07, "loss": 0.1035, "step": 91590 }, { "epoch": 2.1357931858587627, "grad_norm": 1.5853736400604248, "learning_rate": 2.883712615805509e-07, "loss": 0.105, "step": 91600 }, { "epoch": 2.1360263472355805, "grad_norm": 2.2593436241149902, "learning_rate": 2.882935397624821e-07, "loss": 0.101, "step": 91610 }, { "epoch": 2.1362595086123983, "grad_norm": 1.4588475227355957, "learning_rate": 2.8821581794441335e-07, "loss": 0.1033, "step": 91620 }, { "epoch": 2.136492669989216, "grad_norm": 2.128328800201416, "learning_rate": 2.881380961263446e-07, "loss": 0.1101, "step": 91630 }, { "epoch": 2.136725831366034, "grad_norm": 1.9776110649108887, "learning_rate": 2.8806037430827577e-07, "loss": 0.1034, "step": 91640 }, { "epoch": 2.1369589927428523, "grad_norm": 1.5097893476486206, "learning_rate": 2.8798265249020703e-07, "loss": 0.0959, "step": 91650 }, { "epoch": 2.13719215411967, "grad_norm": 3.3988776206970215, 
"learning_rate": 2.879049306721383e-07, "loss": 0.1053, "step": 91660 }, { "epoch": 2.137425315496488, "grad_norm": 1.3579580783843994, "learning_rate": 2.878272088540695e-07, "loss": 0.0981, "step": 91670 }, { "epoch": 2.137658476873306, "grad_norm": 4.820269584655762, "learning_rate": 2.877494870360007e-07, "loss": 0.1125, "step": 91680 }, { "epoch": 2.137891638250124, "grad_norm": 2.045344591140747, "learning_rate": 2.8767176521793197e-07, "loss": 0.1143, "step": 91690 }, { "epoch": 2.138124799626942, "grad_norm": 1.320054054260254, "learning_rate": 2.875940433998632e-07, "loss": 0.1029, "step": 91700 }, { "epoch": 2.1383579610037597, "grad_norm": 2.948389768600464, "learning_rate": 2.8751632158179444e-07, "loss": 0.0941, "step": 91710 }, { "epoch": 2.1385911223805776, "grad_norm": 2.884568691253662, "learning_rate": 2.8743859976372565e-07, "loss": 0.0992, "step": 91720 }, { "epoch": 2.1388242837573954, "grad_norm": 2.57358455657959, "learning_rate": 2.8736087794565686e-07, "loss": 0.1027, "step": 91730 }, { "epoch": 2.1390574451342137, "grad_norm": 1.5156607627868652, "learning_rate": 2.872831561275881e-07, "loss": 0.1025, "step": 91740 }, { "epoch": 2.1392906065110315, "grad_norm": 2.0382492542266846, "learning_rate": 2.872054343095194e-07, "loss": 0.1045, "step": 91750 }, { "epoch": 2.1395237678878494, "grad_norm": 1.3341041803359985, "learning_rate": 2.8712771249145054e-07, "loss": 0.0977, "step": 91760 }, { "epoch": 2.139756929264667, "grad_norm": 1.6739332675933838, "learning_rate": 2.870499906733818e-07, "loss": 0.0946, "step": 91770 }, { "epoch": 2.139990090641485, "grad_norm": 1.40274178981781, "learning_rate": 2.8697226885531307e-07, "loss": 0.0902, "step": 91780 }, { "epoch": 2.1402232520183033, "grad_norm": 1.3159377574920654, "learning_rate": 2.8689454703724433e-07, "loss": 0.0905, "step": 91790 }, { "epoch": 2.140456413395121, "grad_norm": 1.4783483743667603, "learning_rate": 2.868168252191755e-07, "loss": 0.1015, "step": 91800 }, { "epoch": 
2.140689574771939, "grad_norm": 1.0590778589248657, "learning_rate": 2.8673910340110675e-07, "loss": 0.0909, "step": 91810 }, { "epoch": 2.140922736148757, "grad_norm": 1.1813756227493286, "learning_rate": 2.86661381583038e-07, "loss": 0.1024, "step": 91820 }, { "epoch": 2.141155897525575, "grad_norm": 1.2164994478225708, "learning_rate": 2.865836597649692e-07, "loss": 0.1083, "step": 91830 }, { "epoch": 2.141389058902393, "grad_norm": 2.833559036254883, "learning_rate": 2.8650593794690043e-07, "loss": 0.1001, "step": 91840 }, { "epoch": 2.141622220279211, "grad_norm": 1.8957537412643433, "learning_rate": 2.864282161288317e-07, "loss": 0.1026, "step": 91850 }, { "epoch": 2.1418553816560286, "grad_norm": 1.391913652420044, "learning_rate": 2.863504943107629e-07, "loss": 0.1013, "step": 91860 }, { "epoch": 2.1420885430328465, "grad_norm": 1.4724587202072144, "learning_rate": 2.8627277249269416e-07, "loss": 0.1073, "step": 91870 }, { "epoch": 2.1423217044096647, "grad_norm": 1.500234603881836, "learning_rate": 2.8619505067462537e-07, "loss": 0.1004, "step": 91880 }, { "epoch": 2.1425548657864826, "grad_norm": 1.2741483449935913, "learning_rate": 2.861173288565566e-07, "loss": 0.1091, "step": 91890 }, { "epoch": 2.1427880271633004, "grad_norm": 1.429667592048645, "learning_rate": 2.8603960703848784e-07, "loss": 0.1054, "step": 91900 }, { "epoch": 2.1430211885401182, "grad_norm": 2.0270731449127197, "learning_rate": 2.859618852204191e-07, "loss": 0.0964, "step": 91910 }, { "epoch": 2.143254349916936, "grad_norm": 1.4810250997543335, "learning_rate": 2.8588416340235026e-07, "loss": 0.1016, "step": 91920 }, { "epoch": 2.1434875112937544, "grad_norm": 1.1950265169143677, "learning_rate": 2.858064415842815e-07, "loss": 0.106, "step": 91930 }, { "epoch": 2.143720672670572, "grad_norm": 1.24570631980896, "learning_rate": 2.857287197662128e-07, "loss": 0.0981, "step": 91940 }, { "epoch": 2.14395383404739, "grad_norm": 2.217057466506958, "learning_rate": 2.85650997948144e-07, 
"loss": 0.1133, "step": 91950 }, { "epoch": 2.144186995424208, "grad_norm": 1.1311334371566772, "learning_rate": 2.855732761300752e-07, "loss": 0.0979, "step": 91960 }, { "epoch": 2.1444201568010257, "grad_norm": 2.600938558578491, "learning_rate": 2.8549555431200647e-07, "loss": 0.1108, "step": 91970 }, { "epoch": 2.144653318177844, "grad_norm": 1.3551208972930908, "learning_rate": 2.854178324939377e-07, "loss": 0.1004, "step": 91980 }, { "epoch": 2.144886479554662, "grad_norm": 1.3949111700057983, "learning_rate": 2.8534011067586894e-07, "loss": 0.0951, "step": 91990 }, { "epoch": 2.1451196409314797, "grad_norm": 1.7307605743408203, "learning_rate": 2.8526238885780015e-07, "loss": 0.1047, "step": 92000 }, { "epoch": 2.1453528023082975, "grad_norm": 1.8818949460983276, "learning_rate": 2.8518466703973136e-07, "loss": 0.095, "step": 92010 }, { "epoch": 2.1455859636851153, "grad_norm": 2.054490804672241, "learning_rate": 2.851069452216626e-07, "loss": 0.0963, "step": 92020 }, { "epoch": 2.1458191250619336, "grad_norm": 2.468693256378174, "learning_rate": 2.850292234035939e-07, "loss": 0.108, "step": 92030 }, { "epoch": 2.1460522864387515, "grad_norm": 1.463753342628479, "learning_rate": 2.8495150158552504e-07, "loss": 0.1019, "step": 92040 }, { "epoch": 2.1462854478155693, "grad_norm": 2.855454444885254, "learning_rate": 2.848737797674563e-07, "loss": 0.0981, "step": 92050 }, { "epoch": 2.146518609192387, "grad_norm": 1.2796562910079956, "learning_rate": 2.8479605794938756e-07, "loss": 0.1004, "step": 92060 }, { "epoch": 2.1467517705692054, "grad_norm": 1.1562176942825317, "learning_rate": 2.8471833613131877e-07, "loss": 0.1046, "step": 92070 }, { "epoch": 2.1469849319460232, "grad_norm": 2.311934232711792, "learning_rate": 2.8464061431325e-07, "loss": 0.1113, "step": 92080 }, { "epoch": 2.147218093322841, "grad_norm": 1.232558012008667, "learning_rate": 2.8456289249518124e-07, "loss": 0.105, "step": 92090 }, { "epoch": 2.147451254699659, "grad_norm": 
1.8712103366851807, "learning_rate": 2.8448517067711245e-07, "loss": 0.0998, "step": 92100 }, { "epoch": 2.1476844160764768, "grad_norm": 2.388265371322632, "learning_rate": 2.844074488590437e-07, "loss": 0.0999, "step": 92110 }, { "epoch": 2.147917577453295, "grad_norm": 1.6661347150802612, "learning_rate": 2.843297270409749e-07, "loss": 0.1114, "step": 92120 }, { "epoch": 2.148150738830113, "grad_norm": 1.0608075857162476, "learning_rate": 2.8425200522290613e-07, "loss": 0.0998, "step": 92130 }, { "epoch": 2.1483839002069307, "grad_norm": 1.5402897596359253, "learning_rate": 2.841742834048374e-07, "loss": 0.1017, "step": 92140 }, { "epoch": 2.1486170615837485, "grad_norm": 1.1509075164794922, "learning_rate": 2.8409656158676866e-07, "loss": 0.1117, "step": 92150 }, { "epoch": 2.1488502229605664, "grad_norm": 2.3125100135803223, "learning_rate": 2.840188397686998e-07, "loss": 0.1169, "step": 92160 }, { "epoch": 2.1490833843373847, "grad_norm": 1.2044343948364258, "learning_rate": 2.839411179506311e-07, "loss": 0.0948, "step": 92170 }, { "epoch": 2.1493165457142025, "grad_norm": 1.6745593547821045, "learning_rate": 2.8386339613256234e-07, "loss": 0.1062, "step": 92180 }, { "epoch": 2.1495497070910203, "grad_norm": 1.3591818809509277, "learning_rate": 2.837856743144936e-07, "loss": 0.1074, "step": 92190 }, { "epoch": 2.149782868467838, "grad_norm": 2.0198771953582764, "learning_rate": 2.8370795249642475e-07, "loss": 0.1083, "step": 92200 }, { "epoch": 2.1500160298446565, "grad_norm": 1.0770686864852905, "learning_rate": 2.83630230678356e-07, "loss": 0.0997, "step": 92210 }, { "epoch": 2.1502491912214743, "grad_norm": 1.5677261352539062, "learning_rate": 2.835525088602873e-07, "loss": 0.1098, "step": 92220 }, { "epoch": 2.150482352598292, "grad_norm": 1.3713487386703491, "learning_rate": 2.834747870422185e-07, "loss": 0.1038, "step": 92230 }, { "epoch": 2.15071551397511, "grad_norm": 1.1973185539245605, "learning_rate": 2.833970652241497e-07, "loss": 0.0992, "step": 
92240 }, { "epoch": 2.150948675351928, "grad_norm": 1.5036178827285767, "learning_rate": 2.8331934340608096e-07, "loss": 0.1067, "step": 92250 }, { "epoch": 2.151181836728746, "grad_norm": 1.5161184072494507, "learning_rate": 2.8324162158801217e-07, "loss": 0.1021, "step": 92260 }, { "epoch": 2.151414998105564, "grad_norm": 2.4038052558898926, "learning_rate": 2.8316389976994343e-07, "loss": 0.1042, "step": 92270 }, { "epoch": 2.1516481594823818, "grad_norm": 1.6941300630569458, "learning_rate": 2.8308617795187464e-07, "loss": 0.1114, "step": 92280 }, { "epoch": 2.1518813208591996, "grad_norm": 1.617699384689331, "learning_rate": 2.8300845613380585e-07, "loss": 0.1127, "step": 92290 }, { "epoch": 2.1521144822360174, "grad_norm": 2.3446691036224365, "learning_rate": 2.829307343157371e-07, "loss": 0.0976, "step": 92300 }, { "epoch": 2.1523476436128357, "grad_norm": 1.3257808685302734, "learning_rate": 2.828530124976684e-07, "loss": 0.112, "step": 92310 }, { "epoch": 2.1525808049896535, "grad_norm": 2.443218469619751, "learning_rate": 2.8277529067959953e-07, "loss": 0.0984, "step": 92320 }, { "epoch": 2.1528139663664714, "grad_norm": 2.459817886352539, "learning_rate": 2.826975688615308e-07, "loss": 0.102, "step": 92330 }, { "epoch": 2.153047127743289, "grad_norm": 3.134060859680176, "learning_rate": 2.8261984704346205e-07, "loss": 0.1035, "step": 92340 }, { "epoch": 2.153280289120107, "grad_norm": 1.8789288997650146, "learning_rate": 2.8254212522539326e-07, "loss": 0.1001, "step": 92350 }, { "epoch": 2.1535134504969253, "grad_norm": 1.3148130178451538, "learning_rate": 2.8246440340732447e-07, "loss": 0.1028, "step": 92360 }, { "epoch": 2.153746611873743, "grad_norm": 2.0254764556884766, "learning_rate": 2.8238668158925573e-07, "loss": 0.1129, "step": 92370 }, { "epoch": 2.153979773250561, "grad_norm": 1.5079342126846313, "learning_rate": 2.8230895977118694e-07, "loss": 0.0971, "step": 92380 }, { "epoch": 2.154212934627379, "grad_norm": 1.1818034648895264, 
"learning_rate": 2.822312379531182e-07, "loss": 0.1084, "step": 92390 }, { "epoch": 2.1544460960041967, "grad_norm": 1.4119136333465576, "learning_rate": 2.821535161350494e-07, "loss": 0.1014, "step": 92400 }, { "epoch": 2.154679257381015, "grad_norm": 1.7994109392166138, "learning_rate": 2.820757943169806e-07, "loss": 0.0977, "step": 92410 }, { "epoch": 2.154912418757833, "grad_norm": 1.6394197940826416, "learning_rate": 2.819980724989119e-07, "loss": 0.1039, "step": 92420 }, { "epoch": 2.1551455801346506, "grad_norm": 1.274523377418518, "learning_rate": 2.8192035068084315e-07, "loss": 0.1115, "step": 92430 }, { "epoch": 2.1553787415114685, "grad_norm": 0.9702813029289246, "learning_rate": 2.818426288627743e-07, "loss": 0.0972, "step": 92440 }, { "epoch": 2.1556119028882867, "grad_norm": 1.9176442623138428, "learning_rate": 2.8176490704470557e-07, "loss": 0.1075, "step": 92450 }, { "epoch": 2.1558450642651046, "grad_norm": 1.0817292928695679, "learning_rate": 2.8168718522663683e-07, "loss": 0.093, "step": 92460 }, { "epoch": 2.1560782256419224, "grad_norm": 1.4617093801498413, "learning_rate": 2.8160946340856804e-07, "loss": 0.0936, "step": 92470 }, { "epoch": 2.1563113870187403, "grad_norm": 1.515236258506775, "learning_rate": 2.8153174159049925e-07, "loss": 0.106, "step": 92480 }, { "epoch": 2.156544548395558, "grad_norm": 1.9639456272125244, "learning_rate": 2.814540197724305e-07, "loss": 0.1001, "step": 92490 }, { "epoch": 2.1567777097723764, "grad_norm": 1.1811678409576416, "learning_rate": 2.813762979543617e-07, "loss": 0.0984, "step": 92500 }, { "epoch": 2.157010871149194, "grad_norm": 1.2719917297363281, "learning_rate": 2.81298576136293e-07, "loss": 0.1029, "step": 92510 }, { "epoch": 2.157244032526012, "grad_norm": 1.7674825191497803, "learning_rate": 2.812208543182242e-07, "loss": 0.1035, "step": 92520 }, { "epoch": 2.15747719390283, "grad_norm": 1.7812217473983765, "learning_rate": 2.811431325001554e-07, "loss": 0.0895, "step": 92530 }, { "epoch": 
2.1577103552796477, "grad_norm": 2.607983112335205, "learning_rate": 2.8106541068208666e-07, "loss": 0.112, "step": 92540 }, { "epoch": 2.157943516656466, "grad_norm": 1.4372210502624512, "learning_rate": 2.809876888640179e-07, "loss": 0.092, "step": 92550 }, { "epoch": 2.158176678033284, "grad_norm": 1.4713622331619263, "learning_rate": 2.809099670459491e-07, "loss": 0.0983, "step": 92560 }, { "epoch": 2.1584098394101017, "grad_norm": 2.258039712905884, "learning_rate": 2.8083224522788034e-07, "loss": 0.1186, "step": 92570 }, { "epoch": 2.1586430007869195, "grad_norm": 1.7769792079925537, "learning_rate": 2.807545234098116e-07, "loss": 0.1126, "step": 92580 }, { "epoch": 2.158876162163738, "grad_norm": 1.5456455945968628, "learning_rate": 2.806768015917428e-07, "loss": 0.0946, "step": 92590 }, { "epoch": 2.1591093235405556, "grad_norm": 2.551509380340576, "learning_rate": 2.80599079773674e-07, "loss": 0.0993, "step": 92600 }, { "epoch": 2.1593424849173735, "grad_norm": 1.5977884531021118, "learning_rate": 2.805213579556053e-07, "loss": 0.0965, "step": 92610 }, { "epoch": 2.1595756462941913, "grad_norm": 1.671575903892517, "learning_rate": 2.8044363613753655e-07, "loss": 0.1171, "step": 92620 }, { "epoch": 2.159808807671009, "grad_norm": 2.535630464553833, "learning_rate": 2.8036591431946776e-07, "loss": 0.1033, "step": 92630 }, { "epoch": 2.1600419690478274, "grad_norm": 2.1473145484924316, "learning_rate": 2.8028819250139897e-07, "loss": 0.1048, "step": 92640 }, { "epoch": 2.1602751304246453, "grad_norm": 1.891550898551941, "learning_rate": 2.8021047068333023e-07, "loss": 0.1093, "step": 92650 }, { "epoch": 2.160508291801463, "grad_norm": 1.668594241142273, "learning_rate": 2.8013274886526144e-07, "loss": 0.0914, "step": 92660 }, { "epoch": 2.160741453178281, "grad_norm": 1.3715095520019531, "learning_rate": 2.800550270471927e-07, "loss": 0.1097, "step": 92670 }, { "epoch": 2.1609746145550988, "grad_norm": 1.9469234943389893, "learning_rate": 
2.799773052291239e-07, "loss": 0.0907, "step": 92680 }, { "epoch": 2.161207775931917, "grad_norm": 1.5870457887649536, "learning_rate": 2.798995834110551e-07, "loss": 0.1152, "step": 92690 }, { "epoch": 2.161440937308735, "grad_norm": 1.3182053565979004, "learning_rate": 2.798218615929864e-07, "loss": 0.1123, "step": 92700 }, { "epoch": 2.1616740986855527, "grad_norm": 1.9691882133483887, "learning_rate": 2.7974413977491764e-07, "loss": 0.1154, "step": 92710 }, { "epoch": 2.1619072600623706, "grad_norm": 1.0756032466888428, "learning_rate": 2.796664179568488e-07, "loss": 0.0946, "step": 92720 }, { "epoch": 2.1621404214391884, "grad_norm": 1.5775214433670044, "learning_rate": 2.7958869613878006e-07, "loss": 0.0974, "step": 92730 }, { "epoch": 2.1623735828160067, "grad_norm": 2.149364948272705, "learning_rate": 2.795109743207113e-07, "loss": 0.1034, "step": 92740 }, { "epoch": 2.1626067441928245, "grad_norm": 1.341925024986267, "learning_rate": 2.7943325250264253e-07, "loss": 0.0953, "step": 92750 }, { "epoch": 2.1628399055696423, "grad_norm": 2.3054797649383545, "learning_rate": 2.7935553068457374e-07, "loss": 0.1018, "step": 92760 }, { "epoch": 2.16307306694646, "grad_norm": 1.1812736988067627, "learning_rate": 2.79277808866505e-07, "loss": 0.1081, "step": 92770 }, { "epoch": 2.163306228323278, "grad_norm": 1.2084484100341797, "learning_rate": 2.792000870484362e-07, "loss": 0.1029, "step": 92780 }, { "epoch": 2.1635393897000963, "grad_norm": 2.13179874420166, "learning_rate": 2.791223652303675e-07, "loss": 0.0933, "step": 92790 }, { "epoch": 2.163772551076914, "grad_norm": 1.1427558660507202, "learning_rate": 2.790446434122987e-07, "loss": 0.1065, "step": 92800 }, { "epoch": 2.164005712453732, "grad_norm": 1.9143444299697876, "learning_rate": 2.789669215942299e-07, "loss": 0.0935, "step": 92810 }, { "epoch": 2.16423887383055, "grad_norm": 1.4280518293380737, "learning_rate": 2.7888919977616116e-07, "loss": 0.1113, "step": 92820 }, { "epoch": 2.164472035207368, 
"grad_norm": 1.451638102531433, "learning_rate": 2.788114779580924e-07, "loss": 0.1027, "step": 92830 }, { "epoch": 2.164705196584186, "grad_norm": 1.7283504009246826, "learning_rate": 2.787337561400236e-07, "loss": 0.1132, "step": 92840 }, { "epoch": 2.1649383579610038, "grad_norm": 1.6553274393081665, "learning_rate": 2.7865603432195484e-07, "loss": 0.1087, "step": 92850 }, { "epoch": 2.1651715193378216, "grad_norm": 1.6972661018371582, "learning_rate": 2.785783125038861e-07, "loss": 0.1142, "step": 92860 }, { "epoch": 2.1654046807146394, "grad_norm": 1.2979058027267456, "learning_rate": 2.785005906858173e-07, "loss": 0.1014, "step": 92870 }, { "epoch": 2.1656378420914577, "grad_norm": 1.4822289943695068, "learning_rate": 2.784228688677485e-07, "loss": 0.0964, "step": 92880 }, { "epoch": 2.1658710034682755, "grad_norm": 1.0253161191940308, "learning_rate": 2.783451470496798e-07, "loss": 0.1192, "step": 92890 }, { "epoch": 2.1661041648450934, "grad_norm": 1.6858797073364258, "learning_rate": 2.78267425231611e-07, "loss": 0.1145, "step": 92900 }, { "epoch": 2.166337326221911, "grad_norm": 1.3336474895477295, "learning_rate": 2.7818970341354225e-07, "loss": 0.0983, "step": 92910 }, { "epoch": 2.1665704875987295, "grad_norm": 0.924937903881073, "learning_rate": 2.7811198159547346e-07, "loss": 0.1028, "step": 92920 }, { "epoch": 2.1668036489755473, "grad_norm": 1.1388814449310303, "learning_rate": 2.7803425977740467e-07, "loss": 0.1018, "step": 92930 }, { "epoch": 2.167036810352365, "grad_norm": 2.1501760482788086, "learning_rate": 2.7795653795933593e-07, "loss": 0.0952, "step": 92940 }, { "epoch": 2.167269971729183, "grad_norm": 3.635838508605957, "learning_rate": 2.778788161412672e-07, "loss": 0.1081, "step": 92950 }, { "epoch": 2.167503133106001, "grad_norm": 1.4513976573944092, "learning_rate": 2.7780109432319835e-07, "loss": 0.1026, "step": 92960 }, { "epoch": 2.167736294482819, "grad_norm": 1.5658386945724487, "learning_rate": 2.777233725051296e-07, "loss": 
0.104, "step": 92970 }, { "epoch": 2.167969455859637, "grad_norm": 1.8231244087219238, "learning_rate": 2.7764565068706087e-07, "loss": 0.0974, "step": 92980 }, { "epoch": 2.168202617236455, "grad_norm": 1.2012908458709717, "learning_rate": 2.775679288689921e-07, "loss": 0.1108, "step": 92990 }, { "epoch": 2.1684357786132726, "grad_norm": 1.274464726448059, "learning_rate": 2.774902070509233e-07, "loss": 0.0966, "step": 93000 }, { "epoch": 2.1686689399900905, "grad_norm": 1.4078902006149292, "learning_rate": 2.7741248523285455e-07, "loss": 0.1056, "step": 93010 }, { "epoch": 2.1689021013669088, "grad_norm": 1.6773370504379272, "learning_rate": 2.7733476341478576e-07, "loss": 0.0986, "step": 93020 }, { "epoch": 2.1691352627437266, "grad_norm": 1.3005574941635132, "learning_rate": 2.77257041596717e-07, "loss": 0.0954, "step": 93030 }, { "epoch": 2.1693684241205444, "grad_norm": 1.1457946300506592, "learning_rate": 2.7717931977864823e-07, "loss": 0.1044, "step": 93040 }, { "epoch": 2.1696015854973623, "grad_norm": 3.113339424133301, "learning_rate": 2.7710159796057944e-07, "loss": 0.1024, "step": 93050 }, { "epoch": 2.16983474687418, "grad_norm": 1.1096577644348145, "learning_rate": 2.770238761425107e-07, "loss": 0.1068, "step": 93060 }, { "epoch": 2.1700679082509984, "grad_norm": 1.2548877000808716, "learning_rate": 2.7694615432444197e-07, "loss": 0.0993, "step": 93070 }, { "epoch": 2.170301069627816, "grad_norm": 1.1582529544830322, "learning_rate": 2.768684325063732e-07, "loss": 0.1083, "step": 93080 }, { "epoch": 2.170534231004634, "grad_norm": 2.7510621547698975, "learning_rate": 2.767907106883044e-07, "loss": 0.103, "step": 93090 }, { "epoch": 2.170767392381452, "grad_norm": 1.300796627998352, "learning_rate": 2.7671298887023565e-07, "loss": 0.1088, "step": 93100 }, { "epoch": 2.1710005537582697, "grad_norm": 2.7094085216522217, "learning_rate": 2.766352670521669e-07, "loss": 0.1053, "step": 93110 }, { "epoch": 2.171233715135088, "grad_norm": 1.4588608741760254, 
"learning_rate": 2.7655754523409807e-07, "loss": 0.1026, "step": 93120 }, { "epoch": 2.171466876511906, "grad_norm": 1.016488790512085, "learning_rate": 2.7647982341602933e-07, "loss": 0.0882, "step": 93130 }, { "epoch": 2.1717000378887237, "grad_norm": 2.1109423637390137, "learning_rate": 2.764021015979606e-07, "loss": 0.1117, "step": 93140 }, { "epoch": 2.1719331992655415, "grad_norm": 1.2672441005706787, "learning_rate": 2.763243797798918e-07, "loss": 0.1051, "step": 93150 }, { "epoch": 2.17216636064236, "grad_norm": 1.907701015472412, "learning_rate": 2.76246657961823e-07, "loss": 0.1025, "step": 93160 }, { "epoch": 2.1723995220191776, "grad_norm": 1.9146339893341064, "learning_rate": 2.7616893614375427e-07, "loss": 0.0971, "step": 93170 }, { "epoch": 2.1726326833959955, "grad_norm": 1.3468542098999023, "learning_rate": 2.760912143256855e-07, "loss": 0.0969, "step": 93180 }, { "epoch": 2.1728658447728133, "grad_norm": 1.2518408298492432, "learning_rate": 2.7601349250761674e-07, "loss": 0.1027, "step": 93190 }, { "epoch": 2.173099006149631, "grad_norm": 1.1626688241958618, "learning_rate": 2.7593577068954795e-07, "loss": 0.0931, "step": 93200 }, { "epoch": 2.1733321675264494, "grad_norm": 1.4769517183303833, "learning_rate": 2.7585804887147916e-07, "loss": 0.0926, "step": 93210 }, { "epoch": 2.1735653289032673, "grad_norm": 2.346435546875, "learning_rate": 2.757803270534104e-07, "loss": 0.1051, "step": 93220 }, { "epoch": 2.173798490280085, "grad_norm": 1.295487642288208, "learning_rate": 2.757026052353417e-07, "loss": 0.1113, "step": 93230 }, { "epoch": 2.174031651656903, "grad_norm": 1.621248722076416, "learning_rate": 2.7562488341727284e-07, "loss": 0.1086, "step": 93240 }, { "epoch": 2.1742648130337208, "grad_norm": 1.5265865325927734, "learning_rate": 2.755471615992041e-07, "loss": 0.1034, "step": 93250 }, { "epoch": 2.174497974410539, "grad_norm": 1.4863113164901733, "learning_rate": 2.7546943978113537e-07, "loss": 0.104, "step": 93260 }, { "epoch": 
2.174731135787357, "grad_norm": 1.284976840019226, "learning_rate": 2.753917179630666e-07, "loss": 0.1107, "step": 93270 }, { "epoch": 2.1749642971641747, "grad_norm": 1.8234449625015259, "learning_rate": 2.753139961449978e-07, "loss": 0.0968, "step": 93280 }, { "epoch": 2.1751974585409926, "grad_norm": 1.652146339416504, "learning_rate": 2.7523627432692905e-07, "loss": 0.1035, "step": 93290 }, { "epoch": 2.175430619917811, "grad_norm": 3.9311962127685547, "learning_rate": 2.7515855250886026e-07, "loss": 0.1041, "step": 93300 }, { "epoch": 2.1756637812946287, "grad_norm": 2.7049922943115234, "learning_rate": 2.750808306907915e-07, "loss": 0.1104, "step": 93310 }, { "epoch": 2.1758969426714465, "grad_norm": 1.501368761062622, "learning_rate": 2.7500310887272273e-07, "loss": 0.1011, "step": 93320 }, { "epoch": 2.1761301040482643, "grad_norm": 1.1289664506912231, "learning_rate": 2.7492538705465394e-07, "loss": 0.0949, "step": 93330 }, { "epoch": 2.176363265425082, "grad_norm": 3.584758996963501, "learning_rate": 2.748476652365852e-07, "loss": 0.0947, "step": 93340 }, { "epoch": 2.1765964268019005, "grad_norm": 1.3527497053146362, "learning_rate": 2.7476994341851646e-07, "loss": 0.1014, "step": 93350 }, { "epoch": 2.1768295881787183, "grad_norm": 2.3712100982666016, "learning_rate": 2.746922216004476e-07, "loss": 0.1158, "step": 93360 }, { "epoch": 2.177062749555536, "grad_norm": 2.3575360774993896, "learning_rate": 2.746144997823789e-07, "loss": 0.1019, "step": 93370 }, { "epoch": 2.177295910932354, "grad_norm": 1.7332706451416016, "learning_rate": 2.7453677796431014e-07, "loss": 0.1074, "step": 93380 }, { "epoch": 2.177529072309172, "grad_norm": 1.041153907775879, "learning_rate": 2.7445905614624135e-07, "loss": 0.0978, "step": 93390 }, { "epoch": 2.17776223368599, "grad_norm": 1.2637100219726562, "learning_rate": 2.7438133432817256e-07, "loss": 0.0944, "step": 93400 }, { "epoch": 2.177995395062808, "grad_norm": 1.362696886062622, "learning_rate": 
2.743036125101038e-07, "loss": 0.0973, "step": 93410 }, { "epoch": 2.1782285564396258, "grad_norm": 1.528143048286438, "learning_rate": 2.7422589069203503e-07, "loss": 0.1172, "step": 93420 }, { "epoch": 2.1784617178164436, "grad_norm": 1.4823172092437744, "learning_rate": 2.741481688739663e-07, "loss": 0.0945, "step": 93430 }, { "epoch": 2.1786948791932614, "grad_norm": 1.2094581127166748, "learning_rate": 2.7407044705589756e-07, "loss": 0.0973, "step": 93440 }, { "epoch": 2.1789280405700797, "grad_norm": 1.2999553680419922, "learning_rate": 2.739927252378287e-07, "loss": 0.1023, "step": 93450 }, { "epoch": 2.1791612019468976, "grad_norm": 1.939172625541687, "learning_rate": 2.7391500341976e-07, "loss": 0.1014, "step": 93460 }, { "epoch": 2.1793943633237154, "grad_norm": 1.7727632522583008, "learning_rate": 2.7383728160169124e-07, "loss": 0.11, "step": 93470 }, { "epoch": 2.1796275247005332, "grad_norm": 2.461665630340576, "learning_rate": 2.737595597836225e-07, "loss": 0.1005, "step": 93480 }, { "epoch": 2.179860686077351, "grad_norm": 2.042919397354126, "learning_rate": 2.7368183796555365e-07, "loss": 0.1204, "step": 93490 }, { "epoch": 2.1800938474541693, "grad_norm": 1.3104581832885742, "learning_rate": 2.736041161474849e-07, "loss": 0.1179, "step": 93500 }, { "epoch": 2.180327008830987, "grad_norm": 1.4784021377563477, "learning_rate": 2.735263943294162e-07, "loss": 0.1086, "step": 93510 }, { "epoch": 2.180560170207805, "grad_norm": 1.2685106992721558, "learning_rate": 2.734486725113474e-07, "loss": 0.1012, "step": 93520 }, { "epoch": 2.180793331584623, "grad_norm": 1.5590888261795044, "learning_rate": 2.733709506932786e-07, "loss": 0.1, "step": 93530 }, { "epoch": 2.181026492961441, "grad_norm": 1.2678033113479614, "learning_rate": 2.7329322887520986e-07, "loss": 0.11, "step": 93540 }, { "epoch": 2.181259654338259, "grad_norm": 1.0628281831741333, "learning_rate": 2.7321550705714107e-07, "loss": 0.0967, "step": 93550 }, { "epoch": 2.181492815715077, 
"grad_norm": 1.586859107017517, "learning_rate": 2.7313778523907233e-07, "loss": 0.1009, "step": 93560 }, { "epoch": 2.1817259770918946, "grad_norm": 1.3174484968185425, "learning_rate": 2.7306006342100354e-07, "loss": 0.1091, "step": 93570 }, { "epoch": 2.1819591384687125, "grad_norm": 1.2935194969177246, "learning_rate": 2.7298234160293475e-07, "loss": 0.1248, "step": 93580 }, { "epoch": 2.1821922998455308, "grad_norm": 3.376845359802246, "learning_rate": 2.72904619784866e-07, "loss": 0.1153, "step": 93590 }, { "epoch": 2.1824254612223486, "grad_norm": 1.5505694150924683, "learning_rate": 2.728268979667973e-07, "loss": 0.1018, "step": 93600 }, { "epoch": 2.1826586225991664, "grad_norm": 1.2748358249664307, "learning_rate": 2.7274917614872843e-07, "loss": 0.1118, "step": 93610 }, { "epoch": 2.1828917839759843, "grad_norm": 1.9063979387283325, "learning_rate": 2.726714543306597e-07, "loss": 0.1024, "step": 93620 }, { "epoch": 2.183124945352802, "grad_norm": 1.1257076263427734, "learning_rate": 2.7259373251259095e-07, "loss": 0.1151, "step": 93630 }, { "epoch": 2.1833581067296204, "grad_norm": 1.632358431816101, "learning_rate": 2.7251601069452216e-07, "loss": 0.1055, "step": 93640 }, { "epoch": 2.183591268106438, "grad_norm": 1.5275580883026123, "learning_rate": 2.7243828887645337e-07, "loss": 0.0906, "step": 93650 }, { "epoch": 2.183824429483256, "grad_norm": 1.6159632205963135, "learning_rate": 2.7236056705838463e-07, "loss": 0.0967, "step": 93660 }, { "epoch": 2.184057590860074, "grad_norm": 1.7492468357086182, "learning_rate": 2.7228284524031584e-07, "loss": 0.0928, "step": 93670 }, { "epoch": 2.184290752236892, "grad_norm": 1.874342918395996, "learning_rate": 2.722051234222471e-07, "loss": 0.1035, "step": 93680 }, { "epoch": 2.18452391361371, "grad_norm": 1.9257946014404297, "learning_rate": 2.721274016041783e-07, "loss": 0.1034, "step": 93690 }, { "epoch": 2.184757074990528, "grad_norm": 1.1865234375, "learning_rate": 2.720496797861095e-07, "loss": 0.1114, 
"step": 93700 }, { "epoch": 2.1849902363673457, "grad_norm": 2.248584508895874, "learning_rate": 2.719719579680408e-07, "loss": 0.1154, "step": 93710 }, { "epoch": 2.1852233977441635, "grad_norm": 1.2572643756866455, "learning_rate": 2.7189423614997205e-07, "loss": 0.0998, "step": 93720 }, { "epoch": 2.185456559120982, "grad_norm": 1.0938893556594849, "learning_rate": 2.718165143319032e-07, "loss": 0.1034, "step": 93730 }, { "epoch": 2.1856897204977996, "grad_norm": 1.2752256393432617, "learning_rate": 2.7173879251383447e-07, "loss": 0.1082, "step": 93740 }, { "epoch": 2.1859228818746175, "grad_norm": 1.1927605867385864, "learning_rate": 2.7166107069576573e-07, "loss": 0.1072, "step": 93750 }, { "epoch": 2.1861560432514353, "grad_norm": 1.2348384857177734, "learning_rate": 2.7158334887769694e-07, "loss": 0.1052, "step": 93760 }, { "epoch": 2.186389204628253, "grad_norm": 1.744896650314331, "learning_rate": 2.7150562705962815e-07, "loss": 0.1084, "step": 93770 }, { "epoch": 2.1866223660050714, "grad_norm": 1.4544811248779297, "learning_rate": 2.714279052415594e-07, "loss": 0.115, "step": 93780 }, { "epoch": 2.1868555273818893, "grad_norm": 2.536658525466919, "learning_rate": 2.713501834234906e-07, "loss": 0.1166, "step": 93790 }, { "epoch": 2.187088688758707, "grad_norm": 1.7110092639923096, "learning_rate": 2.712724616054219e-07, "loss": 0.1058, "step": 93800 }, { "epoch": 2.187321850135525, "grad_norm": 1.1633946895599365, "learning_rate": 2.711947397873531e-07, "loss": 0.1057, "step": 93810 }, { "epoch": 2.1875550115123428, "grad_norm": 1.7547011375427246, "learning_rate": 2.711170179692843e-07, "loss": 0.1062, "step": 93820 }, { "epoch": 2.187788172889161, "grad_norm": 2.0329833030700684, "learning_rate": 2.7103929615121556e-07, "loss": 0.0976, "step": 93830 }, { "epoch": 2.188021334265979, "grad_norm": 0.9902629256248474, "learning_rate": 2.709615743331468e-07, "loss": 0.0991, "step": 93840 }, { "epoch": 2.1882544956427967, "grad_norm": 1.1916769742965698, 
"learning_rate": 2.70883852515078e-07, "loss": 0.1081, "step": 93850 }, { "epoch": 2.1884876570196146, "grad_norm": 1.1273843050003052, "learning_rate": 2.7080613069700924e-07, "loss": 0.1033, "step": 93860 }, { "epoch": 2.1887208183964324, "grad_norm": 1.3810133934020996, "learning_rate": 2.707284088789405e-07, "loss": 0.1005, "step": 93870 }, { "epoch": 2.1889539797732507, "grad_norm": 1.4845781326293945, "learning_rate": 2.7065068706087177e-07, "loss": 0.1084, "step": 93880 }, { "epoch": 2.1891871411500685, "grad_norm": 1.25413179397583, "learning_rate": 2.705729652428029e-07, "loss": 0.0993, "step": 93890 }, { "epoch": 2.1894203025268864, "grad_norm": 1.3726930618286133, "learning_rate": 2.704952434247342e-07, "loss": 0.0972, "step": 93900 }, { "epoch": 2.189653463903704, "grad_norm": 1.365344524383545, "learning_rate": 2.7041752160666545e-07, "loss": 0.1027, "step": 93910 }, { "epoch": 2.1898866252805225, "grad_norm": 1.0356756448745728, "learning_rate": 2.7033979978859666e-07, "loss": 0.1022, "step": 93920 }, { "epoch": 2.1901197866573403, "grad_norm": 1.7188103199005127, "learning_rate": 2.7026207797052787e-07, "loss": 0.1046, "step": 93930 }, { "epoch": 2.190352948034158, "grad_norm": 1.8660235404968262, "learning_rate": 2.7018435615245913e-07, "loss": 0.1078, "step": 93940 }, { "epoch": 2.190586109410976, "grad_norm": 1.3941048383712769, "learning_rate": 2.7010663433439034e-07, "loss": 0.0889, "step": 93950 }, { "epoch": 2.190819270787794, "grad_norm": 1.5048227310180664, "learning_rate": 2.700289125163216e-07, "loss": 0.1062, "step": 93960 }, { "epoch": 2.191052432164612, "grad_norm": 1.4567655324935913, "learning_rate": 2.699511906982528e-07, "loss": 0.0918, "step": 93970 }, { "epoch": 2.19128559354143, "grad_norm": 1.8259960412979126, "learning_rate": 2.69873468880184e-07, "loss": 0.099, "step": 93980 }, { "epoch": 2.1915187549182478, "grad_norm": 1.1677688360214233, "learning_rate": 2.697957470621153e-07, "loss": 0.0968, "step": 93990 }, { "epoch": 
2.1917519162950656, "grad_norm": 1.1896194219589233, "learning_rate": 2.6971802524404654e-07, "loss": 0.1064, "step": 94000 }, { "epoch": 2.191985077671884, "grad_norm": 1.300724744796753, "learning_rate": 2.696403034259777e-07, "loss": 0.1134, "step": 94010 }, { "epoch": 2.1922182390487017, "grad_norm": 1.2769372463226318, "learning_rate": 2.6956258160790896e-07, "loss": 0.1133, "step": 94020 }, { "epoch": 2.1924514004255196, "grad_norm": 1.4441814422607422, "learning_rate": 2.694848597898402e-07, "loss": 0.1161, "step": 94030 }, { "epoch": 2.1926845618023374, "grad_norm": 1.0897241830825806, "learning_rate": 2.694149101535783e-07, "loss": 0.1043, "step": 94040 }, { "epoch": 2.1929177231791552, "grad_norm": 1.3527015447616577, "learning_rate": 2.6933718833550956e-07, "loss": 0.1087, "step": 94050 }, { "epoch": 2.1931508845559735, "grad_norm": 1.4484213590621948, "learning_rate": 2.6925946651744077e-07, "loss": 0.1042, "step": 94060 }, { "epoch": 2.1933840459327913, "grad_norm": 1.131019949913025, "learning_rate": 2.69181744699372e-07, "loss": 0.1057, "step": 94070 }, { "epoch": 2.193617207309609, "grad_norm": 1.5684868097305298, "learning_rate": 2.6910402288130324e-07, "loss": 0.1107, "step": 94080 }, { "epoch": 2.193850368686427, "grad_norm": 3.6960718631744385, "learning_rate": 2.690263010632345e-07, "loss": 0.1276, "step": 94090 }, { "epoch": 2.194083530063245, "grad_norm": 1.4218655824661255, "learning_rate": 2.6894857924516566e-07, "loss": 0.1171, "step": 94100 }, { "epoch": 2.194316691440063, "grad_norm": 1.632529616355896, "learning_rate": 2.688708574270969e-07, "loss": 0.0943, "step": 94110 }, { "epoch": 2.194549852816881, "grad_norm": 1.6346428394317627, "learning_rate": 2.687931356090282e-07, "loss": 0.1032, "step": 94120 }, { "epoch": 2.194783014193699, "grad_norm": 2.129775047302246, "learning_rate": 2.687154137909594e-07, "loss": 0.1033, "step": 94130 }, { "epoch": 2.1950161755705166, "grad_norm": 2.4161365032196045, "learning_rate": 
2.686376919728906e-07, "loss": 0.1069, "step": 94140 }, { "epoch": 2.1952493369473345, "grad_norm": 2.304093837738037, "learning_rate": 2.6855997015482186e-07, "loss": 0.1057, "step": 94150 }, { "epoch": 2.1954824983241528, "grad_norm": 1.3868770599365234, "learning_rate": 2.6848224833675307e-07, "loss": 0.0983, "step": 94160 }, { "epoch": 2.1957156597009706, "grad_norm": 1.4582674503326416, "learning_rate": 2.6840452651868433e-07, "loss": 0.1116, "step": 94170 }, { "epoch": 2.1959488210777884, "grad_norm": 1.4318667650222778, "learning_rate": 2.6832680470061554e-07, "loss": 0.1069, "step": 94180 }, { "epoch": 2.1961819824546063, "grad_norm": 3.6779630184173584, "learning_rate": 2.6824908288254675e-07, "loss": 0.0955, "step": 94190 }, { "epoch": 2.196415143831424, "grad_norm": 2.8428502082824707, "learning_rate": 2.68171361064478e-07, "loss": 0.1128, "step": 94200 }, { "epoch": 2.1966483052082424, "grad_norm": 1.1322425603866577, "learning_rate": 2.680936392464093e-07, "loss": 0.1124, "step": 94210 }, { "epoch": 2.1968814665850602, "grad_norm": 3.099823236465454, "learning_rate": 2.6801591742834043e-07, "loss": 0.1043, "step": 94220 }, { "epoch": 2.197114627961878, "grad_norm": 2.320918083190918, "learning_rate": 2.679381956102717e-07, "loss": 0.1062, "step": 94230 }, { "epoch": 2.197347789338696, "grad_norm": 1.2509111166000366, "learning_rate": 2.6786047379220296e-07, "loss": 0.1086, "step": 94240 }, { "epoch": 2.197580950715514, "grad_norm": 1.9444093704223633, "learning_rate": 2.6778275197413416e-07, "loss": 0.1067, "step": 94250 }, { "epoch": 2.197814112092332, "grad_norm": 1.0756274461746216, "learning_rate": 2.677050301560654e-07, "loss": 0.1, "step": 94260 }, { "epoch": 2.19804727346915, "grad_norm": 2.0282931327819824, "learning_rate": 2.6762730833799664e-07, "loss": 0.1121, "step": 94270 }, { "epoch": 2.1982804348459677, "grad_norm": 1.263135552406311, "learning_rate": 2.675495865199279e-07, "loss": 0.0857, "step": 94280 }, { "epoch": 2.1985135962227855, 
"grad_norm": 2.6142196655273438, "learning_rate": 2.674718647018591e-07, "loss": 0.1128, "step": 94290 }, { "epoch": 2.198746757599604, "grad_norm": 1.5112700462341309, "learning_rate": 2.673941428837903e-07, "loss": 0.098, "step": 94300 }, { "epoch": 2.1989799189764216, "grad_norm": 1.3145272731781006, "learning_rate": 2.673164210657216e-07, "loss": 0.1002, "step": 94310 }, { "epoch": 2.1992130803532395, "grad_norm": 1.905842900276184, "learning_rate": 2.672386992476528e-07, "loss": 0.1072, "step": 94320 }, { "epoch": 2.1994462417300573, "grad_norm": 1.6349880695343018, "learning_rate": 2.6716097742958405e-07, "loss": 0.1082, "step": 94330 }, { "epoch": 2.199679403106875, "grad_norm": 1.4942439794540405, "learning_rate": 2.6708325561151526e-07, "loss": 0.111, "step": 94340 }, { "epoch": 2.1999125644836934, "grad_norm": 1.3843927383422852, "learning_rate": 2.6700553379344647e-07, "loss": 0.1048, "step": 94350 }, { "epoch": 2.2001457258605113, "grad_norm": 1.4492242336273193, "learning_rate": 2.6692781197537773e-07, "loss": 0.0943, "step": 94360 }, { "epoch": 2.200378887237329, "grad_norm": 2.0826611518859863, "learning_rate": 2.66850090157309e-07, "loss": 0.1095, "step": 94370 }, { "epoch": 2.200612048614147, "grad_norm": 2.212070941925049, "learning_rate": 2.6677236833924015e-07, "loss": 0.1102, "step": 94380 }, { "epoch": 2.2008452099909652, "grad_norm": 1.3775728940963745, "learning_rate": 2.666946465211714e-07, "loss": 0.1133, "step": 94390 }, { "epoch": 2.201078371367783, "grad_norm": 1.345597505569458, "learning_rate": 2.666169247031027e-07, "loss": 0.1101, "step": 94400 }, { "epoch": 2.201311532744601, "grad_norm": 2.4646522998809814, "learning_rate": 2.665392028850339e-07, "loss": 0.1042, "step": 94410 }, { "epoch": 2.2015446941214187, "grad_norm": 1.331559181213379, "learning_rate": 2.664614810669651e-07, "loss": 0.1093, "step": 94420 }, { "epoch": 2.2017778554982366, "grad_norm": 1.4613521099090576, "learning_rate": 2.6638375924889635e-07, "loss": 0.1035, 
"step": 94430 }, { "epoch": 2.202011016875055, "grad_norm": 1.6615737676620483, "learning_rate": 2.6630603743082756e-07, "loss": 0.0984, "step": 94440 }, { "epoch": 2.2022441782518727, "grad_norm": 1.360853910446167, "learning_rate": 2.662283156127588e-07, "loss": 0.0973, "step": 94450 }, { "epoch": 2.2024773396286905, "grad_norm": 3.357130527496338, "learning_rate": 2.6615059379469003e-07, "loss": 0.1099, "step": 94460 }, { "epoch": 2.2027105010055084, "grad_norm": 1.8835208415985107, "learning_rate": 2.6607287197662124e-07, "loss": 0.1021, "step": 94470 }, { "epoch": 2.202943662382326, "grad_norm": 1.5124977827072144, "learning_rate": 2.659951501585525e-07, "loss": 0.1104, "step": 94480 }, { "epoch": 2.2031768237591445, "grad_norm": 2.7493910789489746, "learning_rate": 2.6591742834048377e-07, "loss": 0.1, "step": 94490 }, { "epoch": 2.2034099851359623, "grad_norm": 3.0106725692749023, "learning_rate": 2.658397065224149e-07, "loss": 0.1044, "step": 94500 }, { "epoch": 2.20364314651278, "grad_norm": 1.2976258993148804, "learning_rate": 2.657619847043462e-07, "loss": 0.1028, "step": 94510 }, { "epoch": 2.203876307889598, "grad_norm": 1.719996690750122, "learning_rate": 2.6568426288627745e-07, "loss": 0.1075, "step": 94520 }, { "epoch": 2.204109469266416, "grad_norm": 2.8998289108276367, "learning_rate": 2.6560654106820866e-07, "loss": 0.1062, "step": 94530 }, { "epoch": 2.204342630643234, "grad_norm": 2.640995740890503, "learning_rate": 2.6552881925013987e-07, "loss": 0.0987, "step": 94540 }, { "epoch": 2.204575792020052, "grad_norm": 1.4062691926956177, "learning_rate": 2.6545109743207113e-07, "loss": 0.11, "step": 94550 }, { "epoch": 2.2048089533968698, "grad_norm": 1.180954098701477, "learning_rate": 2.6537337561400234e-07, "loss": 0.0945, "step": 94560 }, { "epoch": 2.2050421147736876, "grad_norm": 1.5779260396957397, "learning_rate": 2.652956537959336e-07, "loss": 0.1016, "step": 94570 }, { "epoch": 2.2052752761505054, "grad_norm": 1.1708976030349731, 
"learning_rate": 2.652179319778648e-07, "loss": 0.1065, "step": 94580 }, { "epoch": 2.2055084375273237, "grad_norm": 1.5677648782730103, "learning_rate": 2.65140210159796e-07, "loss": 0.1055, "step": 94590 }, { "epoch": 2.2057415989041416, "grad_norm": 1.8582749366760254, "learning_rate": 2.650624883417273e-07, "loss": 0.1181, "step": 94600 }, { "epoch": 2.2059747602809594, "grad_norm": 1.606432557106018, "learning_rate": 2.6498476652365854e-07, "loss": 0.1026, "step": 94610 }, { "epoch": 2.2062079216577772, "grad_norm": 1.7696633338928223, "learning_rate": 2.649070447055897e-07, "loss": 0.1058, "step": 94620 }, { "epoch": 2.2064410830345955, "grad_norm": 1.1960572004318237, "learning_rate": 2.6482932288752096e-07, "loss": 0.1088, "step": 94630 }, { "epoch": 2.2066742444114134, "grad_norm": 1.4317307472229004, "learning_rate": 2.647516010694522e-07, "loss": 0.0998, "step": 94640 }, { "epoch": 2.206907405788231, "grad_norm": 1.5111335515975952, "learning_rate": 2.6467387925138343e-07, "loss": 0.0986, "step": 94650 }, { "epoch": 2.207140567165049, "grad_norm": 1.1837340593338013, "learning_rate": 2.6459615743331464e-07, "loss": 0.1022, "step": 94660 }, { "epoch": 2.207373728541867, "grad_norm": 2.8430559635162354, "learning_rate": 2.645184356152459e-07, "loss": 0.113, "step": 94670 }, { "epoch": 2.207606889918685, "grad_norm": 1.2413787841796875, "learning_rate": 2.644407137971771e-07, "loss": 0.0973, "step": 94680 }, { "epoch": 2.207840051295503, "grad_norm": 1.8428922891616821, "learning_rate": 2.643629919791084e-07, "loss": 0.1113, "step": 94690 }, { "epoch": 2.208073212672321, "grad_norm": 1.5374410152435303, "learning_rate": 2.642852701610396e-07, "loss": 0.1053, "step": 94700 }, { "epoch": 2.2083063740491387, "grad_norm": 1.0800834894180298, "learning_rate": 2.642075483429708e-07, "loss": 0.1082, "step": 94710 }, { "epoch": 2.2085395354259565, "grad_norm": 2.339108943939209, "learning_rate": 2.6412982652490206e-07, "loss": 0.1165, "step": 94720 }, { "epoch": 
2.2087726968027748, "grad_norm": 1.4527372121810913, "learning_rate": 2.640521047068333e-07, "loss": 0.1149, "step": 94730 }, { "epoch": 2.2090058581795926, "grad_norm": 1.1075506210327148, "learning_rate": 2.6397438288876453e-07, "loss": 0.1028, "step": 94740 }, { "epoch": 2.2092390195564104, "grad_norm": 1.8165438175201416, "learning_rate": 2.6389666107069574e-07, "loss": 0.1039, "step": 94750 }, { "epoch": 2.2094721809332283, "grad_norm": 1.192159652709961, "learning_rate": 2.63818939252627e-07, "loss": 0.1005, "step": 94760 }, { "epoch": 2.2097053423100466, "grad_norm": 2.476489305496216, "learning_rate": 2.6374121743455826e-07, "loss": 0.1016, "step": 94770 }, { "epoch": 2.2099385036868644, "grad_norm": 2.9036717414855957, "learning_rate": 2.636634956164894e-07, "loss": 0.1145, "step": 94780 }, { "epoch": 2.2101716650636822, "grad_norm": 1.3617706298828125, "learning_rate": 2.635857737984207e-07, "loss": 0.1112, "step": 94790 }, { "epoch": 2.2104048264405, "grad_norm": 2.5795252323150635, "learning_rate": 2.6350805198035194e-07, "loss": 0.1015, "step": 94800 }, { "epoch": 2.210637987817318, "grad_norm": 1.5713187456130981, "learning_rate": 2.6343033016228315e-07, "loss": 0.1015, "step": 94810 }, { "epoch": 2.210871149194136, "grad_norm": 2.919691324234009, "learning_rate": 2.6335260834421436e-07, "loss": 0.1023, "step": 94820 }, { "epoch": 2.211104310570954, "grad_norm": 1.3828092813491821, "learning_rate": 2.632748865261456e-07, "loss": 0.1024, "step": 94830 }, { "epoch": 2.211337471947772, "grad_norm": 2.203688383102417, "learning_rate": 2.6319716470807683e-07, "loss": 0.1045, "step": 94840 }, { "epoch": 2.2115706333245897, "grad_norm": 3.238863706588745, "learning_rate": 2.631194428900081e-07, "loss": 0.0998, "step": 94850 }, { "epoch": 2.2118037947014075, "grad_norm": 1.4001861810684204, "learning_rate": 2.630417210719393e-07, "loss": 0.0983, "step": 94860 }, { "epoch": 2.212036956078226, "grad_norm": 2.193070411682129, "learning_rate": 
2.629717714356774e-07, "loss": 0.1033, "step": 94870 }, { "epoch": 2.2122701174550436, "grad_norm": 1.914845585823059, "learning_rate": 2.6289404961760864e-07, "loss": 0.1068, "step": 94880 }, { "epoch": 2.2125032788318615, "grad_norm": 1.2647227048873901, "learning_rate": 2.628163277995399e-07, "loss": 0.1118, "step": 94890 }, { "epoch": 2.2127364402086793, "grad_norm": 1.1411818265914917, "learning_rate": 2.627386059814711e-07, "loss": 0.1036, "step": 94900 }, { "epoch": 2.212969601585497, "grad_norm": 1.3534128665924072, "learning_rate": 2.626608841634023e-07, "loss": 0.1085, "step": 94910 }, { "epoch": 2.2132027629623154, "grad_norm": 1.4015789031982422, "learning_rate": 2.625831623453336e-07, "loss": 0.0985, "step": 94920 }, { "epoch": 2.2134359243391333, "grad_norm": 3.3844869136810303, "learning_rate": 2.625054405272648e-07, "loss": 0.1211, "step": 94930 }, { "epoch": 2.213669085715951, "grad_norm": 2.2561140060424805, "learning_rate": 2.6242771870919605e-07, "loss": 0.1184, "step": 94940 }, { "epoch": 2.213902247092769, "grad_norm": 1.2471545934677124, "learning_rate": 2.6234999689112726e-07, "loss": 0.1099, "step": 94950 }, { "epoch": 2.214135408469587, "grad_norm": 1.3191272020339966, "learning_rate": 2.6227227507305847e-07, "loss": 0.107, "step": 94960 }, { "epoch": 2.214368569846405, "grad_norm": 1.5067768096923828, "learning_rate": 2.6219455325498973e-07, "loss": 0.0981, "step": 94970 }, { "epoch": 2.214601731223223, "grad_norm": 2.807070255279541, "learning_rate": 2.62116831436921e-07, "loss": 0.1027, "step": 94980 }, { "epoch": 2.2148348926000407, "grad_norm": 2.0877761840820312, "learning_rate": 2.6203910961885215e-07, "loss": 0.092, "step": 94990 }, { "epoch": 2.2150680539768586, "grad_norm": 1.1191315650939941, "learning_rate": 2.619613878007834e-07, "loss": 0.0917, "step": 95000 }, { "epoch": 2.215301215353677, "grad_norm": 2.0670244693756104, "learning_rate": 2.618836659827147e-07, "loss": 0.1081, "step": 95010 }, { "epoch": 2.2155343767304947, 
"grad_norm": 1.7179453372955322, "learning_rate": 2.618059441646459e-07, "loss": 0.1117, "step": 95020 }, { "epoch": 2.2157675381073125, "grad_norm": 1.2471797466278076, "learning_rate": 2.617282223465771e-07, "loss": 0.1084, "step": 95030 }, { "epoch": 2.2160006994841304, "grad_norm": 2.5723962783813477, "learning_rate": 2.6165050052850836e-07, "loss": 0.1059, "step": 95040 }, { "epoch": 2.216233860860948, "grad_norm": 1.77978515625, "learning_rate": 2.6157277871043956e-07, "loss": 0.0937, "step": 95050 }, { "epoch": 2.2164670222377665, "grad_norm": 2.098991870880127, "learning_rate": 2.6149505689237083e-07, "loss": 0.1106, "step": 95060 }, { "epoch": 2.2167001836145843, "grad_norm": 1.2375895977020264, "learning_rate": 2.6141733507430204e-07, "loss": 0.0924, "step": 95070 }, { "epoch": 2.216933344991402, "grad_norm": 1.0929293632507324, "learning_rate": 2.6133961325623325e-07, "loss": 0.1013, "step": 95080 }, { "epoch": 2.21716650636822, "grad_norm": 1.5225269794464111, "learning_rate": 2.612618914381645e-07, "loss": 0.1079, "step": 95090 }, { "epoch": 2.217399667745038, "grad_norm": 1.4119949340820312, "learning_rate": 2.6118416962009577e-07, "loss": 0.1032, "step": 95100 }, { "epoch": 2.217632829121856, "grad_norm": 1.3654274940490723, "learning_rate": 2.611064478020269e-07, "loss": 0.1079, "step": 95110 }, { "epoch": 2.217865990498674, "grad_norm": 2.7300448417663574, "learning_rate": 2.610287259839582e-07, "loss": 0.0997, "step": 95120 }, { "epoch": 2.218099151875492, "grad_norm": 1.3575576543807983, "learning_rate": 2.6095100416588945e-07, "loss": 0.1025, "step": 95130 }, { "epoch": 2.2183323132523096, "grad_norm": 3.9285213947296143, "learning_rate": 2.608732823478207e-07, "loss": 0.0981, "step": 95140 }, { "epoch": 2.218565474629128, "grad_norm": 3.449493646621704, "learning_rate": 2.6079556052975187e-07, "loss": 0.1011, "step": 95150 }, { "epoch": 2.2187986360059457, "grad_norm": 1.872119426727295, "learning_rate": 2.6071783871168313e-07, "loss": 0.0915, 
"step": 95160 }, { "epoch": 2.2190317973827636, "grad_norm": 1.7778812646865845, "learning_rate": 2.606401168936144e-07, "loss": 0.1088, "step": 95170 }, { "epoch": 2.2192649587595814, "grad_norm": 1.9045753479003906, "learning_rate": 2.605623950755456e-07, "loss": 0.0989, "step": 95180 }, { "epoch": 2.2194981201363992, "grad_norm": 1.724150538444519, "learning_rate": 2.604846732574768e-07, "loss": 0.1049, "step": 95190 }, { "epoch": 2.2197312815132175, "grad_norm": 2.1608316898345947, "learning_rate": 2.604069514394081e-07, "loss": 0.1044, "step": 95200 }, { "epoch": 2.2199644428900354, "grad_norm": 1.2636724710464478, "learning_rate": 2.603292296213393e-07, "loss": 0.0997, "step": 95210 }, { "epoch": 2.220197604266853, "grad_norm": 3.0892181396484375, "learning_rate": 2.6025150780327055e-07, "loss": 0.0932, "step": 95220 }, { "epoch": 2.220430765643671, "grad_norm": 1.5673820972442627, "learning_rate": 2.6017378598520175e-07, "loss": 0.0986, "step": 95230 }, { "epoch": 2.220663927020489, "grad_norm": 3.3055756092071533, "learning_rate": 2.6009606416713296e-07, "loss": 0.1115, "step": 95240 }, { "epoch": 2.220897088397307, "grad_norm": 3.0227925777435303, "learning_rate": 2.600183423490642e-07, "loss": 0.1169, "step": 95250 }, { "epoch": 2.221130249774125, "grad_norm": 1.3837777376174927, "learning_rate": 2.599406205309955e-07, "loss": 0.1112, "step": 95260 }, { "epoch": 2.221363411150943, "grad_norm": 2.0051381587982178, "learning_rate": 2.5986289871292664e-07, "loss": 0.1014, "step": 95270 }, { "epoch": 2.2215965725277607, "grad_norm": 2.998523712158203, "learning_rate": 2.597851768948579e-07, "loss": 0.0995, "step": 95280 }, { "epoch": 2.2218297339045785, "grad_norm": 1.5009987354278564, "learning_rate": 2.5970745507678917e-07, "loss": 0.1123, "step": 95290 }, { "epoch": 2.2220628952813968, "grad_norm": 1.5310027599334717, "learning_rate": 2.596297332587204e-07, "loss": 0.11, "step": 95300 }, { "epoch": 2.2222960566582146, "grad_norm": 1.988446831703186, 
"learning_rate": 2.595520114406516e-07, "loss": 0.1063, "step": 95310 }, { "epoch": 2.2225292180350325, "grad_norm": 1.4503982067108154, "learning_rate": 2.5947428962258285e-07, "loss": 0.1016, "step": 95320 }, { "epoch": 2.2227623794118503, "grad_norm": 3.1641664505004883, "learning_rate": 2.5939656780451406e-07, "loss": 0.1131, "step": 95330 }, { "epoch": 2.222995540788668, "grad_norm": 1.441617727279663, "learning_rate": 2.593188459864453e-07, "loss": 0.0916, "step": 95340 }, { "epoch": 2.2232287021654864, "grad_norm": 1.2836010456085205, "learning_rate": 2.5924112416837653e-07, "loss": 0.1077, "step": 95350 }, { "epoch": 2.2234618635423042, "grad_norm": 1.1584453582763672, "learning_rate": 2.5916340235030774e-07, "loss": 0.0975, "step": 95360 }, { "epoch": 2.223695024919122, "grad_norm": 1.220633864402771, "learning_rate": 2.59085680532239e-07, "loss": 0.0996, "step": 95370 }, { "epoch": 2.22392818629594, "grad_norm": 1.366500973701477, "learning_rate": 2.5900795871417026e-07, "loss": 0.1068, "step": 95380 }, { "epoch": 2.224161347672758, "grad_norm": 1.8073511123657227, "learning_rate": 2.589302368961014e-07, "loss": 0.1115, "step": 95390 }, { "epoch": 2.224394509049576, "grad_norm": 1.4865564107894897, "learning_rate": 2.588525150780327e-07, "loss": 0.1057, "step": 95400 }, { "epoch": 2.224627670426394, "grad_norm": 1.5094560384750366, "learning_rate": 2.5877479325996394e-07, "loss": 0.0988, "step": 95410 }, { "epoch": 2.2248608318032117, "grad_norm": 1.3649349212646484, "learning_rate": 2.5869707144189515e-07, "loss": 0.1003, "step": 95420 }, { "epoch": 2.2250939931800295, "grad_norm": 1.1682125329971313, "learning_rate": 2.5861934962382636e-07, "loss": 0.0975, "step": 95430 }, { "epoch": 2.225327154556848, "grad_norm": 1.2129143476486206, "learning_rate": 2.585416278057576e-07, "loss": 0.1021, "step": 95440 }, { "epoch": 2.2255603159336657, "grad_norm": 3.6145401000976562, "learning_rate": 2.5846390598768883e-07, "loss": 0.0943, "step": 95450 }, { "epoch": 
2.2257934773104835, "grad_norm": 1.4078173637390137, "learning_rate": 2.583861841696201e-07, "loss": 0.0891, "step": 95460 }, { "epoch": 2.2260266386873013, "grad_norm": 2.127833366394043, "learning_rate": 2.583084623515513e-07, "loss": 0.1085, "step": 95470 }, { "epoch": 2.2262598000641196, "grad_norm": 3.0005288124084473, "learning_rate": 2.582307405334825e-07, "loss": 0.1044, "step": 95480 }, { "epoch": 2.2264929614409374, "grad_norm": 1.8080604076385498, "learning_rate": 2.581530187154138e-07, "loss": 0.1011, "step": 95490 }, { "epoch": 2.2267261228177553, "grad_norm": 1.8752774000167847, "learning_rate": 2.5807529689734504e-07, "loss": 0.1074, "step": 95500 }, { "epoch": 2.226959284194573, "grad_norm": 3.2243683338165283, "learning_rate": 2.579975750792762e-07, "loss": 0.1077, "step": 95510 }, { "epoch": 2.227192445571391, "grad_norm": 1.901036024093628, "learning_rate": 2.5791985326120746e-07, "loss": 0.1094, "step": 95520 }, { "epoch": 2.2274256069482092, "grad_norm": 2.136573314666748, "learning_rate": 2.578421314431387e-07, "loss": 0.0934, "step": 95530 }, { "epoch": 2.227658768325027, "grad_norm": 2.8656914234161377, "learning_rate": 2.5776440962507e-07, "loss": 0.0995, "step": 95540 }, { "epoch": 2.227891929701845, "grad_norm": 1.3144094944000244, "learning_rate": 2.5768668780700114e-07, "loss": 0.0941, "step": 95550 }, { "epoch": 2.2281250910786627, "grad_norm": 2.7755656242370605, "learning_rate": 2.576089659889324e-07, "loss": 0.1149, "step": 95560 }, { "epoch": 2.2283582524554806, "grad_norm": 1.4399751424789429, "learning_rate": 2.5753124417086366e-07, "loss": 0.0986, "step": 95570 }, { "epoch": 2.228591413832299, "grad_norm": 2.7605481147766113, "learning_rate": 2.5745352235279487e-07, "loss": 0.1184, "step": 95580 }, { "epoch": 2.2288245752091167, "grad_norm": 1.3317586183547974, "learning_rate": 2.573758005347261e-07, "loss": 0.0968, "step": 95590 }, { "epoch": 2.2290577365859345, "grad_norm": 1.815990924835205, "learning_rate": 
2.5729807871665734e-07, "loss": 0.1004, "step": 95600 }, { "epoch": 2.2292908979627524, "grad_norm": 3.4611544609069824, "learning_rate": 2.5722035689858855e-07, "loss": 0.1176, "step": 95610 }, { "epoch": 2.22952405933957, "grad_norm": 1.8038029670715332, "learning_rate": 2.571426350805198e-07, "loss": 0.1073, "step": 95620 }, { "epoch": 2.2297572207163885, "grad_norm": 2.4186856746673584, "learning_rate": 2.57064913262451e-07, "loss": 0.1114, "step": 95630 }, { "epoch": 2.2299903820932063, "grad_norm": 1.575736165046692, "learning_rate": 2.5698719144438223e-07, "loss": 0.1005, "step": 95640 }, { "epoch": 2.230223543470024, "grad_norm": 1.519853949546814, "learning_rate": 2.569094696263135e-07, "loss": 0.1095, "step": 95650 }, { "epoch": 2.230456704846842, "grad_norm": 2.425424575805664, "learning_rate": 2.5683174780824476e-07, "loss": 0.1139, "step": 95660 }, { "epoch": 2.23068986622366, "grad_norm": 1.8422157764434814, "learning_rate": 2.567540259901759e-07, "loss": 0.1053, "step": 95670 }, { "epoch": 2.230923027600478, "grad_norm": 1.274048089981079, "learning_rate": 2.566763041721072e-07, "loss": 0.1053, "step": 95680 }, { "epoch": 2.231156188977296, "grad_norm": 3.362576484680176, "learning_rate": 2.5659858235403844e-07, "loss": 0.1051, "step": 95690 }, { "epoch": 2.231389350354114, "grad_norm": 1.55478835105896, "learning_rate": 2.5652086053596965e-07, "loss": 0.1126, "step": 95700 }, { "epoch": 2.2316225117309316, "grad_norm": 1.424621820449829, "learning_rate": 2.5644313871790086e-07, "loss": 0.1111, "step": 95710 }, { "epoch": 2.23185567310775, "grad_norm": 1.2036153078079224, "learning_rate": 2.563654168998321e-07, "loss": 0.0972, "step": 95720 }, { "epoch": 2.2320888344845677, "grad_norm": 1.1368485689163208, "learning_rate": 2.5628769508176333e-07, "loss": 0.1107, "step": 95730 }, { "epoch": 2.2323219958613856, "grad_norm": 1.1205605268478394, "learning_rate": 2.562099732636946e-07, "loss": 0.1011, "step": 95740 }, { "epoch": 2.2325551572382034, 
"grad_norm": 1.6653882265090942, "learning_rate": 2.561322514456258e-07, "loss": 0.0959, "step": 95750 }, { "epoch": 2.2327883186150213, "grad_norm": 1.2437012195587158, "learning_rate": 2.56054529627557e-07, "loss": 0.1057, "step": 95760 }, { "epoch": 2.2330214799918395, "grad_norm": 1.7479119300842285, "learning_rate": 2.5597680780948827e-07, "loss": 0.1006, "step": 95770 }, { "epoch": 2.2332546413686574, "grad_norm": 2.069108247756958, "learning_rate": 2.5589908599141953e-07, "loss": 0.105, "step": 95780 }, { "epoch": 2.233487802745475, "grad_norm": 1.4221258163452148, "learning_rate": 2.558213641733507e-07, "loss": 0.0961, "step": 95790 }, { "epoch": 2.233720964122293, "grad_norm": 1.7585992813110352, "learning_rate": 2.5574364235528195e-07, "loss": 0.1001, "step": 95800 }, { "epoch": 2.233954125499111, "grad_norm": 2.031174659729004, "learning_rate": 2.556659205372132e-07, "loss": 0.1001, "step": 95810 }, { "epoch": 2.234187286875929, "grad_norm": 1.1637725830078125, "learning_rate": 2.555881987191444e-07, "loss": 0.1163, "step": 95820 }, { "epoch": 2.234420448252747, "grad_norm": 2.4678592681884766, "learning_rate": 2.5551047690107563e-07, "loss": 0.1081, "step": 95830 }, { "epoch": 2.234653609629565, "grad_norm": 2.0271496772766113, "learning_rate": 2.554327550830069e-07, "loss": 0.1033, "step": 95840 }, { "epoch": 2.2348867710063827, "grad_norm": 3.3469080924987793, "learning_rate": 2.553550332649381e-07, "loss": 0.1103, "step": 95850 }, { "epoch": 2.235119932383201, "grad_norm": 1.9567432403564453, "learning_rate": 2.5527731144686936e-07, "loss": 0.0985, "step": 95860 }, { "epoch": 2.235353093760019, "grad_norm": 1.3590203523635864, "learning_rate": 2.5519958962880057e-07, "loss": 0.0966, "step": 95870 }, { "epoch": 2.2355862551368366, "grad_norm": 1.0416041612625122, "learning_rate": 2.551218678107318e-07, "loss": 0.1045, "step": 95880 }, { "epoch": 2.2358194165136545, "grad_norm": 1.86635160446167, "learning_rate": 2.5504414599266304e-07, "loss": 0.1207, 
"step": 95890 }, { "epoch": 2.2360525778904723, "grad_norm": 1.0876545906066895, "learning_rate": 2.549664241745943e-07, "loss": 0.0976, "step": 95900 }, { "epoch": 2.2362857392672906, "grad_norm": 2.476583957672119, "learning_rate": 2.5488870235652546e-07, "loss": 0.1089, "step": 95910 }, { "epoch": 2.2365189006441084, "grad_norm": 2.543318510055542, "learning_rate": 2.548109805384567e-07, "loss": 0.1165, "step": 95920 }, { "epoch": 2.2367520620209262, "grad_norm": 2.599466562271118, "learning_rate": 2.54733258720388e-07, "loss": 0.1038, "step": 95930 }, { "epoch": 2.236985223397744, "grad_norm": 1.4201594591140747, "learning_rate": 2.5465553690231925e-07, "loss": 0.1094, "step": 95940 }, { "epoch": 2.237218384774562, "grad_norm": 1.525972843170166, "learning_rate": 2.545778150842504e-07, "loss": 0.1122, "step": 95950 }, { "epoch": 2.23745154615138, "grad_norm": 1.1807461977005005, "learning_rate": 2.5450009326618167e-07, "loss": 0.1084, "step": 95960 }, { "epoch": 2.237684707528198, "grad_norm": 3.303562879562378, "learning_rate": 2.5442237144811293e-07, "loss": 0.098, "step": 95970 }, { "epoch": 2.237917868905016, "grad_norm": 1.8400652408599854, "learning_rate": 2.5434464963004414e-07, "loss": 0.1083, "step": 95980 }, { "epoch": 2.2381510302818337, "grad_norm": 1.9405293464660645, "learning_rate": 2.5426692781197535e-07, "loss": 0.1129, "step": 95990 }, { "epoch": 2.2383841916586515, "grad_norm": 1.2944937944412231, "learning_rate": 2.541892059939066e-07, "loss": 0.1036, "step": 96000 }, { "epoch": 2.23861735303547, "grad_norm": 1.4391857385635376, "learning_rate": 2.541114841758378e-07, "loss": 0.1194, "step": 96010 }, { "epoch": 2.2388505144122877, "grad_norm": 1.4951266050338745, "learning_rate": 2.540337623577691e-07, "loss": 0.1093, "step": 96020 }, { "epoch": 2.2390836757891055, "grad_norm": 1.3573077917099, "learning_rate": 2.539560405397003e-07, "loss": 0.0964, "step": 96030 }, { "epoch": 2.2393168371659233, "grad_norm": 1.285455584526062, 
"learning_rate": 2.538783187216315e-07, "loss": 0.108, "step": 96040 }, { "epoch": 2.239549998542741, "grad_norm": 3.8701701164245605, "learning_rate": 2.5380059690356276e-07, "loss": 0.0995, "step": 96050 }, { "epoch": 2.2397831599195595, "grad_norm": 1.3795173168182373, "learning_rate": 2.53722875085494e-07, "loss": 0.1152, "step": 96060 }, { "epoch": 2.2400163212963773, "grad_norm": 2.634298324584961, "learning_rate": 2.536451532674252e-07, "loss": 0.104, "step": 96070 }, { "epoch": 2.240249482673195, "grad_norm": 1.6394646167755127, "learning_rate": 2.5356743144935644e-07, "loss": 0.1037, "step": 96080 }, { "epoch": 2.240482644050013, "grad_norm": 3.6969709396362305, "learning_rate": 2.534897096312877e-07, "loss": 0.1012, "step": 96090 }, { "epoch": 2.2407158054268312, "grad_norm": 3.8651199340820312, "learning_rate": 2.534119878132189e-07, "loss": 0.1048, "step": 96100 }, { "epoch": 2.240948966803649, "grad_norm": 1.8115154504776, "learning_rate": 2.533342659951501e-07, "loss": 0.11, "step": 96110 }, { "epoch": 2.241182128180467, "grad_norm": 1.1756055355072021, "learning_rate": 2.532565441770814e-07, "loss": 0.1022, "step": 96120 }, { "epoch": 2.2414152895572848, "grad_norm": 1.7519950866699219, "learning_rate": 2.531788223590126e-07, "loss": 0.0911, "step": 96130 }, { "epoch": 2.2416484509341026, "grad_norm": 2.562936782836914, "learning_rate": 2.5310110054094386e-07, "loss": 0.109, "step": 96140 }, { "epoch": 2.241881612310921, "grad_norm": 1.312783122062683, "learning_rate": 2.5302337872287507e-07, "loss": 0.1073, "step": 96150 }, { "epoch": 2.2421147736877387, "grad_norm": 2.021583318710327, "learning_rate": 2.529456569048063e-07, "loss": 0.1028, "step": 96160 }, { "epoch": 2.2423479350645565, "grad_norm": 1.8719509840011597, "learning_rate": 2.5286793508673754e-07, "loss": 0.0958, "step": 96170 }, { "epoch": 2.2425810964413744, "grad_norm": 1.3855355978012085, "learning_rate": 2.527902132686688e-07, "loss": 0.0979, "step": 96180 }, { "epoch": 
2.242814257818192, "grad_norm": 2.003103256225586, "learning_rate": 2.5271249145059996e-07, "loss": 0.1066, "step": 96190 }, { "epoch": 2.2430474191950105, "grad_norm": 1.0499275922775269, "learning_rate": 2.526347696325312e-07, "loss": 0.103, "step": 96200 }, { "epoch": 2.2432805805718283, "grad_norm": 1.8247098922729492, "learning_rate": 2.525570478144625e-07, "loss": 0.0985, "step": 96210 }, { "epoch": 2.243513741948646, "grad_norm": 1.4066165685653687, "learning_rate": 2.524793259963937e-07, "loss": 0.0998, "step": 96220 }, { "epoch": 2.243746903325464, "grad_norm": 1.193914771080017, "learning_rate": 2.524016041783249e-07, "loss": 0.0917, "step": 96230 }, { "epoch": 2.2439800647022823, "grad_norm": 1.7893476486206055, "learning_rate": 2.5232388236025616e-07, "loss": 0.1015, "step": 96240 }, { "epoch": 2.2442132260791, "grad_norm": 3.1174042224884033, "learning_rate": 2.5224616054218737e-07, "loss": 0.1075, "step": 96250 }, { "epoch": 2.244446387455918, "grad_norm": 1.4003784656524658, "learning_rate": 2.5216843872411863e-07, "loss": 0.0953, "step": 96260 }, { "epoch": 2.244679548832736, "grad_norm": 2.219797372817993, "learning_rate": 2.5209071690604984e-07, "loss": 0.0901, "step": 96270 }, { "epoch": 2.2449127102095536, "grad_norm": 1.0908406972885132, "learning_rate": 2.5201299508798105e-07, "loss": 0.103, "step": 96280 }, { "epoch": 2.245145871586372, "grad_norm": 2.6471781730651855, "learning_rate": 2.519352732699123e-07, "loss": 0.0973, "step": 96290 }, { "epoch": 2.2453790329631897, "grad_norm": 1.3392724990844727, "learning_rate": 2.518575514518436e-07, "loss": 0.0928, "step": 96300 }, { "epoch": 2.2456121943400076, "grad_norm": 3.100128412246704, "learning_rate": 2.517798296337748e-07, "loss": 0.104, "step": 96310 }, { "epoch": 2.2458453557168254, "grad_norm": 1.9018995761871338, "learning_rate": 2.51702107815706e-07, "loss": 0.1028, "step": 96320 }, { "epoch": 2.2460785170936433, "grad_norm": 1.3102184534072876, "learning_rate": 
2.5162438599763726e-07, "loss": 0.104, "step": 96330 }, { "epoch": 2.2463116784704615, "grad_norm": 1.5620077848434448, "learning_rate": 2.5154666417956846e-07, "loss": 0.1061, "step": 96340 }, { "epoch": 2.2465448398472794, "grad_norm": 1.677857756614685, "learning_rate": 2.5146894236149973e-07, "loss": 0.1083, "step": 96350 }, { "epoch": 2.246778001224097, "grad_norm": 3.321758985519409, "learning_rate": 2.5139122054343094e-07, "loss": 0.1031, "step": 96360 }, { "epoch": 2.247011162600915, "grad_norm": 1.653395414352417, "learning_rate": 2.513134987253622e-07, "loss": 0.1103, "step": 96370 }, { "epoch": 2.247244323977733, "grad_norm": 3.3134775161743164, "learning_rate": 2.512357769072934e-07, "loss": 0.1043, "step": 96380 }, { "epoch": 2.247477485354551, "grad_norm": 1.7084288597106934, "learning_rate": 2.5115805508922467e-07, "loss": 0.1147, "step": 96390 }, { "epoch": 2.247710646731369, "grad_norm": 1.6645604372024536, "learning_rate": 2.510803332711559e-07, "loss": 0.0974, "step": 96400 }, { "epoch": 2.247943808108187, "grad_norm": 2.894846200942993, "learning_rate": 2.510026114530871e-07, "loss": 0.1101, "step": 96410 }, { "epoch": 2.2481769694850047, "grad_norm": 2.3043932914733887, "learning_rate": 2.5092488963501835e-07, "loss": 0.1089, "step": 96420 }, { "epoch": 2.2484101308618225, "grad_norm": 1.898446798324585, "learning_rate": 2.508471678169496e-07, "loss": 0.1018, "step": 96430 }, { "epoch": 2.248643292238641, "grad_norm": 3.4324545860290527, "learning_rate": 2.5076944599888077e-07, "loss": 0.107, "step": 96440 }, { "epoch": 2.2488764536154586, "grad_norm": 1.9646488428115845, "learning_rate": 2.5069172418081203e-07, "loss": 0.1057, "step": 96450 }, { "epoch": 2.2491096149922765, "grad_norm": 1.3709697723388672, "learning_rate": 2.506140023627433e-07, "loss": 0.1062, "step": 96460 }, { "epoch": 2.2493427763690943, "grad_norm": 1.765053153038025, "learning_rate": 2.505362805446745e-07, "loss": 0.1046, "step": 96470 }, { "epoch": 2.2495759377459126, 
"grad_norm": 1.232406497001648, "learning_rate": 2.504585587266057e-07, "loss": 0.112, "step": 96480 }, { "epoch": 2.2498090991227304, "grad_norm": 2.5659232139587402, "learning_rate": 2.50380836908537e-07, "loss": 0.1088, "step": 96490 }, { "epoch": 2.2500422604995483, "grad_norm": 1.8645739555358887, "learning_rate": 2.503031150904682e-07, "loss": 0.1044, "step": 96500 }, { "epoch": 2.250275421876366, "grad_norm": 1.5550158023834229, "learning_rate": 2.5022539327239944e-07, "loss": 0.1135, "step": 96510 }, { "epoch": 2.250508583253184, "grad_norm": 1.1062015295028687, "learning_rate": 2.5014767145433065e-07, "loss": 0.1075, "step": 96520 }, { "epoch": 2.250741744630002, "grad_norm": 1.388009786605835, "learning_rate": 2.5006994963626186e-07, "loss": 0.106, "step": 96530 }, { "epoch": 2.25097490600682, "grad_norm": 1.782434344291687, "learning_rate": 2.499922278181931e-07, "loss": 0.1005, "step": 96540 }, { "epoch": 2.251208067383638, "grad_norm": 2.4825446605682373, "learning_rate": 2.4991450600012433e-07, "loss": 0.1027, "step": 96550 }, { "epoch": 2.2514412287604557, "grad_norm": 2.102534294128418, "learning_rate": 2.498367841820556e-07, "loss": 0.102, "step": 96560 }, { "epoch": 2.251674390137274, "grad_norm": 2.3530781269073486, "learning_rate": 2.497590623639868e-07, "loss": 0.0991, "step": 96570 }, { "epoch": 2.251907551514092, "grad_norm": 2.7290732860565186, "learning_rate": 2.4968134054591807e-07, "loss": 0.1176, "step": 96580 }, { "epoch": 2.2521407128909097, "grad_norm": 1.3035507202148438, "learning_rate": 2.496036187278493e-07, "loss": 0.1064, "step": 96590 }, { "epoch": 2.2523738742677275, "grad_norm": 1.5326324701309204, "learning_rate": 2.4952589690978054e-07, "loss": 0.113, "step": 96600 }, { "epoch": 2.2526070356445453, "grad_norm": 2.485084295272827, "learning_rate": 2.4944817509171175e-07, "loss": 0.1082, "step": 96610 }, { "epoch": 2.2528401970213636, "grad_norm": 2.37724232673645, "learning_rate": 2.4937045327364296e-07, "loss": 0.1052, 
"step": 96620 }, { "epoch": 2.2530733583981815, "grad_norm": 1.3111183643341064, "learning_rate": 2.492927314555742e-07, "loss": 0.1004, "step": 96630 }, { "epoch": 2.2533065197749993, "grad_norm": 1.7827765941619873, "learning_rate": 2.4921500963750543e-07, "loss": 0.1061, "step": 96640 }, { "epoch": 2.253539681151817, "grad_norm": 1.493270993232727, "learning_rate": 2.4913728781943664e-07, "loss": 0.1075, "step": 96650 }, { "epoch": 2.253772842528635, "grad_norm": 1.5052767992019653, "learning_rate": 2.490595660013679e-07, "loss": 0.099, "step": 96660 }, { "epoch": 2.2540060039054532, "grad_norm": 1.2886537313461304, "learning_rate": 2.489818441832991e-07, "loss": 0.0962, "step": 96670 }, { "epoch": 2.254239165282271, "grad_norm": 1.5596781969070435, "learning_rate": 2.4890412236523037e-07, "loss": 0.1095, "step": 96680 }, { "epoch": 2.254472326659089, "grad_norm": 2.6406142711639404, "learning_rate": 2.488264005471616e-07, "loss": 0.1136, "step": 96690 }, { "epoch": 2.2547054880359068, "grad_norm": 1.5863561630249023, "learning_rate": 2.4874867872909284e-07, "loss": 0.1158, "step": 96700 }, { "epoch": 2.2549386494127246, "grad_norm": 1.5115904808044434, "learning_rate": 2.4867095691102405e-07, "loss": 0.0988, "step": 96710 }, { "epoch": 2.255171810789543, "grad_norm": 1.849968671798706, "learning_rate": 2.485932350929553e-07, "loss": 0.1072, "step": 96720 }, { "epoch": 2.2554049721663607, "grad_norm": 1.9434754848480225, "learning_rate": 2.485155132748865e-07, "loss": 0.1029, "step": 96730 }, { "epoch": 2.2556381335431785, "grad_norm": 1.3330327272415161, "learning_rate": 2.4843779145681773e-07, "loss": 0.1046, "step": 96740 }, { "epoch": 2.2558712949199964, "grad_norm": 1.3258861303329468, "learning_rate": 2.48360069638749e-07, "loss": 0.1077, "step": 96750 }, { "epoch": 2.256104456296814, "grad_norm": 3.2276196479797363, "learning_rate": 2.482823478206802e-07, "loss": 0.1041, "step": 96760 }, { "epoch": 2.2563376176736325, "grad_norm": 1.956540584564209, 
"learning_rate": 2.482046260026114e-07, "loss": 0.1109, "step": 96770 }, { "epoch": 2.2565707790504503, "grad_norm": 1.4007104635238647, "learning_rate": 2.481269041845427e-07, "loss": 0.0943, "step": 96780 }, { "epoch": 2.256803940427268, "grad_norm": 2.7113497257232666, "learning_rate": 2.480491823664739e-07, "loss": 0.0942, "step": 96790 }, { "epoch": 2.257037101804086, "grad_norm": 1.978489875793457, "learning_rate": 2.4797146054840515e-07, "loss": 0.1022, "step": 96800 }, { "epoch": 2.257270263180904, "grad_norm": 1.9016790390014648, "learning_rate": 2.4789373873033636e-07, "loss": 0.1147, "step": 96810 }, { "epoch": 2.257503424557722, "grad_norm": 2.5237033367156982, "learning_rate": 2.478160169122676e-07, "loss": 0.1016, "step": 96820 }, { "epoch": 2.25773658593454, "grad_norm": 1.883334755897522, "learning_rate": 2.4773829509419883e-07, "loss": 0.0979, "step": 96830 }, { "epoch": 2.257969747311358, "grad_norm": 3.5885443687438965, "learning_rate": 2.476605732761301e-07, "loss": 0.1122, "step": 96840 }, { "epoch": 2.2582029086881756, "grad_norm": 1.230857014656067, "learning_rate": 2.475828514580613e-07, "loss": 0.1176, "step": 96850 }, { "epoch": 2.258436070064994, "grad_norm": 1.1976182460784912, "learning_rate": 2.4750512963999256e-07, "loss": 0.1021, "step": 96860 }, { "epoch": 2.2586692314418118, "grad_norm": 1.1909029483795166, "learning_rate": 2.4742740782192377e-07, "loss": 0.1057, "step": 96870 }, { "epoch": 2.2589023928186296, "grad_norm": 1.4784334897994995, "learning_rate": 2.47349686003855e-07, "loss": 0.1186, "step": 96880 }, { "epoch": 2.2591355541954474, "grad_norm": 1.6970564126968384, "learning_rate": 2.4727196418578624e-07, "loss": 0.0969, "step": 96890 }, { "epoch": 2.2593687155722653, "grad_norm": 1.3026766777038574, "learning_rate": 2.4719424236771745e-07, "loss": 0.1025, "step": 96900 }, { "epoch": 2.2596018769490835, "grad_norm": 2.095471143722534, "learning_rate": 2.4711652054964866e-07, "loss": 0.1047, "step": 96910 }, { "epoch": 
2.2598350383259014, "grad_norm": 1.1727566719055176, "learning_rate": 2.470387987315799e-07, "loss": 0.0964, "step": 96920 }, { "epoch": 2.260068199702719, "grad_norm": 2.029005289077759, "learning_rate": 2.4696107691351113e-07, "loss": 0.1026, "step": 96930 }, { "epoch": 2.260301361079537, "grad_norm": 2.16640043258667, "learning_rate": 2.468833550954424e-07, "loss": 0.1066, "step": 96940 }, { "epoch": 2.2605345224563553, "grad_norm": 1.5406960248947144, "learning_rate": 2.468056332773736e-07, "loss": 0.1058, "step": 96950 }, { "epoch": 2.260767683833173, "grad_norm": 1.4766136407852173, "learning_rate": 2.4672791145930487e-07, "loss": 0.1026, "step": 96960 }, { "epoch": 2.261000845209991, "grad_norm": 4.441348075866699, "learning_rate": 2.466501896412361e-07, "loss": 0.1064, "step": 96970 }, { "epoch": 2.261234006586809, "grad_norm": 1.7540647983551025, "learning_rate": 2.4657246782316734e-07, "loss": 0.1045, "step": 96980 }, { "epoch": 2.2614671679636267, "grad_norm": 1.9237881898880005, "learning_rate": 2.4649474600509855e-07, "loss": 0.1108, "step": 96990 }, { "epoch": 2.261700329340445, "grad_norm": 1.5129834413528442, "learning_rate": 2.464170241870298e-07, "loss": 0.1084, "step": 97000 }, { "epoch": 2.261933490717263, "grad_norm": 1.3680065870285034, "learning_rate": 2.46339302368961e-07, "loss": 0.0966, "step": 97010 }, { "epoch": 2.2621666520940806, "grad_norm": 1.5924410820007324, "learning_rate": 2.462615805508922e-07, "loss": 0.1046, "step": 97020 }, { "epoch": 2.2623998134708985, "grad_norm": 1.3329355716705322, "learning_rate": 2.461838587328235e-07, "loss": 0.1038, "step": 97030 }, { "epoch": 2.2626329748477163, "grad_norm": 1.3671351671218872, "learning_rate": 2.461061369147547e-07, "loss": 0.1084, "step": 97040 }, { "epoch": 2.2628661362245346, "grad_norm": 1.4436715841293335, "learning_rate": 2.460284150966859e-07, "loss": 0.1012, "step": 97050 }, { "epoch": 2.2630992976013524, "grad_norm": 1.209580421447754, "learning_rate": 
2.4595069327861717e-07, "loss": 0.1089, "step": 97060 }, { "epoch": 2.2633324589781703, "grad_norm": 1.9244332313537598, "learning_rate": 2.458729714605484e-07, "loss": 0.0993, "step": 97070 }, { "epoch": 2.263565620354988, "grad_norm": 2.633807420730591, "learning_rate": 2.4579524964247964e-07, "loss": 0.1049, "step": 97080 }, { "epoch": 2.263798781731806, "grad_norm": 1.397249460220337, "learning_rate": 2.4571752782441085e-07, "loss": 0.1054, "step": 97090 }, { "epoch": 2.264031943108624, "grad_norm": 1.7384997606277466, "learning_rate": 2.456398060063421e-07, "loss": 0.1005, "step": 97100 }, { "epoch": 2.264265104485442, "grad_norm": 1.3952888250350952, "learning_rate": 2.455620841882733e-07, "loss": 0.1114, "step": 97110 }, { "epoch": 2.26449826586226, "grad_norm": 1.1643195152282715, "learning_rate": 2.454843623702046e-07, "loss": 0.1135, "step": 97120 }, { "epoch": 2.2647314272390777, "grad_norm": 1.2365316152572632, "learning_rate": 2.454066405521358e-07, "loss": 0.1129, "step": 97130 }, { "epoch": 2.2649645886158956, "grad_norm": 1.1236034631729126, "learning_rate": 2.45328918734067e-07, "loss": 0.0883, "step": 97140 }, { "epoch": 2.265197749992714, "grad_norm": 1.7725389003753662, "learning_rate": 2.4525119691599826e-07, "loss": 0.121, "step": 97150 }, { "epoch": 2.2654309113695317, "grad_norm": 2.3363993167877197, "learning_rate": 2.4517347509792947e-07, "loss": 0.1086, "step": 97160 }, { "epoch": 2.2656640727463495, "grad_norm": 1.2404720783233643, "learning_rate": 2.450957532798607e-07, "loss": 0.1016, "step": 97170 }, { "epoch": 2.2658972341231673, "grad_norm": 2.5023200511932373, "learning_rate": 2.4501803146179194e-07, "loss": 0.1041, "step": 97180 }, { "epoch": 2.266130395499985, "grad_norm": 1.1708651781082153, "learning_rate": 2.4494030964372315e-07, "loss": 0.0993, "step": 97190 }, { "epoch": 2.2663635568768035, "grad_norm": 1.293540120124817, "learning_rate": 2.448625878256544e-07, "loss": 0.1009, "step": 97200 }, { "epoch": 2.2665967182536213, 
"grad_norm": 1.5634682178497314, "learning_rate": 2.447848660075856e-07, "loss": 0.1066, "step": 97210 }, { "epoch": 2.266829879630439, "grad_norm": 2.65954852104187, "learning_rate": 2.447071441895169e-07, "loss": 0.093, "step": 97220 }, { "epoch": 2.267063041007257, "grad_norm": 1.662285327911377, "learning_rate": 2.446294223714481e-07, "loss": 0.0988, "step": 97230 }, { "epoch": 2.2672962023840753, "grad_norm": 1.348883032798767, "learning_rate": 2.4455170055337936e-07, "loss": 0.1018, "step": 97240 }, { "epoch": 2.267529363760893, "grad_norm": 1.0458475351333618, "learning_rate": 2.4447397873531057e-07, "loss": 0.1051, "step": 97250 }, { "epoch": 2.267762525137711, "grad_norm": 1.231728196144104, "learning_rate": 2.4439625691724183e-07, "loss": 0.1091, "step": 97260 }, { "epoch": 2.2679956865145288, "grad_norm": 1.2182490825653076, "learning_rate": 2.4431853509917304e-07, "loss": 0.0998, "step": 97270 }, { "epoch": 2.268228847891347, "grad_norm": 1.495337724685669, "learning_rate": 2.4424081328110425e-07, "loss": 0.1054, "step": 97280 }, { "epoch": 2.268462009268165, "grad_norm": 2.1474106311798096, "learning_rate": 2.441630914630355e-07, "loss": 0.1143, "step": 97290 }, { "epoch": 2.2686951706449827, "grad_norm": 4.122100353240967, "learning_rate": 2.440853696449667e-07, "loss": 0.1161, "step": 97300 }, { "epoch": 2.2689283320218006, "grad_norm": 1.2906216382980347, "learning_rate": 2.4400764782689793e-07, "loss": 0.1134, "step": 97310 }, { "epoch": 2.2691614933986184, "grad_norm": 0.9415032267570496, "learning_rate": 2.439299260088292e-07, "loss": 0.1009, "step": 97320 }, { "epoch": 2.2693946547754367, "grad_norm": 1.770212173461914, "learning_rate": 2.438522041907604e-07, "loss": 0.0958, "step": 97330 }, { "epoch": 2.2696278161522545, "grad_norm": 1.2735912799835205, "learning_rate": 2.4377448237269166e-07, "loss": 0.1107, "step": 97340 }, { "epoch": 2.2698609775290723, "grad_norm": 1.611991047859192, "learning_rate": 2.4369676055462287e-07, "loss": 0.095, 
"step": 97350 }, { "epoch": 2.27009413890589, "grad_norm": 1.9800423383712769, "learning_rate": 2.4361903873655413e-07, "loss": 0.1083, "step": 97360 }, { "epoch": 2.270327300282708, "grad_norm": 1.347794532775879, "learning_rate": 2.4354131691848534e-07, "loss": 0.1023, "step": 97370 }, { "epoch": 2.2705604616595263, "grad_norm": 1.6731449365615845, "learning_rate": 2.434635951004166e-07, "loss": 0.1063, "step": 97380 }, { "epoch": 2.270793623036344, "grad_norm": 4.082549571990967, "learning_rate": 2.433858732823478e-07, "loss": 0.1017, "step": 97390 }, { "epoch": 2.271026784413162, "grad_norm": 1.3660862445831299, "learning_rate": 2.43308151464279e-07, "loss": 0.1023, "step": 97400 }, { "epoch": 2.27125994578998, "grad_norm": 1.1504030227661133, "learning_rate": 2.432304296462103e-07, "loss": 0.0958, "step": 97410 }, { "epoch": 2.2714931071667976, "grad_norm": 1.4442212581634521, "learning_rate": 2.431527078281415e-07, "loss": 0.1008, "step": 97420 }, { "epoch": 2.271726268543616, "grad_norm": 1.4599286317825317, "learning_rate": 2.430749860100727e-07, "loss": 0.1085, "step": 97430 }, { "epoch": 2.2719594299204338, "grad_norm": 1.7888532876968384, "learning_rate": 2.4299726419200397e-07, "loss": 0.1026, "step": 97440 }, { "epoch": 2.2721925912972516, "grad_norm": 1.922951340675354, "learning_rate": 2.429195423739352e-07, "loss": 0.0927, "step": 97450 }, { "epoch": 2.2724257526740694, "grad_norm": 1.7260339260101318, "learning_rate": 2.4284182055586644e-07, "loss": 0.1005, "step": 97460 }, { "epoch": 2.2726589140508873, "grad_norm": 1.9958808422088623, "learning_rate": 2.4276409873779765e-07, "loss": 0.1025, "step": 97470 }, { "epoch": 2.2728920754277055, "grad_norm": 1.190266728401184, "learning_rate": 2.426863769197289e-07, "loss": 0.0952, "step": 97480 }, { "epoch": 2.2731252368045234, "grad_norm": 1.2632380723953247, "learning_rate": 2.426086551016601e-07, "loss": 0.1005, "step": 97490 }, { "epoch": 2.2733583981813412, "grad_norm": 0.9618971943855286, 
"learning_rate": 2.425309332835914e-07, "loss": 0.1045, "step": 97500 }, { "epoch": 2.273591559558159, "grad_norm": 1.656579613685608, "learning_rate": 2.424532114655226e-07, "loss": 0.0952, "step": 97510 }, { "epoch": 2.273824720934977, "grad_norm": 2.612548589706421, "learning_rate": 2.4237548964745385e-07, "loss": 0.1024, "step": 97520 }, { "epoch": 2.274057882311795, "grad_norm": 2.437953233718872, "learning_rate": 2.4229776782938506e-07, "loss": 0.1074, "step": 97530 }, { "epoch": 2.274291043688613, "grad_norm": 0.9594926238059998, "learning_rate": 2.4222004601131627e-07, "loss": 0.0978, "step": 97540 }, { "epoch": 2.274524205065431, "grad_norm": 1.6083118915557861, "learning_rate": 2.4214232419324753e-07, "loss": 0.1108, "step": 97550 }, { "epoch": 2.2747573664422487, "grad_norm": 2.081599712371826, "learning_rate": 2.4206460237517874e-07, "loss": 0.1093, "step": 97560 }, { "epoch": 2.274990527819067, "grad_norm": 1.0291005373001099, "learning_rate": 2.4198688055710995e-07, "loss": 0.1058, "step": 97570 }, { "epoch": 2.275223689195885, "grad_norm": 2.1757631301879883, "learning_rate": 2.419091587390412e-07, "loss": 0.1033, "step": 97580 }, { "epoch": 2.2754568505727026, "grad_norm": 1.5513771772384644, "learning_rate": 2.418314369209724e-07, "loss": 0.1038, "step": 97590 }, { "epoch": 2.2756900119495205, "grad_norm": 1.220737099647522, "learning_rate": 2.417537151029037e-07, "loss": 0.0955, "step": 97600 }, { "epoch": 2.2759231733263383, "grad_norm": 1.2376735210418701, "learning_rate": 2.416759932848349e-07, "loss": 0.1079, "step": 97610 }, { "epoch": 2.2761563347031566, "grad_norm": 2.157695770263672, "learning_rate": 2.4159827146676616e-07, "loss": 0.1077, "step": 97620 }, { "epoch": 2.2763894960799744, "grad_norm": 1.7806098461151123, "learning_rate": 2.4152054964869736e-07, "loss": 0.101, "step": 97630 }, { "epoch": 2.2766226574567923, "grad_norm": 1.9028408527374268, "learning_rate": 2.4144282783062863e-07, "loss": 0.108, "step": 97640 }, { "epoch": 
2.27685581883361, "grad_norm": 1.6484625339508057, "learning_rate": 2.4136510601255984e-07, "loss": 0.1044, "step": 97650 }, { "epoch": 2.2770889802104284, "grad_norm": 1.4929510354995728, "learning_rate": 2.412873841944911e-07, "loss": 0.1105, "step": 97660 }, { "epoch": 2.277322141587246, "grad_norm": 1.2902507781982422, "learning_rate": 2.412096623764223e-07, "loss": 0.1042, "step": 97670 }, { "epoch": 2.277555302964064, "grad_norm": 2.2217516899108887, "learning_rate": 2.411319405583535e-07, "loss": 0.1014, "step": 97680 }, { "epoch": 2.277788464340882, "grad_norm": 1.6829044818878174, "learning_rate": 2.410542187402848e-07, "loss": 0.1012, "step": 97690 }, { "epoch": 2.2780216257176997, "grad_norm": 1.5888763666152954, "learning_rate": 2.40976496922216e-07, "loss": 0.1118, "step": 97700 }, { "epoch": 2.278254787094518, "grad_norm": 1.3529551029205322, "learning_rate": 2.408987751041472e-07, "loss": 0.1054, "step": 97710 }, { "epoch": 2.278487948471336, "grad_norm": 2.499516725540161, "learning_rate": 2.4082105328607846e-07, "loss": 0.1038, "step": 97720 }, { "epoch": 2.2787211098481537, "grad_norm": 4.030032634735107, "learning_rate": 2.4074333146800967e-07, "loss": 0.1194, "step": 97730 }, { "epoch": 2.2789542712249715, "grad_norm": 1.3028887510299683, "learning_rate": 2.4066560964994093e-07, "loss": 0.1097, "step": 97740 }, { "epoch": 2.2791874326017894, "grad_norm": 1.729164719581604, "learning_rate": 2.4058788783187214e-07, "loss": 0.1028, "step": 97750 }, { "epoch": 2.2794205939786076, "grad_norm": 1.1801775693893433, "learning_rate": 2.405101660138034e-07, "loss": 0.1056, "step": 97760 }, { "epoch": 2.2796537553554255, "grad_norm": 1.3825762271881104, "learning_rate": 2.404324441957346e-07, "loss": 0.1083, "step": 97770 }, { "epoch": 2.2798869167322433, "grad_norm": 1.5799593925476074, "learning_rate": 2.4035472237766587e-07, "loss": 0.1066, "step": 97780 }, { "epoch": 2.280120078109061, "grad_norm": 1.8684570789337158, "learning_rate": 
2.402770005595971e-07, "loss": 0.0924, "step": 97790 }, { "epoch": 2.280353239485879, "grad_norm": 1.1712920665740967, "learning_rate": 2.401992787415283e-07, "loss": 0.106, "step": 97800 }, { "epoch": 2.2805864008626973, "grad_norm": 2.4756276607513428, "learning_rate": 2.4012155692345955e-07, "loss": 0.1076, "step": 97810 }, { "epoch": 2.280819562239515, "grad_norm": 1.3942513465881348, "learning_rate": 2.4004383510539076e-07, "loss": 0.1059, "step": 97820 }, { "epoch": 2.281052723616333, "grad_norm": 2.1262521743774414, "learning_rate": 2.3996611328732197e-07, "loss": 0.1003, "step": 97830 }, { "epoch": 2.2812858849931508, "grad_norm": 1.9358429908752441, "learning_rate": 2.3988839146925323e-07, "loss": 0.1049, "step": 97840 }, { "epoch": 2.2815190463699686, "grad_norm": 1.9485512971878052, "learning_rate": 2.3981066965118444e-07, "loss": 0.1059, "step": 97850 }, { "epoch": 2.281752207746787, "grad_norm": 1.7322437763214111, "learning_rate": 2.397329478331157e-07, "loss": 0.1151, "step": 97860 }, { "epoch": 2.2819853691236047, "grad_norm": 2.510009288787842, "learning_rate": 2.396552260150469e-07, "loss": 0.1055, "step": 97870 }, { "epoch": 2.2822185305004226, "grad_norm": 1.5876773595809937, "learning_rate": 2.395775041969782e-07, "loss": 0.1046, "step": 97880 }, { "epoch": 2.2824516918772404, "grad_norm": 1.4028993844985962, "learning_rate": 2.394997823789094e-07, "loss": 0.1028, "step": 97890 }, { "epoch": 2.2826848532540582, "grad_norm": 1.9267767667770386, "learning_rate": 2.3942206056084065e-07, "loss": 0.0924, "step": 97900 }, { "epoch": 2.2829180146308765, "grad_norm": 1.0353285074234009, "learning_rate": 2.3934433874277186e-07, "loss": 0.1058, "step": 97910 }, { "epoch": 2.2831511760076943, "grad_norm": 1.3701320886611938, "learning_rate": 2.392666169247031e-07, "loss": 0.1073, "step": 97920 }, { "epoch": 2.283384337384512, "grad_norm": 1.3367501497268677, "learning_rate": 2.3918889510663433e-07, "loss": 0.1052, "step": 97930 }, { "epoch": 
2.28361749876133, "grad_norm": 1.648293137550354, "learning_rate": 2.3911117328856554e-07, "loss": 0.1015, "step": 97940 }, { "epoch": 2.2838506601381483, "grad_norm": 2.675542116165161, "learning_rate": 2.390334514704968e-07, "loss": 0.1015, "step": 97950 }, { "epoch": 2.284083821514966, "grad_norm": 2.2282354831695557, "learning_rate": 2.38955729652428e-07, "loss": 0.1064, "step": 97960 }, { "epoch": 2.284316982891784, "grad_norm": 1.4406887292861938, "learning_rate": 2.388780078343592e-07, "loss": 0.0986, "step": 97970 }, { "epoch": 2.284550144268602, "grad_norm": 1.3680908679962158, "learning_rate": 2.388002860162905e-07, "loss": 0.1022, "step": 97980 }, { "epoch": 2.2847833056454196, "grad_norm": 1.1601004600524902, "learning_rate": 2.387225641982217e-07, "loss": 0.1005, "step": 97990 }, { "epoch": 2.285016467022238, "grad_norm": 1.794519066810608, "learning_rate": 2.3864484238015295e-07, "loss": 0.0964, "step": 98000 }, { "epoch": 2.2852496283990558, "grad_norm": 2.940333127975464, "learning_rate": 2.3856712056208416e-07, "loss": 0.1018, "step": 98010 }, { "epoch": 2.2854827897758736, "grad_norm": 1.351480484008789, "learning_rate": 2.384893987440154e-07, "loss": 0.1049, "step": 98020 }, { "epoch": 2.2857159511526914, "grad_norm": 2.2966814041137695, "learning_rate": 2.3841167692594663e-07, "loss": 0.104, "step": 98030 }, { "epoch": 2.2859491125295097, "grad_norm": 2.755539655685425, "learning_rate": 2.3833395510787787e-07, "loss": 0.1126, "step": 98040 }, { "epoch": 2.2861822739063276, "grad_norm": 2.2700746059417725, "learning_rate": 2.382562332898091e-07, "loss": 0.1163, "step": 98050 }, { "epoch": 2.2864154352831454, "grad_norm": 3.2853598594665527, "learning_rate": 2.3817851147174034e-07, "loss": 0.0987, "step": 98060 }, { "epoch": 2.2866485966599632, "grad_norm": 2.504133462905884, "learning_rate": 2.3810078965367155e-07, "loss": 0.0934, "step": 98070 }, { "epoch": 2.286881758036781, "grad_norm": 1.4805243015289307, "learning_rate": 
2.380230678356028e-07, "loss": 0.1012, "step": 98080 }, { "epoch": 2.2871149194135993, "grad_norm": 2.552844285964966, "learning_rate": 2.3794534601753402e-07, "loss": 0.1041, "step": 98090 }, { "epoch": 2.287348080790417, "grad_norm": 1.689278244972229, "learning_rate": 2.3786762419946526e-07, "loss": 0.1004, "step": 98100 }, { "epoch": 2.287581242167235, "grad_norm": 2.449782609939575, "learning_rate": 2.377899023813965e-07, "loss": 0.0956, "step": 98110 }, { "epoch": 2.287814403544053, "grad_norm": 1.6284676790237427, "learning_rate": 2.3771218056332773e-07, "loss": 0.1073, "step": 98120 }, { "epoch": 2.2880475649208707, "grad_norm": 2.0353126525878906, "learning_rate": 2.3763445874525894e-07, "loss": 0.1094, "step": 98130 }, { "epoch": 2.288280726297689, "grad_norm": 3.2739953994750977, "learning_rate": 2.375567369271902e-07, "loss": 0.1087, "step": 98140 }, { "epoch": 2.288513887674507, "grad_norm": 2.8240294456481934, "learning_rate": 2.374790151091214e-07, "loss": 0.1175, "step": 98150 }, { "epoch": 2.2887470490513246, "grad_norm": 1.2687238454818726, "learning_rate": 2.3740129329105264e-07, "loss": 0.0976, "step": 98160 }, { "epoch": 2.2889802104281425, "grad_norm": 1.1896238327026367, "learning_rate": 2.3732357147298388e-07, "loss": 0.1049, "step": 98170 }, { "epoch": 2.2892133718049603, "grad_norm": 1.1466916799545288, "learning_rate": 2.3724584965491512e-07, "loss": 0.1021, "step": 98180 }, { "epoch": 2.2894465331817786, "grad_norm": 1.7102351188659668, "learning_rate": 2.3716812783684635e-07, "loss": 0.0955, "step": 98190 }, { "epoch": 2.2896796945585964, "grad_norm": 2.023858070373535, "learning_rate": 2.3709040601877759e-07, "loss": 0.0935, "step": 98200 }, { "epoch": 2.2899128559354143, "grad_norm": 1.295080304145813, "learning_rate": 2.370126842007088e-07, "loss": 0.1007, "step": 98210 }, { "epoch": 2.290146017312232, "grad_norm": 1.2473586797714233, "learning_rate": 2.3693496238264006e-07, "loss": 0.0956, "step": 98220 }, { "epoch": 
2.29037917868905, "grad_norm": 1.018829584121704, "learning_rate": 2.3685724056457127e-07, "loss": 0.0935, "step": 98230 }, { "epoch": 2.2906123400658682, "grad_norm": 1.1963647603988647, "learning_rate": 2.367795187465025e-07, "loss": 0.105, "step": 98240 }, { "epoch": 2.290845501442686, "grad_norm": 1.0169845819473267, "learning_rate": 2.3670179692843374e-07, "loss": 0.1041, "step": 98250 }, { "epoch": 2.291078662819504, "grad_norm": 2.4398550987243652, "learning_rate": 2.3662407511036497e-07, "loss": 0.102, "step": 98260 }, { "epoch": 2.2913118241963217, "grad_norm": 2.8944575786590576, "learning_rate": 2.3654635329229618e-07, "loss": 0.1011, "step": 98270 }, { "epoch": 2.2915449855731396, "grad_norm": 3.5158462524414062, "learning_rate": 2.3646863147422745e-07, "loss": 0.1073, "step": 98280 }, { "epoch": 2.291778146949958, "grad_norm": 1.120561957359314, "learning_rate": 2.3639090965615865e-07, "loss": 0.0954, "step": 98290 }, { "epoch": 2.2920113083267757, "grad_norm": 1.455701231956482, "learning_rate": 2.363131878380899e-07, "loss": 0.0956, "step": 98300 }, { "epoch": 2.2922444697035935, "grad_norm": 1.1436636447906494, "learning_rate": 2.3623546602002113e-07, "loss": 0.1025, "step": 98310 }, { "epoch": 2.2924776310804114, "grad_norm": 1.6878396272659302, "learning_rate": 2.3615774420195236e-07, "loss": 0.099, "step": 98320 }, { "epoch": 2.2927107924572296, "grad_norm": 1.3107755184173584, "learning_rate": 2.3608002238388357e-07, "loss": 0.105, "step": 98330 }, { "epoch": 2.2929439538340475, "grad_norm": 1.3774574995040894, "learning_rate": 2.3600230056581483e-07, "loss": 0.1107, "step": 98340 }, { "epoch": 2.2931771152108653, "grad_norm": 1.5843710899353027, "learning_rate": 2.3592457874774604e-07, "loss": 0.1067, "step": 98350 }, { "epoch": 2.293410276587683, "grad_norm": 1.8646069765090942, "learning_rate": 2.3584685692967728e-07, "loss": 0.1084, "step": 98360 }, { "epoch": 2.293643437964501, "grad_norm": 1.4315184354782104, "learning_rate": 
2.3576913511160851e-07, "loss": 0.1, "step": 98370 }, { "epoch": 2.2938765993413193, "grad_norm": 1.1384644508361816, "learning_rate": 2.3569141329353975e-07, "loss": 0.1027, "step": 98380 }, { "epoch": 2.294109760718137, "grad_norm": 1.4890999794006348, "learning_rate": 2.3561369147547096e-07, "loss": 0.1011, "step": 98390 }, { "epoch": 2.294342922094955, "grad_norm": 1.6532948017120361, "learning_rate": 2.3553596965740222e-07, "loss": 0.1003, "step": 98400 }, { "epoch": 2.2945760834717728, "grad_norm": 1.397813320159912, "learning_rate": 2.3545824783933343e-07, "loss": 0.1043, "step": 98410 }, { "epoch": 2.294809244848591, "grad_norm": 1.4463332891464233, "learning_rate": 2.353805260212647e-07, "loss": 0.1028, "step": 98420 }, { "epoch": 2.295042406225409, "grad_norm": 2.1403183937072754, "learning_rate": 2.353028042031959e-07, "loss": 0.1061, "step": 98430 }, { "epoch": 2.2952755676022267, "grad_norm": 1.5026682615280151, "learning_rate": 2.3522508238512714e-07, "loss": 0.1067, "step": 98440 }, { "epoch": 2.2955087289790446, "grad_norm": 2.356726884841919, "learning_rate": 2.3514736056705837e-07, "loss": 0.1007, "step": 98450 }, { "epoch": 2.2957418903558624, "grad_norm": 2.0795140266418457, "learning_rate": 2.350696387489896e-07, "loss": 0.0953, "step": 98460 }, { "epoch": 2.2959750517326807, "grad_norm": 1.7777342796325684, "learning_rate": 2.3499191693092082e-07, "loss": 0.1021, "step": 98470 }, { "epoch": 2.2962082131094985, "grad_norm": 1.8075675964355469, "learning_rate": 2.3491419511285208e-07, "loss": 0.1132, "step": 98480 }, { "epoch": 2.2964413744863164, "grad_norm": 1.1763039827346802, "learning_rate": 2.348364732947833e-07, "loss": 0.1181, "step": 98490 }, { "epoch": 2.296674535863134, "grad_norm": 2.0127763748168945, "learning_rate": 2.3475875147671452e-07, "loss": 0.1018, "step": 98500 }, { "epoch": 2.296907697239952, "grad_norm": 1.5259206295013428, "learning_rate": 2.3468102965864576e-07, "loss": 0.1048, "step": 98510 }, { "epoch": 
2.2971408586167703, "grad_norm": 1.7384653091430664, "learning_rate": 2.34603307840577e-07, "loss": 0.1027, "step": 98520 }, { "epoch": 2.297374019993588, "grad_norm": 1.8357948064804077, "learning_rate": 2.345255860225082e-07, "loss": 0.1109, "step": 98530 }, { "epoch": 2.297607181370406, "grad_norm": 1.3802955150604248, "learning_rate": 2.3444786420443947e-07, "loss": 0.1001, "step": 98540 }, { "epoch": 2.297840342747224, "grad_norm": 1.4886748790740967, "learning_rate": 2.3437014238637068e-07, "loss": 0.1099, "step": 98550 }, { "epoch": 2.2980735041240417, "grad_norm": 2.9841976165771484, "learning_rate": 2.342924205683019e-07, "loss": 0.1054, "step": 98560 }, { "epoch": 2.29830666550086, "grad_norm": 2.1075234413146973, "learning_rate": 2.3421469875023315e-07, "loss": 0.0911, "step": 98570 }, { "epoch": 2.2985398268776778, "grad_norm": 1.3941189050674438, "learning_rate": 2.3413697693216438e-07, "loss": 0.1025, "step": 98580 }, { "epoch": 2.2987729882544956, "grad_norm": 1.2670079469680786, "learning_rate": 2.340592551140956e-07, "loss": 0.0999, "step": 98590 }, { "epoch": 2.2990061496313134, "grad_norm": 1.4659268856048584, "learning_rate": 2.3398153329602686e-07, "loss": 0.1114, "step": 98600 }, { "epoch": 2.2992393110081313, "grad_norm": 2.680025339126587, "learning_rate": 2.3390381147795806e-07, "loss": 0.1121, "step": 98610 }, { "epoch": 2.2994724723849496, "grad_norm": 1.1577718257904053, "learning_rate": 2.3382608965988933e-07, "loss": 0.1015, "step": 98620 }, { "epoch": 2.2997056337617674, "grad_norm": 1.6761490106582642, "learning_rate": 2.3374836784182054e-07, "loss": 0.1072, "step": 98630 }, { "epoch": 2.2999387951385852, "grad_norm": 1.766401767730713, "learning_rate": 2.3367064602375177e-07, "loss": 0.1025, "step": 98640 }, { "epoch": 2.300171956515403, "grad_norm": 1.5870466232299805, "learning_rate": 2.33592924205683e-07, "loss": 0.1105, "step": 98650 }, { "epoch": 2.300405117892221, "grad_norm": 1.4169641733169556, "learning_rate": 
2.3351520238761424e-07, "loss": 0.0994, "step": 98660 }, { "epoch": 2.300638279269039, "grad_norm": 1.8913416862487793, "learning_rate": 2.3343748056954545e-07, "loss": 0.1055, "step": 98670 }, { "epoch": 2.300871440645857, "grad_norm": 1.0686626434326172, "learning_rate": 2.3335975875147671e-07, "loss": 0.0951, "step": 98680 }, { "epoch": 2.301104602022675, "grad_norm": 1.2734739780426025, "learning_rate": 2.3328203693340792e-07, "loss": 0.1077, "step": 98690 }, { "epoch": 2.3013377633994927, "grad_norm": 1.6431156396865845, "learning_rate": 2.3320431511533916e-07, "loss": 0.1005, "step": 98700 }, { "epoch": 2.301570924776311, "grad_norm": 1.1359859704971313, "learning_rate": 2.3312659329727042e-07, "loss": 0.0956, "step": 98710 }, { "epoch": 2.301804086153129, "grad_norm": 1.2815481424331665, "learning_rate": 2.3304887147920163e-07, "loss": 0.1002, "step": 98720 }, { "epoch": 2.3020372475299467, "grad_norm": 2.105466365814209, "learning_rate": 2.3297114966113287e-07, "loss": 0.0995, "step": 98730 }, { "epoch": 2.3022704089067645, "grad_norm": 1.594342589378357, "learning_rate": 2.328934278430641e-07, "loss": 0.0922, "step": 98740 }, { "epoch": 2.3025035702835828, "grad_norm": 1.4160760641098022, "learning_rate": 2.3281570602499534e-07, "loss": 0.1015, "step": 98750 }, { "epoch": 2.3027367316604006, "grad_norm": 1.3273831605911255, "learning_rate": 2.3273798420692655e-07, "loss": 0.1123, "step": 98760 }, { "epoch": 2.3029698930372184, "grad_norm": 1.4544099569320679, "learning_rate": 2.326602623888578e-07, "loss": 0.1003, "step": 98770 }, { "epoch": 2.3032030544140363, "grad_norm": 1.266352653503418, "learning_rate": 2.3258254057078902e-07, "loss": 0.1125, "step": 98780 }, { "epoch": 2.303436215790854, "grad_norm": 1.7144609689712524, "learning_rate": 2.3250481875272025e-07, "loss": 0.1112, "step": 98790 }, { "epoch": 2.3036693771676724, "grad_norm": 1.2952255010604858, "learning_rate": 2.324270969346515e-07, "loss": 0.1039, "step": 98800 }, { "epoch": 
2.3039025385444902, "grad_norm": 1.3881163597106934, "learning_rate": 2.3234937511658273e-07, "loss": 0.1122, "step": 98810 }, { "epoch": 2.304135699921308, "grad_norm": 2.397648572921753, "learning_rate": 2.3227165329851396e-07, "loss": 0.1079, "step": 98820 }, { "epoch": 2.304368861298126, "grad_norm": 1.3761861324310303, "learning_rate": 2.321939314804452e-07, "loss": 0.1109, "step": 98830 }, { "epoch": 2.3046020226749437, "grad_norm": 1.8797423839569092, "learning_rate": 2.321162096623764e-07, "loss": 0.1032, "step": 98840 }, { "epoch": 2.304835184051762, "grad_norm": 1.6798322200775146, "learning_rate": 2.3203848784430767e-07, "loss": 0.1033, "step": 98850 }, { "epoch": 2.30506834542858, "grad_norm": 1.3228029012680054, "learning_rate": 2.3196076602623888e-07, "loss": 0.0982, "step": 98860 }, { "epoch": 2.3053015068053977, "grad_norm": 1.642336130142212, "learning_rate": 2.318830442081701e-07, "loss": 0.1009, "step": 98870 }, { "epoch": 2.3055346681822155, "grad_norm": 1.3274189233779907, "learning_rate": 2.318130945719082e-07, "loss": 0.1085, "step": 98880 }, { "epoch": 2.3057678295590334, "grad_norm": 1.5936683416366577, "learning_rate": 2.3173537275383945e-07, "loss": 0.1181, "step": 98890 }, { "epoch": 2.3060009909358516, "grad_norm": 1.5024683475494385, "learning_rate": 2.3165765093577066e-07, "loss": 0.1025, "step": 98900 }, { "epoch": 2.3062341523126695, "grad_norm": 2.927868127822876, "learning_rate": 2.3157992911770192e-07, "loss": 0.1106, "step": 98910 }, { "epoch": 2.3064673136894873, "grad_norm": 1.447771430015564, "learning_rate": 2.3150220729963313e-07, "loss": 0.0987, "step": 98920 }, { "epoch": 2.306700475066305, "grad_norm": 2.472862482070923, "learning_rate": 2.3142448548156436e-07, "loss": 0.1053, "step": 98930 }, { "epoch": 2.306933636443123, "grad_norm": 1.6402373313903809, "learning_rate": 2.313467636634956e-07, "loss": 0.1021, "step": 98940 }, { "epoch": 2.3071667978199413, "grad_norm": 1.3090933561325073, "learning_rate": 
2.3126904184542683e-07, "loss": 0.1032, "step": 98950 }, { "epoch": 2.307399959196759, "grad_norm": 1.175552487373352, "learning_rate": 2.3119132002735807e-07, "loss": 0.0978, "step": 98960 }, { "epoch": 2.307633120573577, "grad_norm": 1.7261823415756226, "learning_rate": 2.311135982092893e-07, "loss": 0.1107, "step": 98970 }, { "epoch": 2.307866281950395, "grad_norm": 1.2130424976348877, "learning_rate": 2.3103587639122054e-07, "loss": 0.103, "step": 98980 }, { "epoch": 2.3080994433272126, "grad_norm": 1.254159927368164, "learning_rate": 2.3095815457315178e-07, "loss": 0.1079, "step": 98990 }, { "epoch": 2.308332604704031, "grad_norm": 1.2128150463104248, "learning_rate": 2.30880432755083e-07, "loss": 0.1007, "step": 99000 }, { "epoch": 2.3085657660808487, "grad_norm": 1.7126400470733643, "learning_rate": 2.3080271093701422e-07, "loss": 0.1043, "step": 99010 }, { "epoch": 2.3087989274576666, "grad_norm": 2.4140522480010986, "learning_rate": 2.3072498911894548e-07, "loss": 0.0963, "step": 99020 }, { "epoch": 2.3090320888344844, "grad_norm": 2.0038881301879883, "learning_rate": 2.306472673008767e-07, "loss": 0.1038, "step": 99030 }, { "epoch": 2.3092652502113027, "grad_norm": 1.2143285274505615, "learning_rate": 2.3056954548280793e-07, "loss": 0.1009, "step": 99040 }, { "epoch": 2.3094984115881205, "grad_norm": 1.3714361190795898, "learning_rate": 2.3049182366473917e-07, "loss": 0.0966, "step": 99050 }, { "epoch": 2.3097315729649384, "grad_norm": 1.586506724357605, "learning_rate": 2.304141018466704e-07, "loss": 0.0957, "step": 99060 }, { "epoch": 2.309964734341756, "grad_norm": 1.444042444229126, "learning_rate": 2.303363800286016e-07, "loss": 0.097, "step": 99070 }, { "epoch": 2.310197895718574, "grad_norm": 1.879081130027771, "learning_rate": 2.3025865821053287e-07, "loss": 0.1147, "step": 99080 }, { "epoch": 2.3104310570953923, "grad_norm": 1.081619143486023, "learning_rate": 2.3018093639246408e-07, "loss": 0.0899, "step": 99090 }, { "epoch": 2.31066421847221, 
"grad_norm": 3.082829475402832, "learning_rate": 2.3010321457439532e-07, "loss": 0.1038, "step": 99100 }, { "epoch": 2.310897379849028, "grad_norm": 1.4801865816116333, "learning_rate": 2.3002549275632655e-07, "loss": 0.1064, "step": 99110 }, { "epoch": 2.311130541225846, "grad_norm": 4.397147178649902, "learning_rate": 2.299477709382578e-07, "loss": 0.1087, "step": 99120 }, { "epoch": 2.311363702602664, "grad_norm": 1.6947089433670044, "learning_rate": 2.29870049120189e-07, "loss": 0.1025, "step": 99130 }, { "epoch": 2.311596863979482, "grad_norm": 1.508237600326538, "learning_rate": 2.2979232730212026e-07, "loss": 0.1128, "step": 99140 }, { "epoch": 2.3118300253563, "grad_norm": 1.457231879234314, "learning_rate": 2.2971460548405147e-07, "loss": 0.1089, "step": 99150 }, { "epoch": 2.3120631867331176, "grad_norm": 2.448664426803589, "learning_rate": 2.296368836659827e-07, "loss": 0.1002, "step": 99160 }, { "epoch": 2.3122963481099355, "grad_norm": 1.0288161039352417, "learning_rate": 2.2955916184791394e-07, "loss": 0.1007, "step": 99170 }, { "epoch": 2.3125295094867537, "grad_norm": 1.3363674879074097, "learning_rate": 2.2948144002984518e-07, "loss": 0.0985, "step": 99180 }, { "epoch": 2.3127626708635716, "grad_norm": 1.3591053485870361, "learning_rate": 2.2940371821177639e-07, "loss": 0.0969, "step": 99190 }, { "epoch": 2.3129958322403894, "grad_norm": 1.6362414360046387, "learning_rate": 2.2932599639370765e-07, "loss": 0.101, "step": 99200 }, { "epoch": 2.3132289936172072, "grad_norm": 1.8294544219970703, "learning_rate": 2.2924827457563886e-07, "loss": 0.1002, "step": 99210 }, { "epoch": 2.313462154994025, "grad_norm": 1.5349963903427124, "learning_rate": 2.2917055275757012e-07, "loss": 0.0888, "step": 99220 }, { "epoch": 2.3136953163708434, "grad_norm": 1.4566038846969604, "learning_rate": 2.2909283093950133e-07, "loss": 0.1008, "step": 99230 }, { "epoch": 2.313928477747661, "grad_norm": 3.268888473510742, "learning_rate": 2.2901510912143256e-07, "loss": 
0.1083, "step": 99240 }, { "epoch": 2.314161639124479, "grad_norm": 2.1251983642578125, "learning_rate": 2.289373873033638e-07, "loss": 0.1012, "step": 99250 }, { "epoch": 2.314394800501297, "grad_norm": 0.9675702452659607, "learning_rate": 2.2885966548529504e-07, "loss": 0.1053, "step": 99260 }, { "epoch": 2.3146279618781147, "grad_norm": 2.6606860160827637, "learning_rate": 2.2878194366722624e-07, "loss": 0.1092, "step": 99270 }, { "epoch": 2.314861123254933, "grad_norm": 1.2543878555297852, "learning_rate": 2.287042218491575e-07, "loss": 0.1126, "step": 99280 }, { "epoch": 2.315094284631751, "grad_norm": 1.613393783569336, "learning_rate": 2.2862650003108872e-07, "loss": 0.1009, "step": 99290 }, { "epoch": 2.3153274460085687, "grad_norm": 1.1945948600769043, "learning_rate": 2.2854877821301995e-07, "loss": 0.1046, "step": 99300 }, { "epoch": 2.3155606073853865, "grad_norm": 1.3462157249450684, "learning_rate": 2.284710563949512e-07, "loss": 0.0926, "step": 99310 }, { "epoch": 2.3157937687622043, "grad_norm": 1.2913466691970825, "learning_rate": 2.2839333457688242e-07, "loss": 0.1173, "step": 99320 }, { "epoch": 2.3160269301390226, "grad_norm": 1.5362647771835327, "learning_rate": 2.2831561275881363e-07, "loss": 0.1023, "step": 99330 }, { "epoch": 2.3162600915158404, "grad_norm": 1.2060728073120117, "learning_rate": 2.282378909407449e-07, "loss": 0.1152, "step": 99340 }, { "epoch": 2.3164932528926583, "grad_norm": 1.6254403591156006, "learning_rate": 2.281601691226761e-07, "loss": 0.0952, "step": 99350 }, { "epoch": 2.316726414269476, "grad_norm": 1.053099274635315, "learning_rate": 2.2808244730460734e-07, "loss": 0.0947, "step": 99360 }, { "epoch": 2.316959575646294, "grad_norm": 1.4191428422927856, "learning_rate": 2.2800472548653857e-07, "loss": 0.0978, "step": 99370 }, { "epoch": 2.3171927370231122, "grad_norm": 1.1906964778900146, "learning_rate": 2.279270036684698e-07, "loss": 0.1078, "step": 99380 }, { "epoch": 2.31742589839993, "grad_norm": 
1.6255639791488647, "learning_rate": 2.2784928185040102e-07, "loss": 0.1165, "step": 99390 }, { "epoch": 2.317659059776748, "grad_norm": 2.029681444168091, "learning_rate": 2.2777156003233228e-07, "loss": 0.1141, "step": 99400 }, { "epoch": 2.3178922211535657, "grad_norm": 2.029820203781128, "learning_rate": 2.276938382142635e-07, "loss": 0.1103, "step": 99410 }, { "epoch": 2.318125382530384, "grad_norm": 1.0576013326644897, "learning_rate": 2.2761611639619475e-07, "loss": 0.0974, "step": 99420 }, { "epoch": 2.318358543907202, "grad_norm": 1.1461336612701416, "learning_rate": 2.2753839457812596e-07, "loss": 0.0986, "step": 99430 }, { "epoch": 2.3185917052840197, "grad_norm": 1.2135720252990723, "learning_rate": 2.274606727600572e-07, "loss": 0.1, "step": 99440 }, { "epoch": 2.3188248666608375, "grad_norm": 1.7593305110931396, "learning_rate": 2.2738295094198843e-07, "loss": 0.1104, "step": 99450 }, { "epoch": 2.3190580280376554, "grad_norm": 2.147390365600586, "learning_rate": 2.2730522912391967e-07, "loss": 0.0991, "step": 99460 }, { "epoch": 2.3192911894144737, "grad_norm": 1.296301245689392, "learning_rate": 2.2722750730585088e-07, "loss": 0.1112, "step": 99470 }, { "epoch": 2.3195243507912915, "grad_norm": 1.0214500427246094, "learning_rate": 2.2714978548778214e-07, "loss": 0.1039, "step": 99480 }, { "epoch": 2.3197575121681093, "grad_norm": 2.206465482711792, "learning_rate": 2.2707206366971335e-07, "loss": 0.0989, "step": 99490 }, { "epoch": 2.319990673544927, "grad_norm": 1.5641326904296875, "learning_rate": 2.2699434185164459e-07, "loss": 0.102, "step": 99500 }, { "epoch": 2.3202238349217454, "grad_norm": 0.9812830090522766, "learning_rate": 2.2691662003357582e-07, "loss": 0.0881, "step": 99510 }, { "epoch": 2.3204569962985633, "grad_norm": 1.3744803667068481, "learning_rate": 2.2683889821550706e-07, "loss": 0.0998, "step": 99520 }, { "epoch": 2.320690157675381, "grad_norm": 1.063186526298523, "learning_rate": 2.2676117639743827e-07, "loss": 0.0979, "step": 
99530 }, { "epoch": 2.320923319052199, "grad_norm": 1.4882564544677734, "learning_rate": 2.2668345457936953e-07, "loss": 0.1051, "step": 99540 }, { "epoch": 2.321156480429017, "grad_norm": 1.6510883569717407, "learning_rate": 2.2660573276130074e-07, "loss": 0.1161, "step": 99550 }, { "epoch": 2.321389641805835, "grad_norm": 3.157066583633423, "learning_rate": 2.2652801094323197e-07, "loss": 0.1003, "step": 99560 }, { "epoch": 2.321622803182653, "grad_norm": 1.3252805471420288, "learning_rate": 2.264502891251632e-07, "loss": 0.0973, "step": 99570 }, { "epoch": 2.3218559645594707, "grad_norm": 1.6804176568984985, "learning_rate": 2.2637256730709444e-07, "loss": 0.101, "step": 99580 }, { "epoch": 2.3220891259362886, "grad_norm": 4.536539554595947, "learning_rate": 2.2629484548902565e-07, "loss": 0.0941, "step": 99590 }, { "epoch": 2.3223222873131064, "grad_norm": 1.720949411392212, "learning_rate": 2.2621712367095692e-07, "loss": 0.0905, "step": 99600 }, { "epoch": 2.3225554486899247, "grad_norm": 1.5575863122940063, "learning_rate": 2.2613940185288813e-07, "loss": 0.1097, "step": 99610 }, { "epoch": 2.3227886100667425, "grad_norm": 2.977843761444092, "learning_rate": 2.260616800348194e-07, "loss": 0.1096, "step": 99620 }, { "epoch": 2.3230217714435604, "grad_norm": 1.5144546031951904, "learning_rate": 2.259839582167506e-07, "loss": 0.095, "step": 99630 }, { "epoch": 2.323254932820378, "grad_norm": 1.8930561542510986, "learning_rate": 2.2590623639868183e-07, "loss": 0.0995, "step": 99640 }, { "epoch": 2.323488094197196, "grad_norm": 1.2659931182861328, "learning_rate": 2.2582851458061307e-07, "loss": 0.1095, "step": 99650 }, { "epoch": 2.3237212555740143, "grad_norm": 1.4680057764053345, "learning_rate": 2.257507927625443e-07, "loss": 0.1115, "step": 99660 }, { "epoch": 2.323954416950832, "grad_norm": 1.3992242813110352, "learning_rate": 2.256730709444755e-07, "loss": 0.1066, "step": 99670 }, { "epoch": 2.32418757832765, "grad_norm": 1.396722674369812, 
"learning_rate": 2.2559534912640677e-07, "loss": 0.1135, "step": 99680 }, { "epoch": 2.324420739704468, "grad_norm": 1.4441670179367065, "learning_rate": 2.2551762730833798e-07, "loss": 0.1089, "step": 99690 }, { "epoch": 2.3246539010812857, "grad_norm": 1.4149527549743652, "learning_rate": 2.2543990549026922e-07, "loss": 0.1045, "step": 99700 }, { "epoch": 2.324887062458104, "grad_norm": 1.2001807689666748, "learning_rate": 2.2536218367220046e-07, "loss": 0.1042, "step": 99710 }, { "epoch": 2.325120223834922, "grad_norm": 1.916180968284607, "learning_rate": 2.252844618541317e-07, "loss": 0.108, "step": 99720 }, { "epoch": 2.3253533852117396, "grad_norm": 2.9412176609039307, "learning_rate": 2.252067400360629e-07, "loss": 0.0913, "step": 99730 }, { "epoch": 2.3255865465885575, "grad_norm": 1.3313909769058228, "learning_rate": 2.2512901821799416e-07, "loss": 0.1059, "step": 99740 }, { "epoch": 2.3258197079653753, "grad_norm": 1.2101290225982666, "learning_rate": 2.2505129639992537e-07, "loss": 0.1049, "step": 99750 }, { "epoch": 2.3260528693421936, "grad_norm": 1.6719863414764404, "learning_rate": 2.249735745818566e-07, "loss": 0.0968, "step": 99760 }, { "epoch": 2.3262860307190114, "grad_norm": 2.0637428760528564, "learning_rate": 2.2489585276378784e-07, "loss": 0.1028, "step": 99770 }, { "epoch": 2.3265191920958292, "grad_norm": 2.9324562549591064, "learning_rate": 2.2481813094571908e-07, "loss": 0.1171, "step": 99780 }, { "epoch": 2.326752353472647, "grad_norm": 1.525503158569336, "learning_rate": 2.247404091276503e-07, "loss": 0.1095, "step": 99790 }, { "epoch": 2.3269855148494654, "grad_norm": 1.351027011871338, "learning_rate": 2.2466268730958155e-07, "loss": 0.1052, "step": 99800 }, { "epoch": 2.327218676226283, "grad_norm": 1.5632902383804321, "learning_rate": 2.2458496549151276e-07, "loss": 0.0886, "step": 99810 }, { "epoch": 2.327451837603101, "grad_norm": 1.5891505479812622, "learning_rate": 2.24507243673444e-07, "loss": 0.0944, "step": 99820 }, { 
"epoch": 2.327684998979919, "grad_norm": 1.5040345191955566, "learning_rate": 2.2442952185537523e-07, "loss": 0.0954, "step": 99830 }, { "epoch": 2.327918160356737, "grad_norm": 2.6552908420562744, "learning_rate": 2.2435180003730647e-07, "loss": 0.0954, "step": 99840 }, { "epoch": 2.328151321733555, "grad_norm": 1.9758766889572144, "learning_rate": 2.242740782192377e-07, "loss": 0.0977, "step": 99850 }, { "epoch": 2.328384483110373, "grad_norm": 3.757493495941162, "learning_rate": 2.2419635640116894e-07, "loss": 0.089, "step": 99860 }, { "epoch": 2.3286176444871907, "grad_norm": 2.210815668106079, "learning_rate": 2.2411863458310015e-07, "loss": 0.1007, "step": 99870 }, { "epoch": 2.3288508058640085, "grad_norm": 1.336279273033142, "learning_rate": 2.240409127650314e-07, "loss": 0.104, "step": 99880 }, { "epoch": 2.329083967240827, "grad_norm": 1.4034323692321777, "learning_rate": 2.2396319094696262e-07, "loss": 0.104, "step": 99890 }, { "epoch": 2.3293171286176446, "grad_norm": 1.2868088483810425, "learning_rate": 2.2388546912889385e-07, "loss": 0.0943, "step": 99900 }, { "epoch": 2.3295502899944625, "grad_norm": 1.425734043121338, "learning_rate": 2.238077473108251e-07, "loss": 0.1005, "step": 99910 }, { "epoch": 2.3297834513712803, "grad_norm": 1.1486482620239258, "learning_rate": 2.2373002549275633e-07, "loss": 0.1029, "step": 99920 }, { "epoch": 2.330016612748098, "grad_norm": 1.2550498247146606, "learning_rate": 2.2365230367468753e-07, "loss": 0.1051, "step": 99930 }, { "epoch": 2.3302497741249164, "grad_norm": 1.4213589429855347, "learning_rate": 2.235745818566188e-07, "loss": 0.103, "step": 99940 }, { "epoch": 2.3304829355017342, "grad_norm": 2.898439407348633, "learning_rate": 2.2349686003855e-07, "loss": 0.1153, "step": 99950 }, { "epoch": 2.330716096878552, "grad_norm": 1.7290340662002563, "learning_rate": 2.2341913822048124e-07, "loss": 0.1056, "step": 99960 }, { "epoch": 2.33094925825537, "grad_norm": 1.3690588474273682, "learning_rate": 
2.2334141640241248e-07, "loss": 0.1031, "step": 99970 }, { "epoch": 2.3311824196321878, "grad_norm": 1.6208404302597046, "learning_rate": 2.232636945843437e-07, "loss": 0.1009, "step": 99980 }, { "epoch": 2.331415581009006, "grad_norm": 1.4524487257003784, "learning_rate": 2.2318597276627492e-07, "loss": 0.0959, "step": 99990 }, { "epoch": 2.331648742385824, "grad_norm": 2.098867177963257, "learning_rate": 2.2310825094820618e-07, "loss": 0.0952, "step": 100000 }, { "epoch": 2.331648742385824, "eval_accuracy": 0.9456964738310678, "eval_f1": 0.9611065556927617, "eval_loss": 0.1416751593351364, "eval_runtime": 3916.7793, "eval_samples_per_second": 467.2, "eval_steps_per_second": 58.4, "step": 100000 }, { "epoch": 2.3318819037626417, "grad_norm": 1.2842507362365723, "learning_rate": 2.230305291301374e-07, "loss": 0.1011, "step": 100010 }, { "epoch": 2.3321150651394595, "grad_norm": 1.360791563987732, "learning_rate": 2.2295280731206863e-07, "loss": 0.103, "step": 100020 }, { "epoch": 2.3323482265162774, "grad_norm": 1.3574637174606323, "learning_rate": 2.2287508549399986e-07, "loss": 0.1053, "step": 100030 }, { "epoch": 2.3325813878930957, "grad_norm": 1.947214126586914, "learning_rate": 2.227973636759311e-07, "loss": 0.0905, "step": 100040 }, { "epoch": 2.3328145492699135, "grad_norm": 1.1828715801239014, "learning_rate": 2.227196418578623e-07, "loss": 0.106, "step": 100050 }, { "epoch": 2.3330477106467313, "grad_norm": 1.1745249032974243, "learning_rate": 2.2264192003979357e-07, "loss": 0.0971, "step": 100060 }, { "epoch": 2.333280872023549, "grad_norm": 1.1447064876556396, "learning_rate": 2.2256419822172478e-07, "loss": 0.1004, "step": 100070 }, { "epoch": 2.333514033400367, "grad_norm": 1.9486654996871948, "learning_rate": 2.2248647640365604e-07, "loss": 0.1091, "step": 100080 }, { "epoch": 2.3337471947771853, "grad_norm": 1.3972784280776978, "learning_rate": 2.2240875458558725e-07, "loss": 0.0938, "step": 100090 }, { "epoch": 2.333980356154003, "grad_norm": 
3.859475612640381, "learning_rate": 2.223310327675185e-07, "loss": 0.1072, "step": 100100 }, { "epoch": 2.334213517530821, "grad_norm": 2.1291298866271973, "learning_rate": 2.2225331094944972e-07, "loss": 0.1022, "step": 100110 }, { "epoch": 2.334446678907639, "grad_norm": 1.9346740245819092, "learning_rate": 2.2217558913138096e-07, "loss": 0.1018, "step": 100120 }, { "epoch": 2.3346798402844566, "grad_norm": 1.224764108657837, "learning_rate": 2.2209786731331217e-07, "loss": 0.0977, "step": 100130 }, { "epoch": 2.334913001661275, "grad_norm": 1.340096354484558, "learning_rate": 2.2202014549524343e-07, "loss": 0.0976, "step": 100140 }, { "epoch": 2.3351461630380927, "grad_norm": 3.873626232147217, "learning_rate": 2.2194242367717464e-07, "loss": 0.1058, "step": 100150 }, { "epoch": 2.3353793244149106, "grad_norm": 1.4386838674545288, "learning_rate": 2.2186470185910588e-07, "loss": 0.1056, "step": 100160 }, { "epoch": 2.3356124857917284, "grad_norm": 1.8519673347473145, "learning_rate": 2.217869800410371e-07, "loss": 0.1097, "step": 100170 }, { "epoch": 2.3358456471685467, "grad_norm": 1.2391899824142456, "learning_rate": 2.2170925822296835e-07, "loss": 0.1065, "step": 100180 }, { "epoch": 2.3360788085453645, "grad_norm": 1.8184961080551147, "learning_rate": 2.2163153640489956e-07, "loss": 0.1054, "step": 100190 }, { "epoch": 2.3363119699221824, "grad_norm": 1.213404893875122, "learning_rate": 2.2155381458683082e-07, "loss": 0.1063, "step": 100200 }, { "epoch": 2.336545131299, "grad_norm": 2.0274388790130615, "learning_rate": 2.2147609276876203e-07, "loss": 0.1022, "step": 100210 }, { "epoch": 2.3367782926758185, "grad_norm": 2.510084629058838, "learning_rate": 2.2139837095069326e-07, "loss": 0.1026, "step": 100220 }, { "epoch": 2.3370114540526363, "grad_norm": 1.7746092081069946, "learning_rate": 2.213206491326245e-07, "loss": 0.1126, "step": 100230 }, { "epoch": 2.337244615429454, "grad_norm": 1.754897952079773, "learning_rate": 2.2124292731455573e-07, "loss": 
0.1055, "step": 100240 }, { "epoch": 2.337477776806272, "grad_norm": 1.6298021078109741, "learning_rate": 2.2116520549648694e-07, "loss": 0.1043, "step": 100250 }, { "epoch": 2.33771093818309, "grad_norm": 1.5777552127838135, "learning_rate": 2.210874836784182e-07, "loss": 0.1138, "step": 100260 }, { "epoch": 2.337944099559908, "grad_norm": 1.794049620628357, "learning_rate": 2.2100976186034942e-07, "loss": 0.1095, "step": 100270 }, { "epoch": 2.338177260936726, "grad_norm": 2.1549227237701416, "learning_rate": 2.2093204004228068e-07, "loss": 0.0961, "step": 100280 }, { "epoch": 2.338410422313544, "grad_norm": 2.211872100830078, "learning_rate": 2.2085431822421189e-07, "loss": 0.1085, "step": 100290 }, { "epoch": 2.3386435836903616, "grad_norm": 1.0689220428466797, "learning_rate": 2.2077659640614312e-07, "loss": 0.107, "step": 100300 }, { "epoch": 2.3388767450671795, "grad_norm": 1.9778709411621094, "learning_rate": 2.2069887458807436e-07, "loss": 0.1181, "step": 100310 }, { "epoch": 2.3391099064439977, "grad_norm": 2.6155152320861816, "learning_rate": 2.206211527700056e-07, "loss": 0.0899, "step": 100320 }, { "epoch": 2.3393430678208156, "grad_norm": 1.4692986011505127, "learning_rate": 2.205434309519368e-07, "loss": 0.0959, "step": 100330 }, { "epoch": 2.3395762291976334, "grad_norm": 1.2657828330993652, "learning_rate": 2.2046570913386807e-07, "loss": 0.0999, "step": 100340 }, { "epoch": 2.3398093905744513, "grad_norm": 2.34140682220459, "learning_rate": 2.2038798731579927e-07, "loss": 0.1045, "step": 100350 }, { "epoch": 2.340042551951269, "grad_norm": 1.3934526443481445, "learning_rate": 2.203102654977305e-07, "loss": 0.1002, "step": 100360 }, { "epoch": 2.3402757133280874, "grad_norm": 1.3047034740447998, "learning_rate": 2.2023254367966175e-07, "loss": 0.0919, "step": 100370 }, { "epoch": 2.340508874704905, "grad_norm": 1.969969630241394, "learning_rate": 2.2015482186159298e-07, "loss": 0.106, "step": 100380 }, { "epoch": 2.340742036081723, "grad_norm": 
1.728574275970459, "learning_rate": 2.200771000435242e-07, "loss": 0.1085, "step": 100390 }, { "epoch": 2.340975197458541, "grad_norm": 1.4836186170578003, "learning_rate": 2.1999937822545545e-07, "loss": 0.0958, "step": 100400 }, { "epoch": 2.3412083588353587, "grad_norm": 1.8007185459136963, "learning_rate": 2.1992165640738666e-07, "loss": 0.1096, "step": 100410 }, { "epoch": 2.341441520212177, "grad_norm": 1.3503665924072266, "learning_rate": 2.198439345893179e-07, "loss": 0.0978, "step": 100420 }, { "epoch": 2.341674681588995, "grad_norm": 1.692410945892334, "learning_rate": 2.1976621277124913e-07, "loss": 0.109, "step": 100430 }, { "epoch": 2.3419078429658127, "grad_norm": 2.2487034797668457, "learning_rate": 2.1968849095318037e-07, "loss": 0.0979, "step": 100440 }, { "epoch": 2.3421410043426305, "grad_norm": 1.300201654434204, "learning_rate": 2.1961076913511158e-07, "loss": 0.0987, "step": 100450 }, { "epoch": 2.3423741657194483, "grad_norm": 1.422938346862793, "learning_rate": 2.1953304731704284e-07, "loss": 0.0979, "step": 100460 }, { "epoch": 2.3426073270962666, "grad_norm": 1.9357326030731201, "learning_rate": 2.1945532549897405e-07, "loss": 0.1092, "step": 100470 }, { "epoch": 2.3428404884730845, "grad_norm": 1.2101943492889404, "learning_rate": 2.193776036809053e-07, "loss": 0.1117, "step": 100480 }, { "epoch": 2.3430736498499023, "grad_norm": 1.1247092485427856, "learning_rate": 2.1929988186283652e-07, "loss": 0.0904, "step": 100490 }, { "epoch": 2.34330681122672, "grad_norm": 1.405175805091858, "learning_rate": 2.1922216004476776e-07, "loss": 0.1009, "step": 100500 }, { "epoch": 2.3435399726035384, "grad_norm": 1.9019596576690674, "learning_rate": 2.19144438226699e-07, "loss": 0.1071, "step": 100510 }, { "epoch": 2.3437731339803562, "grad_norm": 2.0015134811401367, "learning_rate": 2.1906671640863023e-07, "loss": 0.0963, "step": 100520 }, { "epoch": 2.344006295357174, "grad_norm": 1.3627535104751587, "learning_rate": 2.1898899459056144e-07, "loss": 
0.109, "step": 100530 }, { "epoch": 2.344239456733992, "grad_norm": 1.3310445547103882, "learning_rate": 2.189112727724927e-07, "loss": 0.1131, "step": 100540 }, { "epoch": 2.3444726181108098, "grad_norm": 1.3934444189071655, "learning_rate": 2.188335509544239e-07, "loss": 0.0947, "step": 100550 }, { "epoch": 2.344705779487628, "grad_norm": 1.4407118558883667, "learning_rate": 2.1875582913635514e-07, "loss": 0.0952, "step": 100560 }, { "epoch": 2.344938940864446, "grad_norm": 1.0733730792999268, "learning_rate": 2.1867810731828638e-07, "loss": 0.0944, "step": 100570 }, { "epoch": 2.3451721022412637, "grad_norm": 1.4008958339691162, "learning_rate": 2.1860038550021762e-07, "loss": 0.1115, "step": 100580 }, { "epoch": 2.3454052636180815, "grad_norm": 1.2516041994094849, "learning_rate": 2.1852266368214882e-07, "loss": 0.1042, "step": 100590 }, { "epoch": 2.3456384249949, "grad_norm": 2.1482884883880615, "learning_rate": 2.184449418640801e-07, "loss": 0.1009, "step": 100600 }, { "epoch": 2.3458715863717177, "grad_norm": 1.34471595287323, "learning_rate": 2.183672200460113e-07, "loss": 0.1057, "step": 100610 }, { "epoch": 2.3461047477485355, "grad_norm": 1.992635726928711, "learning_rate": 2.1828949822794253e-07, "loss": 0.0972, "step": 100620 }, { "epoch": 2.3463379091253533, "grad_norm": 1.7110234498977661, "learning_rate": 2.1821177640987377e-07, "loss": 0.1089, "step": 100630 }, { "epoch": 2.346571070502171, "grad_norm": 2.264491558074951, "learning_rate": 2.18134054591805e-07, "loss": 0.1031, "step": 100640 }, { "epoch": 2.3468042318789895, "grad_norm": 1.1812025308609009, "learning_rate": 2.180563327737362e-07, "loss": 0.0993, "step": 100650 }, { "epoch": 2.3470373932558073, "grad_norm": 1.7642161846160889, "learning_rate": 2.1797861095566747e-07, "loss": 0.1072, "step": 100660 }, { "epoch": 2.347270554632625, "grad_norm": 1.562690019607544, "learning_rate": 2.1790088913759868e-07, "loss": 0.1009, "step": 100670 }, { "epoch": 2.347503716009443, "grad_norm": 
1.4217063188552856, "learning_rate": 2.1782316731952992e-07, "loss": 0.099, "step": 100680 }, { "epoch": 2.347736877386261, "grad_norm": 1.4529438018798828, "learning_rate": 2.1774544550146116e-07, "loss": 0.0905, "step": 100690 }, { "epoch": 2.347970038763079, "grad_norm": 2.7028274536132812, "learning_rate": 2.176677236833924e-07, "loss": 0.1091, "step": 100700 }, { "epoch": 2.348203200139897, "grad_norm": 1.748874545097351, "learning_rate": 2.1759000186532363e-07, "loss": 0.107, "step": 100710 }, { "epoch": 2.3484363615167148, "grad_norm": 2.3844425678253174, "learning_rate": 2.1751228004725486e-07, "loss": 0.0982, "step": 100720 }, { "epoch": 2.3486695228935326, "grad_norm": 2.6868488788604736, "learning_rate": 2.1743455822918607e-07, "loss": 0.0944, "step": 100730 }, { "epoch": 2.3489026842703504, "grad_norm": 1.8865268230438232, "learning_rate": 2.1735683641111733e-07, "loss": 0.1065, "step": 100740 }, { "epoch": 2.3491358456471687, "grad_norm": 1.639897346496582, "learning_rate": 2.1727911459304854e-07, "loss": 0.1032, "step": 100750 }, { "epoch": 2.3493690070239865, "grad_norm": 1.297350525856018, "learning_rate": 2.1720139277497978e-07, "loss": 0.0976, "step": 100760 }, { "epoch": 2.3496021684008044, "grad_norm": 1.384988784790039, "learning_rate": 2.1712367095691101e-07, "loss": 0.1018, "step": 100770 }, { "epoch": 2.349835329777622, "grad_norm": 1.7696475982666016, "learning_rate": 2.1704594913884225e-07, "loss": 0.1043, "step": 100780 }, { "epoch": 2.35006849115444, "grad_norm": 1.6677327156066895, "learning_rate": 2.1696822732077346e-07, "loss": 0.1012, "step": 100790 }, { "epoch": 2.3503016525312583, "grad_norm": 1.193989634513855, "learning_rate": 2.1689050550270472e-07, "loss": 0.1017, "step": 100800 }, { "epoch": 2.350534813908076, "grad_norm": 3.1760027408599854, "learning_rate": 2.1681278368463593e-07, "loss": 0.098, "step": 100810 }, { "epoch": 2.350767975284894, "grad_norm": 1.6285651922225952, "learning_rate": 2.1673506186656717e-07, "loss": 
0.11, "step": 100820 }, { "epoch": 2.351001136661712, "grad_norm": 1.5994629859924316, "learning_rate": 2.166573400484984e-07, "loss": 0.0975, "step": 100830 }, { "epoch": 2.3512342980385297, "grad_norm": 2.182591438293457, "learning_rate": 2.1657961823042964e-07, "loss": 0.119, "step": 100840 }, { "epoch": 2.351467459415348, "grad_norm": 1.16904616355896, "learning_rate": 2.1650189641236085e-07, "loss": 0.1054, "step": 100850 }, { "epoch": 2.351700620792166, "grad_norm": 1.0744444131851196, "learning_rate": 2.164241745942921e-07, "loss": 0.1089, "step": 100860 }, { "epoch": 2.3519337821689836, "grad_norm": 1.7586299180984497, "learning_rate": 2.1634645277622332e-07, "loss": 0.1009, "step": 100870 }, { "epoch": 2.3521669435458015, "grad_norm": 1.8632413148880005, "learning_rate": 2.1627650313996144e-07, "loss": 0.1041, "step": 100880 }, { "epoch": 2.3524001049226198, "grad_norm": 1.4169807434082031, "learning_rate": 2.1619878132189268e-07, "loss": 0.0939, "step": 100890 }, { "epoch": 2.3526332662994376, "grad_norm": 2.4424490928649902, "learning_rate": 2.161210595038239e-07, "loss": 0.1096, "step": 100900 }, { "epoch": 2.3528664276762554, "grad_norm": 2.3302688598632812, "learning_rate": 2.1604333768575515e-07, "loss": 0.1008, "step": 100910 }, { "epoch": 2.3530995890530733, "grad_norm": 1.2356058359146118, "learning_rate": 2.1596561586768636e-07, "loss": 0.0943, "step": 100920 }, { "epoch": 2.353332750429891, "grad_norm": 1.4713693857192993, "learning_rate": 2.158878940496176e-07, "loss": 0.0988, "step": 100930 }, { "epoch": 2.3535659118067094, "grad_norm": 1.6012758016586304, "learning_rate": 2.1581017223154883e-07, "loss": 0.1159, "step": 100940 }, { "epoch": 2.353799073183527, "grad_norm": 1.8136245012283325, "learning_rate": 2.1573245041348007e-07, "loss": 0.1048, "step": 100950 }, { "epoch": 2.354032234560345, "grad_norm": 2.740220308303833, "learning_rate": 2.1565472859541128e-07, "loss": 0.0997, "step": 100960 }, { "epoch": 2.354265395937163, "grad_norm": 
1.5314172506332397, "learning_rate": 2.1557700677734254e-07, "loss": 0.1081, "step": 100970 }, { "epoch": 2.354498557313981, "grad_norm": 1.6825991868972778, "learning_rate": 2.1549928495927375e-07, "loss": 0.0887, "step": 100980 }, { "epoch": 2.354731718690799, "grad_norm": 1.8294285535812378, "learning_rate": 2.1542156314120498e-07, "loss": 0.0982, "step": 100990 }, { "epoch": 2.354964880067617, "grad_norm": 3.4484102725982666, "learning_rate": 2.1534384132313622e-07, "loss": 0.1023, "step": 101000 }, { "epoch": 2.3551980414444347, "grad_norm": 2.5182549953460693, "learning_rate": 2.1526611950506745e-07, "loss": 0.1001, "step": 101010 }, { "epoch": 2.3554312028212525, "grad_norm": 1.2046751976013184, "learning_rate": 2.1518839768699866e-07, "loss": 0.1118, "step": 101020 }, { "epoch": 2.355664364198071, "grad_norm": 1.759548544883728, "learning_rate": 2.1511067586892993e-07, "loss": 0.1067, "step": 101030 }, { "epoch": 2.3558975255748886, "grad_norm": 1.2839263677597046, "learning_rate": 2.1503295405086113e-07, "loss": 0.0969, "step": 101040 }, { "epoch": 2.3561306869517065, "grad_norm": 2.168030023574829, "learning_rate": 2.1495523223279237e-07, "loss": 0.0984, "step": 101050 }, { "epoch": 2.3563638483285243, "grad_norm": 4.138855457305908, "learning_rate": 2.148775104147236e-07, "loss": 0.0949, "step": 101060 }, { "epoch": 2.356597009705342, "grad_norm": 1.3259270191192627, "learning_rate": 2.1479978859665484e-07, "loss": 0.0964, "step": 101070 }, { "epoch": 2.3568301710821604, "grad_norm": 1.6762804985046387, "learning_rate": 2.1472206677858605e-07, "loss": 0.0978, "step": 101080 }, { "epoch": 2.3570633324589783, "grad_norm": 2.1243841648101807, "learning_rate": 2.146443449605173e-07, "loss": 0.1021, "step": 101090 }, { "epoch": 2.357296493835796, "grad_norm": 1.6091773509979248, "learning_rate": 2.1456662314244852e-07, "loss": 0.1015, "step": 101100 }, { "epoch": 2.357529655212614, "grad_norm": 1.3861531019210815, "learning_rate": 2.1448890132437978e-07, 
"loss": 0.1088, "step": 101110 }, { "epoch": 2.3577628165894318, "grad_norm": 1.0647417306900024, "learning_rate": 2.14411179506311e-07, "loss": 0.0989, "step": 101120 }, { "epoch": 2.35799597796625, "grad_norm": 1.8527023792266846, "learning_rate": 2.1433345768824223e-07, "loss": 0.1011, "step": 101130 }, { "epoch": 2.358229139343068, "grad_norm": 1.3623772859573364, "learning_rate": 2.1425573587017347e-07, "loss": 0.114, "step": 101140 }, { "epoch": 2.3584623007198857, "grad_norm": 1.9919404983520508, "learning_rate": 2.141780140521047e-07, "loss": 0.1086, "step": 101150 }, { "epoch": 2.3586954620967036, "grad_norm": 1.4311001300811768, "learning_rate": 2.141002922340359e-07, "loss": 0.0974, "step": 101160 }, { "epoch": 2.3589286234735214, "grad_norm": 2.5960042476654053, "learning_rate": 2.1402257041596717e-07, "loss": 0.1015, "step": 101170 }, { "epoch": 2.3591617848503397, "grad_norm": 2.3553221225738525, "learning_rate": 2.1395262077970527e-07, "loss": 0.1114, "step": 101180 }, { "epoch": 2.3593949462271575, "grad_norm": 0.9831893444061279, "learning_rate": 2.1387489896163648e-07, "loss": 0.0954, "step": 101190 }, { "epoch": 2.3596281076039753, "grad_norm": 1.38617742061615, "learning_rate": 2.1379717714356774e-07, "loss": 0.1047, "step": 101200 }, { "epoch": 2.359861268980793, "grad_norm": 1.9498363733291626, "learning_rate": 2.1371945532549895e-07, "loss": 0.1029, "step": 101210 }, { "epoch": 2.360094430357611, "grad_norm": 1.5926377773284912, "learning_rate": 2.136417335074302e-07, "loss": 0.1009, "step": 101220 }, { "epoch": 2.3603275917344293, "grad_norm": 2.281514883041382, "learning_rate": 2.1356401168936142e-07, "loss": 0.1064, "step": 101230 }, { "epoch": 2.360560753111247, "grad_norm": 1.34975004196167, "learning_rate": 2.1348628987129266e-07, "loss": 0.1002, "step": 101240 }, { "epoch": 2.360793914488065, "grad_norm": 1.416980266571045, "learning_rate": 2.1340856805322387e-07, "loss": 0.1009, "step": 101250 }, { "epoch": 2.361027075864883, 
"grad_norm": 1.8230170011520386, "learning_rate": 2.1333084623515513e-07, "loss": 0.1054, "step": 101260 }, { "epoch": 2.361260237241701, "grad_norm": 3.872211456298828, "learning_rate": 2.1325312441708634e-07, "loss": 0.0959, "step": 101270 }, { "epoch": 2.361493398618519, "grad_norm": 2.0932257175445557, "learning_rate": 2.131754025990176e-07, "loss": 0.0939, "step": 101280 }, { "epoch": 2.3617265599953368, "grad_norm": 1.7012168169021606, "learning_rate": 2.130976807809488e-07, "loss": 0.1046, "step": 101290 }, { "epoch": 2.3619597213721546, "grad_norm": 1.2491503953933716, "learning_rate": 2.1301995896288005e-07, "loss": 0.1033, "step": 101300 }, { "epoch": 2.362192882748973, "grad_norm": 1.167375922203064, "learning_rate": 2.1294223714481128e-07, "loss": 0.1016, "step": 101310 }, { "epoch": 2.3624260441257907, "grad_norm": 1.1424082517623901, "learning_rate": 2.1286451532674252e-07, "loss": 0.1029, "step": 101320 }, { "epoch": 2.3626592055026086, "grad_norm": 2.147183895111084, "learning_rate": 2.1278679350867373e-07, "loss": 0.0926, "step": 101330 }, { "epoch": 2.3628923668794264, "grad_norm": 1.0920416116714478, "learning_rate": 2.12709071690605e-07, "loss": 0.1002, "step": 101340 }, { "epoch": 2.3631255282562442, "grad_norm": 1.5608298778533936, "learning_rate": 2.126313498725362e-07, "loss": 0.1038, "step": 101350 }, { "epoch": 2.3633586896330625, "grad_norm": 1.281748652458191, "learning_rate": 2.1255362805446743e-07, "loss": 0.1016, "step": 101360 }, { "epoch": 2.3635918510098803, "grad_norm": 1.3267227411270142, "learning_rate": 2.1247590623639867e-07, "loss": 0.1133, "step": 101370 }, { "epoch": 2.363825012386698, "grad_norm": 1.0494627952575684, "learning_rate": 2.123981844183299e-07, "loss": 0.1051, "step": 101380 }, { "epoch": 2.364058173763516, "grad_norm": 2.247100830078125, "learning_rate": 2.1232046260026111e-07, "loss": 0.1061, "step": 101390 }, { "epoch": 2.364291335140334, "grad_norm": 1.1702213287353516, "learning_rate": 
2.1224274078219238e-07, "loss": 0.0797, "step": 101400 }, { "epoch": 2.364524496517152, "grad_norm": 1.4014906883239746, "learning_rate": 2.1216501896412359e-07, "loss": 0.0898, "step": 101410 }, { "epoch": 2.36475765789397, "grad_norm": 2.2488534450531006, "learning_rate": 2.1208729714605482e-07, "loss": 0.1096, "step": 101420 }, { "epoch": 2.364990819270788, "grad_norm": 1.5351308584213257, "learning_rate": 2.1200957532798606e-07, "loss": 0.1118, "step": 101430 }, { "epoch": 2.3652239806476056, "grad_norm": 1.6907505989074707, "learning_rate": 2.119318535099173e-07, "loss": 0.1066, "step": 101440 }, { "epoch": 2.3654571420244235, "grad_norm": 1.67182195186615, "learning_rate": 2.118541316918485e-07, "loss": 0.1154, "step": 101450 }, { "epoch": 2.3656903034012418, "grad_norm": 2.0723111629486084, "learning_rate": 2.1177640987377976e-07, "loss": 0.0957, "step": 101460 }, { "epoch": 2.3659234647780596, "grad_norm": 2.7811639308929443, "learning_rate": 2.1169868805571097e-07, "loss": 0.1058, "step": 101470 }, { "epoch": 2.3661566261548774, "grad_norm": 1.136858344078064, "learning_rate": 2.1162096623764224e-07, "loss": 0.1028, "step": 101480 }, { "epoch": 2.3663897875316953, "grad_norm": 1.423831582069397, "learning_rate": 2.1154324441957344e-07, "loss": 0.1064, "step": 101490 }, { "epoch": 2.366622948908513, "grad_norm": 1.6382944583892822, "learning_rate": 2.1146552260150468e-07, "loss": 0.0994, "step": 101500 }, { "epoch": 2.3668561102853314, "grad_norm": 1.748844027519226, "learning_rate": 2.1138780078343592e-07, "loss": 0.0987, "step": 101510 }, { "epoch": 2.367089271662149, "grad_norm": 1.7487459182739258, "learning_rate": 2.1131007896536715e-07, "loss": 0.0945, "step": 101520 }, { "epoch": 2.367322433038967, "grad_norm": 2.0715179443359375, "learning_rate": 2.1123235714729836e-07, "loss": 0.1118, "step": 101530 }, { "epoch": 2.367555594415785, "grad_norm": 1.6565589904785156, "learning_rate": 2.1115463532922962e-07, "loss": 0.1134, "step": 101540 }, { "epoch": 
2.3677887557926027, "grad_norm": 2.0706405639648438, "learning_rate": 2.1107691351116083e-07, "loss": 0.1092, "step": 101550 }, { "epoch": 2.368021917169421, "grad_norm": 1.0571507215499878, "learning_rate": 2.1099919169309207e-07, "loss": 0.0948, "step": 101560 }, { "epoch": 2.368255078546239, "grad_norm": 1.8616479635238647, "learning_rate": 2.109214698750233e-07, "loss": 0.1037, "step": 101570 }, { "epoch": 2.3684882399230567, "grad_norm": 1.532515525817871, "learning_rate": 2.1084374805695454e-07, "loss": 0.0989, "step": 101580 }, { "epoch": 2.3687214012998745, "grad_norm": 1.4354971647262573, "learning_rate": 2.1076602623888575e-07, "loss": 0.1091, "step": 101590 }, { "epoch": 2.368954562676693, "grad_norm": 2.7268998622894287, "learning_rate": 2.10688304420817e-07, "loss": 0.1135, "step": 101600 }, { "epoch": 2.3691877240535106, "grad_norm": 1.2873108386993408, "learning_rate": 2.1061058260274822e-07, "loss": 0.1134, "step": 101610 }, { "epoch": 2.3694208854303285, "grad_norm": 1.8813443183898926, "learning_rate": 2.1053286078467946e-07, "loss": 0.0969, "step": 101620 }, { "epoch": 2.3696540468071463, "grad_norm": 1.5251121520996094, "learning_rate": 2.104551389666107e-07, "loss": 0.1031, "step": 101630 }, { "epoch": 2.369887208183964, "grad_norm": 1.1099570989608765, "learning_rate": 2.1037741714854193e-07, "loss": 0.1055, "step": 101640 }, { "epoch": 2.3701203695607824, "grad_norm": 2.0937576293945312, "learning_rate": 2.1029969533047314e-07, "loss": 0.1085, "step": 101650 }, { "epoch": 2.3703535309376003, "grad_norm": 1.315495252609253, "learning_rate": 2.102219735124044e-07, "loss": 0.1033, "step": 101660 }, { "epoch": 2.370586692314418, "grad_norm": 3.7196292877197266, "learning_rate": 2.101442516943356e-07, "loss": 0.0988, "step": 101670 }, { "epoch": 2.370819853691236, "grad_norm": 1.366057276725769, "learning_rate": 2.1006652987626687e-07, "loss": 0.1066, "step": 101680 }, { "epoch": 2.371053015068054, "grad_norm": 1.7929296493530273, "learning_rate": 
2.0998880805819808e-07, "loss": 0.1024, "step": 101690 }, { "epoch": 2.371286176444872, "grad_norm": 4.65559720993042, "learning_rate": 2.0991108624012931e-07, "loss": 0.0987, "step": 101700 }, { "epoch": 2.37151933782169, "grad_norm": 2.230701208114624, "learning_rate": 2.0983336442206055e-07, "loss": 0.1085, "step": 101710 }, { "epoch": 2.3717524991985077, "grad_norm": 2.543775796890259, "learning_rate": 2.0975564260399179e-07, "loss": 0.1041, "step": 101720 }, { "epoch": 2.3719856605753256, "grad_norm": 1.4851399660110474, "learning_rate": 2.09677920785923e-07, "loss": 0.1019, "step": 101730 }, { "epoch": 2.372218821952144, "grad_norm": 1.676386833190918, "learning_rate": 2.0960019896785426e-07, "loss": 0.107, "step": 101740 }, { "epoch": 2.3724519833289617, "grad_norm": 3.0153167247772217, "learning_rate": 2.0952247714978547e-07, "loss": 0.1101, "step": 101750 }, { "epoch": 2.3726851447057795, "grad_norm": 1.4239646196365356, "learning_rate": 2.094447553317167e-07, "loss": 0.0962, "step": 101760 }, { "epoch": 2.3729183060825974, "grad_norm": 2.5204246044158936, "learning_rate": 2.0936703351364794e-07, "loss": 0.1063, "step": 101770 }, { "epoch": 2.373151467459415, "grad_norm": 1.0361069440841675, "learning_rate": 2.0928931169557917e-07, "loss": 0.1015, "step": 101780 }, { "epoch": 2.3733846288362335, "grad_norm": 2.6483144760131836, "learning_rate": 2.0921158987751038e-07, "loss": 0.0971, "step": 101790 }, { "epoch": 2.3736177902130513, "grad_norm": 1.9142524003982544, "learning_rate": 2.0913386805944164e-07, "loss": 0.1059, "step": 101800 }, { "epoch": 2.373850951589869, "grad_norm": 3.462745189666748, "learning_rate": 2.0905614624137285e-07, "loss": 0.1107, "step": 101810 }, { "epoch": 2.374084112966687, "grad_norm": 1.4098381996154785, "learning_rate": 2.089784244233041e-07, "loss": 0.097, "step": 101820 }, { "epoch": 2.374317274343505, "grad_norm": 1.0165280103683472, "learning_rate": 2.0890070260523535e-07, "loss": 0.0979, "step": 101830 }, { "epoch": 
2.374550435720323, "grad_norm": 1.336073637008667, "learning_rate": 2.0882298078716656e-07, "loss": 0.1144, "step": 101840 }, { "epoch": 2.374783597097141, "grad_norm": 1.5604580640792847, "learning_rate": 2.087452589690978e-07, "loss": 0.1048, "step": 101850 }, { "epoch": 2.3750167584739588, "grad_norm": 1.237844705581665, "learning_rate": 2.0866753715102903e-07, "loss": 0.0916, "step": 101860 }, { "epoch": 2.3752499198507766, "grad_norm": 1.8222131729125977, "learning_rate": 2.0858981533296027e-07, "loss": 0.0925, "step": 101870 }, { "epoch": 2.3754830812275944, "grad_norm": 3.4354069232940674, "learning_rate": 2.0851209351489148e-07, "loss": 0.1071, "step": 101880 }, { "epoch": 2.3757162426044127, "grad_norm": 1.2595961093902588, "learning_rate": 2.0843437169682274e-07, "loss": 0.1052, "step": 101890 }, { "epoch": 2.3759494039812306, "grad_norm": 1.204650640487671, "learning_rate": 2.0835664987875395e-07, "loss": 0.1002, "step": 101900 }, { "epoch": 2.3761825653580484, "grad_norm": 2.484750747680664, "learning_rate": 2.082789280606852e-07, "loss": 0.0987, "step": 101910 }, { "epoch": 2.3764157267348662, "grad_norm": 2.415231704711914, "learning_rate": 2.0820120624261642e-07, "loss": 0.1088, "step": 101920 }, { "epoch": 2.376648888111684, "grad_norm": 1.416730284690857, "learning_rate": 2.0812348442454766e-07, "loss": 0.1061, "step": 101930 }, { "epoch": 2.3768820494885023, "grad_norm": 1.0960116386413574, "learning_rate": 2.080457626064789e-07, "loss": 0.1132, "step": 101940 }, { "epoch": 2.37711521086532, "grad_norm": 1.3136463165283203, "learning_rate": 2.0796804078841013e-07, "loss": 0.097, "step": 101950 }, { "epoch": 2.377348372242138, "grad_norm": 1.2705426216125488, "learning_rate": 2.0789031897034134e-07, "loss": 0.1034, "step": 101960 }, { "epoch": 2.377581533618956, "grad_norm": 2.080660820007324, "learning_rate": 2.078125971522726e-07, "loss": 0.1154, "step": 101970 }, { "epoch": 2.377814694995774, "grad_norm": 1.6267520189285278, "learning_rate": 
2.077348753342038e-07, "loss": 0.1007, "step": 101980 }, { "epoch": 2.378047856372592, "grad_norm": 1.2424101829528809, "learning_rate": 2.0765715351613504e-07, "loss": 0.0952, "step": 101990 }, { "epoch": 2.37828101774941, "grad_norm": 1.2770073413848877, "learning_rate": 2.0757943169806628e-07, "loss": 0.0936, "step": 102000 }, { "epoch": 2.3785141791262276, "grad_norm": 2.44752836227417, "learning_rate": 2.0750170987999751e-07, "loss": 0.1058, "step": 102010 }, { "epoch": 2.3787473405030455, "grad_norm": 1.3818353414535522, "learning_rate": 2.0742398806192872e-07, "loss": 0.1033, "step": 102020 }, { "epoch": 2.3789805018798638, "grad_norm": 2.617753028869629, "learning_rate": 2.0734626624385999e-07, "loss": 0.0984, "step": 102030 }, { "epoch": 2.3792136632566816, "grad_norm": 1.392640233039856, "learning_rate": 2.072685444257912e-07, "loss": 0.0954, "step": 102040 }, { "epoch": 2.3794468246334994, "grad_norm": 1.6956342458724976, "learning_rate": 2.0719082260772243e-07, "loss": 0.0947, "step": 102050 }, { "epoch": 2.3796799860103173, "grad_norm": 1.907375454902649, "learning_rate": 2.0711310078965367e-07, "loss": 0.092, "step": 102060 }, { "epoch": 2.3799131473871356, "grad_norm": 1.5959190130233765, "learning_rate": 2.070353789715849e-07, "loss": 0.1079, "step": 102070 }, { "epoch": 2.3801463087639534, "grad_norm": 2.0934860706329346, "learning_rate": 2.069576571535161e-07, "loss": 0.1091, "step": 102080 }, { "epoch": 2.3803794701407712, "grad_norm": 1.879046082496643, "learning_rate": 2.0687993533544737e-07, "loss": 0.1071, "step": 102090 }, { "epoch": 2.380612631517589, "grad_norm": 2.9809086322784424, "learning_rate": 2.0680221351737858e-07, "loss": 0.1042, "step": 102100 }, { "epoch": 2.380845792894407, "grad_norm": 1.4110522270202637, "learning_rate": 2.0672449169930985e-07, "loss": 0.1029, "step": 102110 }, { "epoch": 2.381078954271225, "grad_norm": 2.1043920516967773, "learning_rate": 2.0664676988124105e-07, "loss": 0.1049, "step": 102120 }, { "epoch": 
2.381312115648043, "grad_norm": 1.903133749961853, "learning_rate": 2.065690480631723e-07, "loss": 0.1034, "step": 102130 }, { "epoch": 2.381545277024861, "grad_norm": 1.1572940349578857, "learning_rate": 2.0649132624510353e-07, "loss": 0.1063, "step": 102140 }, { "epoch": 2.3817784384016787, "grad_norm": 2.471262216567993, "learning_rate": 2.0641360442703476e-07, "loss": 0.1114, "step": 102150 }, { "epoch": 2.3820115997784965, "grad_norm": 1.1855467557907104, "learning_rate": 2.0633588260896597e-07, "loss": 0.0897, "step": 102160 }, { "epoch": 2.382244761155315, "grad_norm": 1.2797257900238037, "learning_rate": 2.0625816079089723e-07, "loss": 0.0945, "step": 102170 }, { "epoch": 2.3824779225321326, "grad_norm": 1.2673914432525635, "learning_rate": 2.0618043897282844e-07, "loss": 0.1006, "step": 102180 }, { "epoch": 2.3827110839089505, "grad_norm": 2.5444493293762207, "learning_rate": 2.0610271715475968e-07, "loss": 0.1057, "step": 102190 }, { "epoch": 2.3829442452857683, "grad_norm": 1.193790078163147, "learning_rate": 2.0602499533669091e-07, "loss": 0.0945, "step": 102200 }, { "epoch": 2.383177406662586, "grad_norm": 2.003862142562866, "learning_rate": 2.0594727351862215e-07, "loss": 0.0893, "step": 102210 }, { "epoch": 2.3834105680394044, "grad_norm": 1.8548946380615234, "learning_rate": 2.0586955170055336e-07, "loss": 0.0968, "step": 102220 }, { "epoch": 2.3836437294162223, "grad_norm": 1.8597134351730347, "learning_rate": 2.0579182988248462e-07, "loss": 0.1092, "step": 102230 }, { "epoch": 2.38387689079304, "grad_norm": 3.79467511177063, "learning_rate": 2.0571410806441583e-07, "loss": 0.1021, "step": 102240 }, { "epoch": 2.384110052169858, "grad_norm": 3.5924630165100098, "learning_rate": 2.0563638624634707e-07, "loss": 0.1106, "step": 102250 }, { "epoch": 2.3843432135466758, "grad_norm": 1.3060038089752197, "learning_rate": 2.055586644282783e-07, "loss": 0.1085, "step": 102260 }, { "epoch": 2.384576374923494, "grad_norm": 2.836244583129883, "learning_rate": 
2.0548094261020954e-07, "loss": 0.1043, "step": 102270 }, { "epoch": 2.384809536300312, "grad_norm": 1.4656550884246826, "learning_rate": 2.0540322079214075e-07, "loss": 0.1028, "step": 102280 }, { "epoch": 2.3850426976771297, "grad_norm": 1.2270022630691528, "learning_rate": 2.05325498974072e-07, "loss": 0.1054, "step": 102290 }, { "epoch": 2.3852758590539476, "grad_norm": 3.423114538192749, "learning_rate": 2.0524777715600322e-07, "loss": 0.1037, "step": 102300 }, { "epoch": 2.3855090204307654, "grad_norm": 1.1920007467269897, "learning_rate": 2.0517005533793448e-07, "loss": 0.1016, "step": 102310 }, { "epoch": 2.3857421818075837, "grad_norm": 3.734433889389038, "learning_rate": 2.050923335198657e-07, "loss": 0.1075, "step": 102320 }, { "epoch": 2.3859753431844015, "grad_norm": 1.1610115766525269, "learning_rate": 2.0501461170179692e-07, "loss": 0.1081, "step": 102330 }, { "epoch": 2.3862085045612194, "grad_norm": 1.5654850006103516, "learning_rate": 2.0493688988372816e-07, "loss": 0.1097, "step": 102340 }, { "epoch": 2.386441665938037, "grad_norm": 1.2617818117141724, "learning_rate": 2.048591680656594e-07, "loss": 0.0976, "step": 102350 }, { "epoch": 2.3866748273148555, "grad_norm": 2.2945120334625244, "learning_rate": 2.047814462475906e-07, "loss": 0.1068, "step": 102360 }, { "epoch": 2.3869079886916733, "grad_norm": 2.8695383071899414, "learning_rate": 2.0470372442952187e-07, "loss": 0.1026, "step": 102370 }, { "epoch": 2.387141150068491, "grad_norm": 1.1878304481506348, "learning_rate": 2.0462600261145308e-07, "loss": 0.1096, "step": 102380 }, { "epoch": 2.387374311445309, "grad_norm": 2.6521546840667725, "learning_rate": 2.045482807933843e-07, "loss": 0.1019, "step": 102390 }, { "epoch": 2.3876074728221273, "grad_norm": 1.3678109645843506, "learning_rate": 2.0447055897531555e-07, "loss": 0.0982, "step": 102400 }, { "epoch": 2.387840634198945, "grad_norm": 1.562217116355896, "learning_rate": 2.0439283715724678e-07, "loss": 0.1112, "step": 102410 }, { 
"epoch": 2.388073795575763, "grad_norm": 1.113763451576233, "learning_rate": 2.04315115339178e-07, "loss": 0.0913, "step": 102420 }, { "epoch": 2.3883069569525808, "grad_norm": 2.003788948059082, "learning_rate": 2.0423739352110925e-07, "loss": 0.1006, "step": 102430 }, { "epoch": 2.3885401183293986, "grad_norm": 1.471284031867981, "learning_rate": 2.0415967170304046e-07, "loss": 0.1078, "step": 102440 }, { "epoch": 2.388773279706217, "grad_norm": 2.1024274826049805, "learning_rate": 2.040819498849717e-07, "loss": 0.0992, "step": 102450 }, { "epoch": 2.3890064410830347, "grad_norm": 2.727349281311035, "learning_rate": 2.0400422806690294e-07, "loss": 0.1035, "step": 102460 }, { "epoch": 2.3892396024598526, "grad_norm": 1.6113497018814087, "learning_rate": 2.0392650624883417e-07, "loss": 0.1085, "step": 102470 }, { "epoch": 2.3894727638366704, "grad_norm": 1.7737196683883667, "learning_rate": 2.0384878443076538e-07, "loss": 0.1058, "step": 102480 }, { "epoch": 2.3897059252134882, "grad_norm": 1.3564505577087402, "learning_rate": 2.0377106261269664e-07, "loss": 0.0949, "step": 102490 }, { "epoch": 2.3899390865903065, "grad_norm": 2.8157193660736084, "learning_rate": 2.0369334079462785e-07, "loss": 0.1037, "step": 102500 }, { "epoch": 2.3901722479671244, "grad_norm": 1.5596837997436523, "learning_rate": 2.0361561897655911e-07, "loss": 0.0953, "step": 102510 }, { "epoch": 2.390405409343942, "grad_norm": 1.3156300783157349, "learning_rate": 2.0353789715849032e-07, "loss": 0.093, "step": 102520 }, { "epoch": 2.39063857072076, "grad_norm": 1.888805866241455, "learning_rate": 2.0346017534042156e-07, "loss": 0.1057, "step": 102530 }, { "epoch": 2.390871732097578, "grad_norm": 1.2869672775268555, "learning_rate": 2.033824535223528e-07, "loss": 0.1018, "step": 102540 }, { "epoch": 2.391104893474396, "grad_norm": 1.240982174873352, "learning_rate": 2.0330473170428403e-07, "loss": 0.1073, "step": 102550 }, { "epoch": 2.391338054851214, "grad_norm": 2.314065933227539, 
"learning_rate": 2.0322700988621524e-07, "loss": 0.1168, "step": 102560 }, { "epoch": 2.391571216228032, "grad_norm": 1.3886852264404297, "learning_rate": 2.031492880681465e-07, "loss": 0.0928, "step": 102570 }, { "epoch": 2.3918043776048497, "grad_norm": 1.3034546375274658, "learning_rate": 2.030715662500777e-07, "loss": 0.1106, "step": 102580 }, { "epoch": 2.3920375389816675, "grad_norm": 1.7667255401611328, "learning_rate": 2.0299384443200895e-07, "loss": 0.0946, "step": 102590 }, { "epoch": 2.3922707003584858, "grad_norm": 1.2046360969543457, "learning_rate": 2.0291612261394018e-07, "loss": 0.0962, "step": 102600 }, { "epoch": 2.3925038617353036, "grad_norm": 2.07466459274292, "learning_rate": 2.0283840079587142e-07, "loss": 0.1098, "step": 102610 }, { "epoch": 2.3927370231121214, "grad_norm": 2.4410581588745117, "learning_rate": 2.0276067897780263e-07, "loss": 0.0988, "step": 102620 }, { "epoch": 2.3929701844889393, "grad_norm": 1.2035454511642456, "learning_rate": 2.026829571597339e-07, "loss": 0.1039, "step": 102630 }, { "epoch": 2.393203345865757, "grad_norm": 1.4950917959213257, "learning_rate": 2.026052353416651e-07, "loss": 0.1087, "step": 102640 }, { "epoch": 2.3934365072425754, "grad_norm": 3.604166030883789, "learning_rate": 2.0252751352359633e-07, "loss": 0.1134, "step": 102650 }, { "epoch": 2.3936696686193932, "grad_norm": 1.4745153188705444, "learning_rate": 2.0244979170552757e-07, "loss": 0.1044, "step": 102660 }, { "epoch": 2.393902829996211, "grad_norm": 1.318931221961975, "learning_rate": 2.023720698874588e-07, "loss": 0.104, "step": 102670 }, { "epoch": 2.394135991373029, "grad_norm": 1.453462839126587, "learning_rate": 2.0229434806939001e-07, "loss": 0.1017, "step": 102680 }, { "epoch": 2.3943691527498467, "grad_norm": 1.762881875038147, "learning_rate": 2.0221662625132128e-07, "loss": 0.0918, "step": 102690 }, { "epoch": 2.394602314126665, "grad_norm": 2.9054675102233887, "learning_rate": 2.0213890443325249e-07, "loss": 0.1079, "step": 
102700 }, { "epoch": 2.394835475503483, "grad_norm": 1.3057806491851807, "learning_rate": 2.0206118261518372e-07, "loss": 0.1054, "step": 102710 }, { "epoch": 2.3950686368803007, "grad_norm": 1.291699767112732, "learning_rate": 2.0198346079711496e-07, "loss": 0.1006, "step": 102720 }, { "epoch": 2.3953017982571185, "grad_norm": 1.2850184440612793, "learning_rate": 2.019057389790462e-07, "loss": 0.1195, "step": 102730 }, { "epoch": 2.395534959633937, "grad_norm": 2.1973772048950195, "learning_rate": 2.0182801716097743e-07, "loss": 0.1089, "step": 102740 }, { "epoch": 2.3957681210107546, "grad_norm": 3.881042718887329, "learning_rate": 2.0175029534290866e-07, "loss": 0.1099, "step": 102750 }, { "epoch": 2.3960012823875725, "grad_norm": 1.2244915962219238, "learning_rate": 2.0167257352483987e-07, "loss": 0.1053, "step": 102760 }, { "epoch": 2.3962344437643903, "grad_norm": 1.2795888185501099, "learning_rate": 2.0159485170677114e-07, "loss": 0.1055, "step": 102770 }, { "epoch": 2.3964676051412086, "grad_norm": 2.0474932193756104, "learning_rate": 2.0151712988870234e-07, "loss": 0.1027, "step": 102780 }, { "epoch": 2.3967007665180264, "grad_norm": 1.2183738946914673, "learning_rate": 2.0143940807063358e-07, "loss": 0.1106, "step": 102790 }, { "epoch": 2.3969339278948443, "grad_norm": 2.147430419921875, "learning_rate": 2.0136168625256482e-07, "loss": 0.1047, "step": 102800 }, { "epoch": 2.397167089271662, "grad_norm": 1.6496816873550415, "learning_rate": 2.0128396443449605e-07, "loss": 0.1112, "step": 102810 }, { "epoch": 2.39740025064848, "grad_norm": 2.5710740089416504, "learning_rate": 2.0120624261642726e-07, "loss": 0.1071, "step": 102820 }, { "epoch": 2.3976334120252982, "grad_norm": 2.521557092666626, "learning_rate": 2.0112852079835852e-07, "loss": 0.1019, "step": 102830 }, { "epoch": 2.397866573402116, "grad_norm": 4.622973442077637, "learning_rate": 2.0105079898028973e-07, "loss": 0.1107, "step": 102840 }, { "epoch": 2.398099734778934, "grad_norm": 
1.087490439414978, "learning_rate": 2.0097307716222097e-07, "loss": 0.1022, "step": 102850 }, { "epoch": 2.3983328961557517, "grad_norm": 1.3567564487457275, "learning_rate": 2.008953553441522e-07, "loss": 0.1182, "step": 102860 }, { "epoch": 2.3985660575325696, "grad_norm": 1.2302464246749878, "learning_rate": 2.0081763352608344e-07, "loss": 0.1113, "step": 102870 }, { "epoch": 2.398799218909388, "grad_norm": 1.2905476093292236, "learning_rate": 2.0073991170801465e-07, "loss": 0.0986, "step": 102880 }, { "epoch": 2.3990323802862057, "grad_norm": 3.118990182876587, "learning_rate": 2.006621898899459e-07, "loss": 0.0984, "step": 102890 }, { "epoch": 2.3992655416630235, "grad_norm": 1.9112799167633057, "learning_rate": 2.0058446807187712e-07, "loss": 0.1051, "step": 102900 }, { "epoch": 2.3994987030398414, "grad_norm": 3.3173000812530518, "learning_rate": 2.0050674625380836e-07, "loss": 0.1111, "step": 102910 }, { "epoch": 2.399731864416659, "grad_norm": 1.6846626996994019, "learning_rate": 2.004290244357396e-07, "loss": 0.1027, "step": 102920 }, { "epoch": 2.3999650257934775, "grad_norm": 1.719421148300171, "learning_rate": 2.0035130261767083e-07, "loss": 0.1079, "step": 102930 }, { "epoch": 2.4001981871702953, "grad_norm": 1.6295877695083618, "learning_rate": 2.0027358079960204e-07, "loss": 0.1034, "step": 102940 }, { "epoch": 2.400431348547113, "grad_norm": 1.5794651508331299, "learning_rate": 2.001958589815333e-07, "loss": 0.1078, "step": 102950 }, { "epoch": 2.400664509923931, "grad_norm": 1.0043163299560547, "learning_rate": 2.001181371634645e-07, "loss": 0.1194, "step": 102960 }, { "epoch": 2.400897671300749, "grad_norm": 1.3015230894088745, "learning_rate": 2.0004041534539577e-07, "loss": 0.0908, "step": 102970 }, { "epoch": 2.401130832677567, "grad_norm": 2.0912413597106934, "learning_rate": 1.9996269352732698e-07, "loss": 0.1024, "step": 102980 }, { "epoch": 2.401363994054385, "grad_norm": 1.7677942514419556, "learning_rate": 1.9988497170925821e-07, "loss": 
0.1058, "step": 102990 }, { "epoch": 2.401597155431203, "grad_norm": 1.5541199445724487, "learning_rate": 1.9980724989118945e-07, "loss": 0.106, "step": 103000 }, { "epoch": 2.4018303168080206, "grad_norm": 1.4463114738464355, "learning_rate": 1.9972952807312069e-07, "loss": 0.1078, "step": 103010 }, { "epoch": 2.4020634781848385, "grad_norm": 1.6128922700881958, "learning_rate": 1.996518062550519e-07, "loss": 0.0966, "step": 103020 }, { "epoch": 2.4022966395616567, "grad_norm": 2.1250381469726562, "learning_rate": 1.9957408443698316e-07, "loss": 0.1121, "step": 103030 }, { "epoch": 2.4025298009384746, "grad_norm": 4.1598219871521, "learning_rate": 1.9949636261891437e-07, "loss": 0.1098, "step": 103040 }, { "epoch": 2.4027629623152924, "grad_norm": 1.4470134973526, "learning_rate": 1.994186408008456e-07, "loss": 0.1023, "step": 103050 }, { "epoch": 2.4029961236921102, "grad_norm": 2.619070053100586, "learning_rate": 1.9934091898277684e-07, "loss": 0.098, "step": 103060 }, { "epoch": 2.4032292850689285, "grad_norm": 2.5656347274780273, "learning_rate": 1.9926319716470807e-07, "loss": 0.1027, "step": 103070 }, { "epoch": 2.4034624464457464, "grad_norm": 1.2736796140670776, "learning_rate": 1.9918547534663928e-07, "loss": 0.1083, "step": 103080 }, { "epoch": 2.403695607822564, "grad_norm": 1.94377863407135, "learning_rate": 1.9910775352857054e-07, "loss": 0.1016, "step": 103090 }, { "epoch": 2.403928769199382, "grad_norm": 1.5479533672332764, "learning_rate": 1.9903003171050175e-07, "loss": 0.1048, "step": 103100 }, { "epoch": 2.4041619305762, "grad_norm": 1.0916352272033691, "learning_rate": 1.98952309892433e-07, "loss": 0.1044, "step": 103110 }, { "epoch": 2.404395091953018, "grad_norm": 1.754773736000061, "learning_rate": 1.9887458807436423e-07, "loss": 0.0972, "step": 103120 }, { "epoch": 2.404628253329836, "grad_norm": 1.3165016174316406, "learning_rate": 1.9879686625629546e-07, "loss": 0.106, "step": 103130 }, { "epoch": 2.404861414706654, "grad_norm": 
1.8873814344406128, "learning_rate": 1.9871914443822667e-07, "loss": 0.1084, "step": 103140 }, { "epoch": 2.4050945760834717, "grad_norm": 1.2088958024978638, "learning_rate": 1.9864142262015793e-07, "loss": 0.1051, "step": 103150 }, { "epoch": 2.40532773746029, "grad_norm": 1.7261347770690918, "learning_rate": 1.9856370080208914e-07, "loss": 0.0947, "step": 103160 }, { "epoch": 2.4055608988371078, "grad_norm": 1.1169558763504028, "learning_rate": 1.984859789840204e-07, "loss": 0.1013, "step": 103170 }, { "epoch": 2.4057940602139256, "grad_norm": 1.8925426006317139, "learning_rate": 1.984082571659516e-07, "loss": 0.1044, "step": 103180 }, { "epoch": 2.4060272215907434, "grad_norm": 1.557761788368225, "learning_rate": 1.9833053534788285e-07, "loss": 0.107, "step": 103190 }, { "epoch": 2.4062603829675613, "grad_norm": 1.7320226430892944, "learning_rate": 1.9825281352981408e-07, "loss": 0.1026, "step": 103200 }, { "epoch": 2.4064935443443796, "grad_norm": 1.652962327003479, "learning_rate": 1.9817509171174532e-07, "loss": 0.0989, "step": 103210 }, { "epoch": 2.4067267057211974, "grad_norm": 1.4171667098999023, "learning_rate": 1.9809736989367653e-07, "loss": 0.0963, "step": 103220 }, { "epoch": 2.4069598670980152, "grad_norm": 1.7606849670410156, "learning_rate": 1.980196480756078e-07, "loss": 0.115, "step": 103230 }, { "epoch": 2.407193028474833, "grad_norm": 2.4437060356140137, "learning_rate": 1.97941926257539e-07, "loss": 0.1066, "step": 103240 }, { "epoch": 2.407426189851651, "grad_norm": 2.9594650268554688, "learning_rate": 1.9786420443947024e-07, "loss": 0.1031, "step": 103250 }, { "epoch": 2.407659351228469, "grad_norm": 2.577171564102173, "learning_rate": 1.9778648262140147e-07, "loss": 0.1022, "step": 103260 }, { "epoch": 2.407892512605287, "grad_norm": 1.299790382385254, "learning_rate": 1.977087608033327e-07, "loss": 0.1025, "step": 103270 }, { "epoch": 2.408125673982105, "grad_norm": 1.5718237161636353, "learning_rate": 1.9763103898526392e-07, "loss": 
0.1115, "step": 103280 }, { "epoch": 2.4083588353589227, "grad_norm": 2.2438647747039795, "learning_rate": 1.9755331716719518e-07, "loss": 0.1024, "step": 103290 }, { "epoch": 2.4085919967357405, "grad_norm": 3.9184534549713135, "learning_rate": 1.974755953491264e-07, "loss": 0.1122, "step": 103300 }, { "epoch": 2.408825158112559, "grad_norm": 1.2451015710830688, "learning_rate": 1.9739787353105762e-07, "loss": 0.1031, "step": 103310 }, { "epoch": 2.4090583194893767, "grad_norm": 2.0550589561462402, "learning_rate": 1.9732015171298886e-07, "loss": 0.0903, "step": 103320 }, { "epoch": 2.4092914808661945, "grad_norm": 2.7010066509246826, "learning_rate": 1.972424298949201e-07, "loss": 0.1055, "step": 103330 }, { "epoch": 2.4095246422430123, "grad_norm": 0.8403099179267883, "learning_rate": 1.971647080768513e-07, "loss": 0.1142, "step": 103340 }, { "epoch": 2.40975780361983, "grad_norm": 2.160290241241455, "learning_rate": 1.9708698625878257e-07, "loss": 0.0922, "step": 103350 }, { "epoch": 2.4099909649966484, "grad_norm": 2.6180882453918457, "learning_rate": 1.9700926444071378e-07, "loss": 0.1108, "step": 103360 }, { "epoch": 2.4102241263734663, "grad_norm": 1.2203048467636108, "learning_rate": 1.9693154262264504e-07, "loss": 0.1097, "step": 103370 }, { "epoch": 2.410457287750284, "grad_norm": 1.0051863193511963, "learning_rate": 1.9685382080457625e-07, "loss": 0.1004, "step": 103380 }, { "epoch": 2.410690449127102, "grad_norm": 1.3934451341629028, "learning_rate": 1.9677609898650748e-07, "loss": 0.1044, "step": 103390 }, { "epoch": 2.41092361050392, "grad_norm": 1.3665496110916138, "learning_rate": 1.9669837716843872e-07, "loss": 0.106, "step": 103400 }, { "epoch": 2.411156771880738, "grad_norm": 1.6744513511657715, "learning_rate": 1.9662065535036995e-07, "loss": 0.0923, "step": 103410 }, { "epoch": 2.411389933257556, "grad_norm": 1.5387704372406006, "learning_rate": 1.9654293353230116e-07, "loss": 0.1099, "step": 103420 }, { "epoch": 2.4116230946343737, 
"grad_norm": 2.1734437942504883, "learning_rate": 1.9646521171423243e-07, "loss": 0.1038, "step": 103430 }, { "epoch": 2.4118562560111916, "grad_norm": 1.168039321899414, "learning_rate": 1.9638748989616363e-07, "loss": 0.0986, "step": 103440 }, { "epoch": 2.41208941738801, "grad_norm": 2.224447011947632, "learning_rate": 1.9630976807809487e-07, "loss": 0.1111, "step": 103450 }, { "epoch": 2.4123225787648277, "grad_norm": 1.7384049892425537, "learning_rate": 1.962320462600261e-07, "loss": 0.1022, "step": 103460 }, { "epoch": 2.4125557401416455, "grad_norm": 1.5651109218597412, "learning_rate": 1.9615432444195734e-07, "loss": 0.1127, "step": 103470 }, { "epoch": 2.4127889015184634, "grad_norm": 1.5312303304672241, "learning_rate": 1.9607660262388855e-07, "loss": 0.0899, "step": 103480 }, { "epoch": 2.413022062895281, "grad_norm": 3.3111183643341064, "learning_rate": 1.959988808058198e-07, "loss": 0.0955, "step": 103490 }, { "epoch": 2.4132552242720995, "grad_norm": 1.3742828369140625, "learning_rate": 1.9592115898775102e-07, "loss": 0.0922, "step": 103500 }, { "epoch": 2.4134883856489173, "grad_norm": 1.4520173072814941, "learning_rate": 1.9584343716968226e-07, "loss": 0.1019, "step": 103510 }, { "epoch": 2.413721547025735, "grad_norm": 1.3612138032913208, "learning_rate": 1.957657153516135e-07, "loss": 0.1094, "step": 103520 }, { "epoch": 2.413954708402553, "grad_norm": 1.7314623594284058, "learning_rate": 1.9568799353354473e-07, "loss": 0.1008, "step": 103530 }, { "epoch": 2.4141878697793713, "grad_norm": 1.0734310150146484, "learning_rate": 1.9561027171547594e-07, "loss": 0.1053, "step": 103540 }, { "epoch": 2.414421031156189, "grad_norm": 2.1810855865478516, "learning_rate": 1.955325498974072e-07, "loss": 0.1006, "step": 103550 }, { "epoch": 2.414654192533007, "grad_norm": 1.1702404022216797, "learning_rate": 1.954548280793384e-07, "loss": 0.1043, "step": 103560 }, { "epoch": 2.414887353909825, "grad_norm": 1.2575578689575195, "learning_rate": 
1.9537710626126965e-07, "loss": 0.0955, "step": 103570 }, { "epoch": 2.4151205152866426, "grad_norm": 1.7259286642074585, "learning_rate": 1.9529938444320088e-07, "loss": 0.1086, "step": 103580 }, { "epoch": 2.415353676663461, "grad_norm": 1.8748595714569092, "learning_rate": 1.9522166262513212e-07, "loss": 0.0966, "step": 103590 }, { "epoch": 2.4155868380402787, "grad_norm": 1.5918124914169312, "learning_rate": 1.9514394080706335e-07, "loss": 0.1, "step": 103600 }, { "epoch": 2.4158199994170966, "grad_norm": 2.072983980178833, "learning_rate": 1.950662189889946e-07, "loss": 0.107, "step": 103610 }, { "epoch": 2.4160531607939144, "grad_norm": 1.2453595399856567, "learning_rate": 1.949884971709258e-07, "loss": 0.0961, "step": 103620 }, { "epoch": 2.4162863221707322, "grad_norm": 1.530327558517456, "learning_rate": 1.9491077535285706e-07, "loss": 0.0946, "step": 103630 }, { "epoch": 2.4165194835475505, "grad_norm": 2.121999502182007, "learning_rate": 1.9483305353478827e-07, "loss": 0.1011, "step": 103640 }, { "epoch": 2.4167526449243684, "grad_norm": 0.8454429507255554, "learning_rate": 1.947553317167195e-07, "loss": 0.1075, "step": 103650 }, { "epoch": 2.416985806301186, "grad_norm": 1.4555610418319702, "learning_rate": 1.9467760989865074e-07, "loss": 0.0937, "step": 103660 }, { "epoch": 2.417218967678004, "grad_norm": 1.3913379907608032, "learning_rate": 1.9459988808058198e-07, "loss": 0.1046, "step": 103670 }, { "epoch": 2.417452129054822, "grad_norm": 1.6638498306274414, "learning_rate": 1.9452216626251319e-07, "loss": 0.1022, "step": 103680 }, { "epoch": 2.41768529043164, "grad_norm": 2.8756415843963623, "learning_rate": 1.9444444444444445e-07, "loss": 0.1033, "step": 103690 }, { "epoch": 2.417918451808458, "grad_norm": 3.0715339183807373, "learning_rate": 1.9436672262637566e-07, "loss": 0.1024, "step": 103700 }, { "epoch": 2.418151613185276, "grad_norm": 3.519449234008789, "learning_rate": 1.942890008083069e-07, "loss": 0.0965, "step": 103710 }, { "epoch": 
2.4183847745620937, "grad_norm": 1.8702280521392822, "learning_rate": 1.9421127899023813e-07, "loss": 0.1085, "step": 103720 }, { "epoch": 2.4186179359389115, "grad_norm": 2.1845152378082275, "learning_rate": 1.9413355717216936e-07, "loss": 0.1046, "step": 103730 }, { "epoch": 2.41885109731573, "grad_norm": 1.902873158454895, "learning_rate": 1.9405583535410057e-07, "loss": 0.1121, "step": 103740 }, { "epoch": 2.4190842586925476, "grad_norm": 2.162928581237793, "learning_rate": 1.9397811353603184e-07, "loss": 0.1236, "step": 103750 }, { "epoch": 2.4193174200693655, "grad_norm": 2.254100799560547, "learning_rate": 1.9390039171796304e-07, "loss": 0.1062, "step": 103760 }, { "epoch": 2.4195505814461833, "grad_norm": 1.3362683057785034, "learning_rate": 1.9382266989989428e-07, "loss": 0.1004, "step": 103770 }, { "epoch": 2.419783742823001, "grad_norm": 1.8279975652694702, "learning_rate": 1.9374494808182552e-07, "loss": 0.0999, "step": 103780 }, { "epoch": 2.4200169041998194, "grad_norm": 1.898288607597351, "learning_rate": 1.9366722626375675e-07, "loss": 0.1182, "step": 103790 }, { "epoch": 2.4202500655766372, "grad_norm": 2.958404064178467, "learning_rate": 1.9358950444568796e-07, "loss": 0.1062, "step": 103800 }, { "epoch": 2.420483226953455, "grad_norm": 1.4815384149551392, "learning_rate": 1.9351178262761922e-07, "loss": 0.0983, "step": 103810 }, { "epoch": 2.420716388330273, "grad_norm": 1.4464340209960938, "learning_rate": 1.9343406080955043e-07, "loss": 0.1107, "step": 103820 }, { "epoch": 2.420949549707091, "grad_norm": 1.4426416158676147, "learning_rate": 1.933563389914817e-07, "loss": 0.1072, "step": 103830 }, { "epoch": 2.421182711083909, "grad_norm": 1.4645601511001587, "learning_rate": 1.932786171734129e-07, "loss": 0.0903, "step": 103840 }, { "epoch": 2.421415872460727, "grad_norm": 2.7539453506469727, "learning_rate": 1.9320089535534414e-07, "loss": 0.0922, "step": 103850 }, { "epoch": 2.4216490338375447, "grad_norm": 1.5016587972640991, 
"learning_rate": 1.9312317353727537e-07, "loss": 0.1055, "step": 103860 }, { "epoch": 2.421882195214363, "grad_norm": 1.1080079078674316, "learning_rate": 1.930454517192066e-07, "loss": 0.0998, "step": 103870 }, { "epoch": 2.422115356591181, "grad_norm": 1.668074131011963, "learning_rate": 1.9296772990113782e-07, "loss": 0.1103, "step": 103880 }, { "epoch": 2.4223485179679987, "grad_norm": 2.5004677772521973, "learning_rate": 1.9289000808306908e-07, "loss": 0.1039, "step": 103890 }, { "epoch": 2.4225816793448165, "grad_norm": 3.080627918243408, "learning_rate": 1.928122862650003e-07, "loss": 0.1003, "step": 103900 }, { "epoch": 2.4228148407216343, "grad_norm": 1.8989882469177246, "learning_rate": 1.9273456444693153e-07, "loss": 0.1024, "step": 103910 }, { "epoch": 2.4230480020984526, "grad_norm": 1.7102687358856201, "learning_rate": 1.9265684262886276e-07, "loss": 0.1133, "step": 103920 }, { "epoch": 2.4232811634752704, "grad_norm": 1.2520129680633545, "learning_rate": 1.92579120810794e-07, "loss": 0.1106, "step": 103930 }, { "epoch": 2.4235143248520883, "grad_norm": 1.5184071063995361, "learning_rate": 1.925013989927252e-07, "loss": 0.0981, "step": 103940 }, { "epoch": 2.423747486228906, "grad_norm": 1.6689265966415405, "learning_rate": 1.9242367717465647e-07, "loss": 0.1151, "step": 103950 }, { "epoch": 2.423980647605724, "grad_norm": 1.3271324634552002, "learning_rate": 1.9234595535658768e-07, "loss": 0.1122, "step": 103960 }, { "epoch": 2.4242138089825422, "grad_norm": 3.3745055198669434, "learning_rate": 1.9226823353851891e-07, "loss": 0.1071, "step": 103970 }, { "epoch": 2.42444697035936, "grad_norm": 2.2158422470092773, "learning_rate": 1.9219051172045015e-07, "loss": 0.1021, "step": 103980 }, { "epoch": 2.424680131736178, "grad_norm": 0.9608162045478821, "learning_rate": 1.9211278990238139e-07, "loss": 0.0966, "step": 103990 }, { "epoch": 2.4249132931129957, "grad_norm": 3.040687322616577, "learning_rate": 1.920350680843126e-07, "loss": 0.1042, "step": 
104000 }, { "epoch": 2.4251464544898136, "grad_norm": 1.2059142589569092, "learning_rate": 1.9195734626624386e-07, "loss": 0.1075, "step": 104010 }, { "epoch": 2.425379615866632, "grad_norm": 2.573765754699707, "learning_rate": 1.9187962444817507e-07, "loss": 0.0874, "step": 104020 }, { "epoch": 2.4256127772434497, "grad_norm": 1.0687252283096313, "learning_rate": 1.9180190263010633e-07, "loss": 0.0908, "step": 104030 }, { "epoch": 2.4258459386202675, "grad_norm": 2.2830910682678223, "learning_rate": 1.9172418081203754e-07, "loss": 0.0993, "step": 104040 }, { "epoch": 2.4260790999970854, "grad_norm": 1.5187469720840454, "learning_rate": 1.9164645899396877e-07, "loss": 0.103, "step": 104050 }, { "epoch": 2.426312261373903, "grad_norm": 1.7817946672439575, "learning_rate": 1.915687371759e-07, "loss": 0.1097, "step": 104060 }, { "epoch": 2.4265454227507215, "grad_norm": 1.4684288501739502, "learning_rate": 1.9149101535783124e-07, "loss": 0.1051, "step": 104070 }, { "epoch": 2.4267785841275393, "grad_norm": 1.4503625631332397, "learning_rate": 1.9141329353976245e-07, "loss": 0.0934, "step": 104080 }, { "epoch": 2.427011745504357, "grad_norm": 1.469048023223877, "learning_rate": 1.9133557172169372e-07, "loss": 0.0978, "step": 104090 }, { "epoch": 2.427244906881175, "grad_norm": 1.8841289281845093, "learning_rate": 1.9125784990362493e-07, "loss": 0.0879, "step": 104100 }, { "epoch": 2.427478068257993, "grad_norm": 1.6047712564468384, "learning_rate": 1.9118012808555616e-07, "loss": 0.0994, "step": 104110 }, { "epoch": 2.427711229634811, "grad_norm": 2.531062602996826, "learning_rate": 1.911024062674874e-07, "loss": 0.1007, "step": 104120 }, { "epoch": 2.427944391011629, "grad_norm": 2.3548498153686523, "learning_rate": 1.9102468444941863e-07, "loss": 0.1067, "step": 104130 }, { "epoch": 2.428177552388447, "grad_norm": 1.993255376815796, "learning_rate": 1.9094696263134984e-07, "loss": 0.1057, "step": 104140 }, { "epoch": 2.4284107137652646, "grad_norm": 
1.4021316766738892, "learning_rate": 1.908692408132811e-07, "loss": 0.1034, "step": 104150 }, { "epoch": 2.4286438751420825, "grad_norm": 1.8669800758361816, "learning_rate": 1.907915189952123e-07, "loss": 0.1065, "step": 104160 }, { "epoch": 2.4288770365189007, "grad_norm": 1.241802453994751, "learning_rate": 1.9071379717714355e-07, "loss": 0.1062, "step": 104170 }, { "epoch": 2.4291101978957186, "grad_norm": 1.5420728921890259, "learning_rate": 1.9063607535907478e-07, "loss": 0.0875, "step": 104180 }, { "epoch": 2.4293433592725364, "grad_norm": 1.1950780153274536, "learning_rate": 1.9055835354100602e-07, "loss": 0.1045, "step": 104190 }, { "epoch": 2.4295765206493543, "grad_norm": 1.1284160614013672, "learning_rate": 1.9048063172293723e-07, "loss": 0.1021, "step": 104200 }, { "epoch": 2.4298096820261725, "grad_norm": 1.235243797302246, "learning_rate": 1.904029099048685e-07, "loss": 0.1024, "step": 104210 }, { "epoch": 2.4300428434029904, "grad_norm": 2.0177083015441895, "learning_rate": 1.903251880867997e-07, "loss": 0.1047, "step": 104220 }, { "epoch": 2.430276004779808, "grad_norm": 1.3191477060317993, "learning_rate": 1.9024746626873096e-07, "loss": 0.1049, "step": 104230 }, { "epoch": 2.430509166156626, "grad_norm": 1.0916105508804321, "learning_rate": 1.901697444506622e-07, "loss": 0.0918, "step": 104240 }, { "epoch": 2.4307423275334443, "grad_norm": 1.5159045457839966, "learning_rate": 1.900920226325934e-07, "loss": 0.1048, "step": 104250 }, { "epoch": 2.430975488910262, "grad_norm": 1.6434539556503296, "learning_rate": 1.9001430081452467e-07, "loss": 0.1079, "step": 104260 }, { "epoch": 2.43120865028708, "grad_norm": 1.1874420642852783, "learning_rate": 1.8993657899645588e-07, "loss": 0.0943, "step": 104270 }, { "epoch": 2.431441811663898, "grad_norm": 1.8695553541183472, "learning_rate": 1.8985885717838711e-07, "loss": 0.103, "step": 104280 }, { "epoch": 2.4316749730407157, "grad_norm": 1.6817055940628052, "learning_rate": 1.8978113536031835e-07, "loss": 
0.0857, "step": 104290 }, { "epoch": 2.431908134417534, "grad_norm": 1.4940648078918457, "learning_rate": 1.8970341354224959e-07, "loss": 0.1094, "step": 104300 }, { "epoch": 2.432141295794352, "grad_norm": 1.0494420528411865, "learning_rate": 1.896256917241808e-07, "loss": 0.1023, "step": 104310 }, { "epoch": 2.4323744571711696, "grad_norm": 1.074703335762024, "learning_rate": 1.8954796990611206e-07, "loss": 0.1031, "step": 104320 }, { "epoch": 2.4326076185479875, "grad_norm": 1.7866253852844238, "learning_rate": 1.8947024808804327e-07, "loss": 0.097, "step": 104330 }, { "epoch": 2.4328407799248053, "grad_norm": 1.1894583702087402, "learning_rate": 1.893925262699745e-07, "loss": 0.0969, "step": 104340 }, { "epoch": 2.4330739413016236, "grad_norm": 1.4312037229537964, "learning_rate": 1.8931480445190574e-07, "loss": 0.1057, "step": 104350 }, { "epoch": 2.4333071026784414, "grad_norm": 1.531408667564392, "learning_rate": 1.8923708263383697e-07, "loss": 0.1057, "step": 104360 }, { "epoch": 2.4335402640552593, "grad_norm": 1.5511407852172852, "learning_rate": 1.8915936081576818e-07, "loss": 0.1159, "step": 104370 }, { "epoch": 2.433773425432077, "grad_norm": 1.390431523323059, "learning_rate": 1.8908163899769944e-07, "loss": 0.1076, "step": 104380 }, { "epoch": 2.434006586808895, "grad_norm": 2.051793098449707, "learning_rate": 1.8900391717963065e-07, "loss": 0.1127, "step": 104390 }, { "epoch": 2.434239748185713, "grad_norm": 1.2119793891906738, "learning_rate": 1.889261953615619e-07, "loss": 0.1031, "step": 104400 }, { "epoch": 2.434472909562531, "grad_norm": 1.9251428842544556, "learning_rate": 1.8884847354349313e-07, "loss": 0.089, "step": 104410 }, { "epoch": 2.434706070939349, "grad_norm": 2.253509283065796, "learning_rate": 1.8877075172542436e-07, "loss": 0.1047, "step": 104420 }, { "epoch": 2.4349392323161667, "grad_norm": 1.7226814031600952, "learning_rate": 1.8869302990735557e-07, "loss": 0.1069, "step": 104430 }, { "epoch": 2.4351723936929845, "grad_norm": 
1.3205534219741821, "learning_rate": 1.8861530808928683e-07, "loss": 0.0993, "step": 104440 }, { "epoch": 2.435405555069803, "grad_norm": 1.3199788331985474, "learning_rate": 1.8853758627121804e-07, "loss": 0.0935, "step": 104450 }, { "epoch": 2.4356387164466207, "grad_norm": 1.2959903478622437, "learning_rate": 1.884598644531493e-07, "loss": 0.1112, "step": 104460 }, { "epoch": 2.4358718778234385, "grad_norm": 1.5059750080108643, "learning_rate": 1.883821426350805e-07, "loss": 0.0991, "step": 104470 }, { "epoch": 2.4361050392002563, "grad_norm": 1.2228034734725952, "learning_rate": 1.8830442081701175e-07, "loss": 0.1118, "step": 104480 }, { "epoch": 2.436338200577074, "grad_norm": 1.6598604917526245, "learning_rate": 1.8822669899894298e-07, "loss": 0.1056, "step": 104490 }, { "epoch": 2.4365713619538925, "grad_norm": 1.6111618280410767, "learning_rate": 1.8814897718087422e-07, "loss": 0.1098, "step": 104500 }, { "epoch": 2.4368045233307103, "grad_norm": 1.1503918170928955, "learning_rate": 1.8807125536280543e-07, "loss": 0.1066, "step": 104510 }, { "epoch": 2.437037684707528, "grad_norm": 1.2990084886550903, "learning_rate": 1.879935335447367e-07, "loss": 0.1087, "step": 104520 }, { "epoch": 2.437270846084346, "grad_norm": 1.613393783569336, "learning_rate": 1.879158117266679e-07, "loss": 0.1019, "step": 104530 }, { "epoch": 2.4375040074611642, "grad_norm": 1.4017564058303833, "learning_rate": 1.8783808990859914e-07, "loss": 0.1032, "step": 104540 }, { "epoch": 2.437737168837982, "grad_norm": 1.762317180633545, "learning_rate": 1.8776036809053037e-07, "loss": 0.0957, "step": 104550 }, { "epoch": 2.4379703302148, "grad_norm": 1.0385342836380005, "learning_rate": 1.876826462724616e-07, "loss": 0.0986, "step": 104560 }, { "epoch": 2.4382034915916178, "grad_norm": 1.5394092798233032, "learning_rate": 1.8760492445439282e-07, "loss": 0.0963, "step": 104570 }, { "epoch": 2.4384366529684356, "grad_norm": 2.3583197593688965, "learning_rate": 1.8752720263632408e-07, "loss": 
0.1096, "step": 104580 }, { "epoch": 2.438669814345254, "grad_norm": 0.9838391542434692, "learning_rate": 1.874494808182553e-07, "loss": 0.101, "step": 104590 }, { "epoch": 2.4389029757220717, "grad_norm": 1.4648596048355103, "learning_rate": 1.8737175900018652e-07, "loss": 0.1117, "step": 104600 }, { "epoch": 2.4391361370988895, "grad_norm": 1.2147789001464844, "learning_rate": 1.8729403718211776e-07, "loss": 0.1122, "step": 104610 }, { "epoch": 2.4393692984757074, "grad_norm": 1.1907353401184082, "learning_rate": 1.87216315364049e-07, "loss": 0.0971, "step": 104620 }, { "epoch": 2.4396024598525257, "grad_norm": 1.4485433101654053, "learning_rate": 1.871385935459802e-07, "loss": 0.1089, "step": 104630 }, { "epoch": 2.4398356212293435, "grad_norm": 2.049206256866455, "learning_rate": 1.8706087172791147e-07, "loss": 0.0955, "step": 104640 }, { "epoch": 2.4400687826061613, "grad_norm": 1.7737406492233276, "learning_rate": 1.8698314990984268e-07, "loss": 0.1028, "step": 104650 }, { "epoch": 2.440301943982979, "grad_norm": 1.4539316892623901, "learning_rate": 1.8690542809177394e-07, "loss": 0.0943, "step": 104660 }, { "epoch": 2.440535105359797, "grad_norm": 1.8525704145431519, "learning_rate": 1.8682770627370515e-07, "loss": 0.1218, "step": 104670 }, { "epoch": 2.4407682667366153, "grad_norm": 1.1402804851531982, "learning_rate": 1.8674998445563638e-07, "loss": 0.0996, "step": 104680 }, { "epoch": 2.441001428113433, "grad_norm": 1.4624780416488647, "learning_rate": 1.8667226263756762e-07, "loss": 0.0952, "step": 104690 }, { "epoch": 2.441234589490251, "grad_norm": 1.2150330543518066, "learning_rate": 1.8659454081949885e-07, "loss": 0.1038, "step": 104700 }, { "epoch": 2.441467750867069, "grad_norm": 1.975075125694275, "learning_rate": 1.8651681900143006e-07, "loss": 0.1059, "step": 104710 }, { "epoch": 2.4417009122438866, "grad_norm": 1.2199469804763794, "learning_rate": 1.8643909718336133e-07, "loss": 0.1038, "step": 104720 }, { "epoch": 2.441934073620705, 
"grad_norm": 1.6434836387634277, "learning_rate": 1.8636137536529253e-07, "loss": 0.1167, "step": 104730 }, { "epoch": 2.4421672349975228, "grad_norm": 2.110182523727417, "learning_rate": 1.8628365354722377e-07, "loss": 0.0869, "step": 104740 }, { "epoch": 2.4424003963743406, "grad_norm": 1.4065842628479004, "learning_rate": 1.86205931729155e-07, "loss": 0.0894, "step": 104750 }, { "epoch": 2.4426335577511584, "grad_norm": 1.2786595821380615, "learning_rate": 1.8612820991108624e-07, "loss": 0.0968, "step": 104760 }, { "epoch": 2.4428667191279763, "grad_norm": 1.3510509729385376, "learning_rate": 1.8605048809301745e-07, "loss": 0.106, "step": 104770 }, { "epoch": 2.4430998805047945, "grad_norm": 1.298756718635559, "learning_rate": 1.859727662749487e-07, "loss": 0.1091, "step": 104780 }, { "epoch": 2.4433330418816124, "grad_norm": 1.448145866394043, "learning_rate": 1.8589504445687992e-07, "loss": 0.1004, "step": 104790 }, { "epoch": 2.44356620325843, "grad_norm": 1.2616711854934692, "learning_rate": 1.8581732263881116e-07, "loss": 0.0964, "step": 104800 }, { "epoch": 2.443799364635248, "grad_norm": 1.347657322883606, "learning_rate": 1.857396008207424e-07, "loss": 0.1007, "step": 104810 }, { "epoch": 2.444032526012066, "grad_norm": 1.3034802675247192, "learning_rate": 1.8566187900267363e-07, "loss": 0.0998, "step": 104820 }, { "epoch": 2.444265687388884, "grad_norm": 1.738006353378296, "learning_rate": 1.8558415718460484e-07, "loss": 0.0963, "step": 104830 }, { "epoch": 2.444498848765702, "grad_norm": 1.221606731414795, "learning_rate": 1.855064353665361e-07, "loss": 0.1121, "step": 104840 }, { "epoch": 2.44473201014252, "grad_norm": 1.9313613176345825, "learning_rate": 1.854287135484673e-07, "loss": 0.1006, "step": 104850 }, { "epoch": 2.4449651715193377, "grad_norm": 1.2244384288787842, "learning_rate": 1.8535099173039857e-07, "loss": 0.087, "step": 104860 }, { "epoch": 2.4451983328961555, "grad_norm": 1.4330607652664185, "learning_rate": 1.8527326991232978e-07, 
"loss": 0.1118, "step": 104870 }, { "epoch": 2.445431494272974, "grad_norm": 1.334930658340454, "learning_rate": 1.8519554809426102e-07, "loss": 0.0959, "step": 104880 }, { "epoch": 2.4456646556497916, "grad_norm": 3.1429924964904785, "learning_rate": 1.8511782627619225e-07, "loss": 0.1082, "step": 104890 }, { "epoch": 2.4458978170266095, "grad_norm": 2.043397903442383, "learning_rate": 1.850401044581235e-07, "loss": 0.106, "step": 104900 }, { "epoch": 2.4461309784034273, "grad_norm": 1.2153750658035278, "learning_rate": 1.849623826400547e-07, "loss": 0.0924, "step": 104910 }, { "epoch": 2.4463641397802456, "grad_norm": 3.2864105701446533, "learning_rate": 1.8488466082198596e-07, "loss": 0.0947, "step": 104920 }, { "epoch": 2.4465973011570634, "grad_norm": 1.2064483165740967, "learning_rate": 1.8480693900391717e-07, "loss": 0.0979, "step": 104930 }, { "epoch": 2.4468304625338813, "grad_norm": 1.178480863571167, "learning_rate": 1.847292171858484e-07, "loss": 0.1043, "step": 104940 }, { "epoch": 2.447063623910699, "grad_norm": 2.0211496353149414, "learning_rate": 1.8465149536777964e-07, "loss": 0.1058, "step": 104950 }, { "epoch": 2.447296785287517, "grad_norm": 1.4586312770843506, "learning_rate": 1.8457377354971088e-07, "loss": 0.1091, "step": 104960 }, { "epoch": 2.447529946664335, "grad_norm": 2.185621976852417, "learning_rate": 1.8449605173164209e-07, "loss": 0.1064, "step": 104970 }, { "epoch": 2.447763108041153, "grad_norm": 2.248535633087158, "learning_rate": 1.8441832991357335e-07, "loss": 0.1025, "step": 104980 }, { "epoch": 2.447996269417971, "grad_norm": 1.9711918830871582, "learning_rate": 1.8434060809550456e-07, "loss": 0.1052, "step": 104990 }, { "epoch": 2.4482294307947887, "grad_norm": 2.621798515319824, "learning_rate": 1.842628862774358e-07, "loss": 0.1112, "step": 105000 }, { "epoch": 2.448462592171607, "grad_norm": 2.7598319053649902, "learning_rate": 1.8418516445936703e-07, "loss": 0.1102, "step": 105010 }, { "epoch": 2.448695753548425, 
"grad_norm": 1.2623798847198486, "learning_rate": 1.8410744264129826e-07, "loss": 0.0954, "step": 105020 }, { "epoch": 2.4489289149252427, "grad_norm": 2.2061030864715576, "learning_rate": 1.8402972082322947e-07, "loss": 0.1043, "step": 105030 }, { "epoch": 2.4491620763020605, "grad_norm": 1.1549241542816162, "learning_rate": 1.8395199900516073e-07, "loss": 0.0922, "step": 105040 }, { "epoch": 2.4493952376788783, "grad_norm": 2.105402946472168, "learning_rate": 1.8387427718709194e-07, "loss": 0.1151, "step": 105050 }, { "epoch": 2.4496283990556966, "grad_norm": 1.4854881763458252, "learning_rate": 1.8379655536902318e-07, "loss": 0.0965, "step": 105060 }, { "epoch": 2.4498615604325145, "grad_norm": 0.9216797947883606, "learning_rate": 1.8371883355095442e-07, "loss": 0.1031, "step": 105070 }, { "epoch": 2.4500947218093323, "grad_norm": 1.6456626653671265, "learning_rate": 1.8364111173288565e-07, "loss": 0.0996, "step": 105080 }, { "epoch": 2.45032788318615, "grad_norm": 1.4744373559951782, "learning_rate": 1.835633899148169e-07, "loss": 0.1134, "step": 105090 }, { "epoch": 2.450561044562968, "grad_norm": 1.3358006477355957, "learning_rate": 1.8348566809674812e-07, "loss": 0.1004, "step": 105100 }, { "epoch": 2.4507942059397863, "grad_norm": 1.400550127029419, "learning_rate": 1.8340794627867933e-07, "loss": 0.1085, "step": 105110 }, { "epoch": 2.451027367316604, "grad_norm": 1.9372199773788452, "learning_rate": 1.833302244606106e-07, "loss": 0.1039, "step": 105120 }, { "epoch": 2.451260528693422, "grad_norm": 2.0272810459136963, "learning_rate": 1.832525026425418e-07, "loss": 0.1047, "step": 105130 }, { "epoch": 2.4514936900702398, "grad_norm": 1.3712191581726074, "learning_rate": 1.8317478082447304e-07, "loss": 0.0984, "step": 105140 }, { "epoch": 2.4517268514470576, "grad_norm": 1.8965704441070557, "learning_rate": 1.8309705900640427e-07, "loss": 0.1044, "step": 105150 }, { "epoch": 2.451960012823876, "grad_norm": 1.4354290962219238, "learning_rate": 
1.830193371883355e-07, "loss": 0.1051, "step": 105160 }, { "epoch": 2.4521931742006937, "grad_norm": 1.235621690750122, "learning_rate": 1.8294161537026672e-07, "loss": 0.1124, "step": 105170 }, { "epoch": 2.4524263355775116, "grad_norm": 1.7975435256958008, "learning_rate": 1.8286389355219798e-07, "loss": 0.1035, "step": 105180 }, { "epoch": 2.4526594969543294, "grad_norm": 1.5037221908569336, "learning_rate": 1.8279394391593608e-07, "loss": 0.1111, "step": 105190 }, { "epoch": 2.4528926583311472, "grad_norm": 2.3681609630584717, "learning_rate": 1.827162220978673e-07, "loss": 0.0945, "step": 105200 }, { "epoch": 2.4531258197079655, "grad_norm": 2.0811634063720703, "learning_rate": 1.8263850027979855e-07, "loss": 0.0863, "step": 105210 }, { "epoch": 2.4533589810847833, "grad_norm": 1.788374423980713, "learning_rate": 1.8256077846172976e-07, "loss": 0.0963, "step": 105220 }, { "epoch": 2.453592142461601, "grad_norm": 1.7512456178665161, "learning_rate": 1.82483056643661e-07, "loss": 0.1056, "step": 105230 }, { "epoch": 2.453825303838419, "grad_norm": 2.2937092781066895, "learning_rate": 1.8240533482559223e-07, "loss": 0.1121, "step": 105240 }, { "epoch": 2.454058465215237, "grad_norm": 2.226562976837158, "learning_rate": 1.8232761300752347e-07, "loss": 0.0954, "step": 105250 }, { "epoch": 2.454291626592055, "grad_norm": 2.9538474082946777, "learning_rate": 1.822498911894547e-07, "loss": 0.1064, "step": 105260 }, { "epoch": 2.454524787968873, "grad_norm": 1.6487327814102173, "learning_rate": 1.8217216937138594e-07, "loss": 0.0851, "step": 105270 }, { "epoch": 2.454757949345691, "grad_norm": 1.3461174964904785, "learning_rate": 1.8209444755331715e-07, "loss": 0.1139, "step": 105280 }, { "epoch": 2.4549911107225086, "grad_norm": 1.8817917108535767, "learning_rate": 1.820167257352484e-07, "loss": 0.1038, "step": 105290 }, { "epoch": 2.455224272099327, "grad_norm": 2.9512217044830322, "learning_rate": 1.8193900391717962e-07, "loss": 0.1005, "step": 105300 }, { "epoch": 
2.4554574334761448, "grad_norm": 1.4309486150741577, "learning_rate": 1.8186128209911086e-07, "loss": 0.1053, "step": 105310 }, { "epoch": 2.4556905948529626, "grad_norm": 2.064807891845703, "learning_rate": 1.817835602810421e-07, "loss": 0.1094, "step": 105320 }, { "epoch": 2.4559237562297804, "grad_norm": 1.196743369102478, "learning_rate": 1.8170583846297333e-07, "loss": 0.1016, "step": 105330 }, { "epoch": 2.4561569176065987, "grad_norm": 2.933661937713623, "learning_rate": 1.8162811664490454e-07, "loss": 0.1085, "step": 105340 }, { "epoch": 2.4563900789834165, "grad_norm": 1.536863088607788, "learning_rate": 1.815503948268358e-07, "loss": 0.1015, "step": 105350 }, { "epoch": 2.4566232403602344, "grad_norm": 1.2237662076950073, "learning_rate": 1.81472673008767e-07, "loss": 0.0963, "step": 105360 }, { "epoch": 2.456856401737052, "grad_norm": 1.5127885341644287, "learning_rate": 1.8139495119069824e-07, "loss": 0.0983, "step": 105370 }, { "epoch": 2.45708956311387, "grad_norm": 1.5381020307540894, "learning_rate": 1.8131722937262948e-07, "loss": 0.1151, "step": 105380 }, { "epoch": 2.4573227244906883, "grad_norm": 1.4787282943725586, "learning_rate": 1.8123950755456071e-07, "loss": 0.0869, "step": 105390 }, { "epoch": 2.457555885867506, "grad_norm": 1.488879919052124, "learning_rate": 1.8116178573649192e-07, "loss": 0.1203, "step": 105400 }, { "epoch": 2.457789047244324, "grad_norm": 2.0040109157562256, "learning_rate": 1.8108406391842319e-07, "loss": 0.101, "step": 105410 }, { "epoch": 2.458022208621142, "grad_norm": 2.167532444000244, "learning_rate": 1.810063421003544e-07, "loss": 0.0891, "step": 105420 }, { "epoch": 2.4582553699979597, "grad_norm": 1.7765679359436035, "learning_rate": 1.8092862028228563e-07, "loss": 0.1041, "step": 105430 }, { "epoch": 2.458488531374778, "grad_norm": 1.4430139064788818, "learning_rate": 1.8085089846421687e-07, "loss": 0.1063, "step": 105440 }, { "epoch": 2.458721692751596, "grad_norm": 1.888157844543457, "learning_rate": 
1.807731766461481e-07, "loss": 0.1048, "step": 105450 }, { "epoch": 2.4589548541284136, "grad_norm": 1.3191660642623901, "learning_rate": 1.806954548280793e-07, "loss": 0.1074, "step": 105460 }, { "epoch": 2.4591880155052315, "grad_norm": 0.9237526655197144, "learning_rate": 1.8061773301001057e-07, "loss": 0.1077, "step": 105470 }, { "epoch": 2.4594211768820493, "grad_norm": 1.1502571105957031, "learning_rate": 1.8054001119194178e-07, "loss": 0.1136, "step": 105480 }, { "epoch": 2.4596543382588676, "grad_norm": 1.5026912689208984, "learning_rate": 1.8046228937387304e-07, "loss": 0.0955, "step": 105490 }, { "epoch": 2.4598874996356854, "grad_norm": 2.57692813873291, "learning_rate": 1.8038456755580425e-07, "loss": 0.1067, "step": 105500 }, { "epoch": 2.4601206610125033, "grad_norm": 1.3666719198226929, "learning_rate": 1.803068457377355e-07, "loss": 0.1081, "step": 105510 }, { "epoch": 2.460353822389321, "grad_norm": 2.1784698963165283, "learning_rate": 1.8022912391966673e-07, "loss": 0.1198, "step": 105520 }, { "epoch": 2.460586983766139, "grad_norm": 1.5676053762435913, "learning_rate": 1.8015140210159796e-07, "loss": 0.0997, "step": 105530 }, { "epoch": 2.460820145142957, "grad_norm": 1.552009105682373, "learning_rate": 1.8007368028352917e-07, "loss": 0.1077, "step": 105540 }, { "epoch": 2.461053306519775, "grad_norm": 1.6419721841812134, "learning_rate": 1.7999595846546043e-07, "loss": 0.1031, "step": 105550 }, { "epoch": 2.461286467896593, "grad_norm": 2.0716731548309326, "learning_rate": 1.7991823664739164e-07, "loss": 0.1156, "step": 105560 }, { "epoch": 2.4615196292734107, "grad_norm": 1.1899514198303223, "learning_rate": 1.7984051482932288e-07, "loss": 0.1038, "step": 105570 }, { "epoch": 2.4617527906502286, "grad_norm": 2.2066633701324463, "learning_rate": 1.797627930112541e-07, "loss": 0.1134, "step": 105580 }, { "epoch": 2.461985952027047, "grad_norm": 2.5272819995880127, "learning_rate": 1.7968507119318535e-07, "loss": 0.1166, "step": 105590 }, { 
"epoch": 2.4622191134038647, "grad_norm": 1.6268929243087769, "learning_rate": 1.7960734937511656e-07, "loss": 0.0966, "step": 105600 }, { "epoch": 2.4624522747806825, "grad_norm": 1.4963961839675903, "learning_rate": 1.7952962755704782e-07, "loss": 0.1014, "step": 105610 }, { "epoch": 2.4626854361575004, "grad_norm": 1.3611624240875244, "learning_rate": 1.7945190573897903e-07, "loss": 0.0989, "step": 105620 }, { "epoch": 2.4629185975343186, "grad_norm": 1.6641658544540405, "learning_rate": 1.7937418392091027e-07, "loss": 0.1007, "step": 105630 }, { "epoch": 2.4631517589111365, "grad_norm": 1.1977909803390503, "learning_rate": 1.792964621028415e-07, "loss": 0.0995, "step": 105640 }, { "epoch": 2.4633849202879543, "grad_norm": 1.2613046169281006, "learning_rate": 1.7921874028477274e-07, "loss": 0.0918, "step": 105650 }, { "epoch": 2.463618081664772, "grad_norm": 1.5714271068572998, "learning_rate": 1.7914101846670395e-07, "loss": 0.1018, "step": 105660 }, { "epoch": 2.46385124304159, "grad_norm": 1.287879228591919, "learning_rate": 1.790632966486352e-07, "loss": 0.0959, "step": 105670 }, { "epoch": 2.4640844044184083, "grad_norm": 2.138869047164917, "learning_rate": 1.7898557483056642e-07, "loss": 0.0996, "step": 105680 }, { "epoch": 2.464317565795226, "grad_norm": 1.917988896369934, "learning_rate": 1.7890785301249768e-07, "loss": 0.1028, "step": 105690 }, { "epoch": 2.464550727172044, "grad_norm": 1.806344985961914, "learning_rate": 1.788301311944289e-07, "loss": 0.1003, "step": 105700 }, { "epoch": 2.4647838885488618, "grad_norm": 1.7768642902374268, "learning_rate": 1.7875240937636012e-07, "loss": 0.0965, "step": 105710 }, { "epoch": 2.46501704992568, "grad_norm": 1.7354559898376465, "learning_rate": 1.7867468755829136e-07, "loss": 0.0968, "step": 105720 }, { "epoch": 2.465250211302498, "grad_norm": 1.8730968236923218, "learning_rate": 1.785969657402226e-07, "loss": 0.099, "step": 105730 }, { "epoch": 2.4654833726793157, "grad_norm": 2.774807929992676, 
"learning_rate": 1.785192439221538e-07, "loss": 0.108, "step": 105740 }, { "epoch": 2.4657165340561336, "grad_norm": 1.8296979665756226, "learning_rate": 1.7844152210408507e-07, "loss": 0.1027, "step": 105750 }, { "epoch": 2.4659496954329514, "grad_norm": 2.9659130573272705, "learning_rate": 1.7836380028601628e-07, "loss": 0.1082, "step": 105760 }, { "epoch": 2.4661828568097697, "grad_norm": 1.7131321430206299, "learning_rate": 1.782860784679475e-07, "loss": 0.1076, "step": 105770 }, { "epoch": 2.4664160181865875, "grad_norm": 1.2994710206985474, "learning_rate": 1.7820835664987875e-07, "loss": 0.0926, "step": 105780 }, { "epoch": 2.4666491795634053, "grad_norm": 2.2921299934387207, "learning_rate": 1.7813063483180998e-07, "loss": 0.1043, "step": 105790 }, { "epoch": 2.466882340940223, "grad_norm": 3.4676311016082764, "learning_rate": 1.780529130137412e-07, "loss": 0.1063, "step": 105800 }, { "epoch": 2.467115502317041, "grad_norm": 2.2702860832214355, "learning_rate": 1.7797519119567245e-07, "loss": 0.1048, "step": 105810 }, { "epoch": 2.4673486636938593, "grad_norm": 2.7604169845581055, "learning_rate": 1.7789746937760366e-07, "loss": 0.1165, "step": 105820 }, { "epoch": 2.467581825070677, "grad_norm": 1.3751248121261597, "learning_rate": 1.778197475595349e-07, "loss": 0.1058, "step": 105830 }, { "epoch": 2.467814986447495, "grad_norm": 1.7496823072433472, "learning_rate": 1.7774202574146613e-07, "loss": 0.0999, "step": 105840 }, { "epoch": 2.468048147824313, "grad_norm": 2.7508034706115723, "learning_rate": 1.7766430392339737e-07, "loss": 0.1012, "step": 105850 }, { "epoch": 2.4682813092011306, "grad_norm": 1.3069206476211548, "learning_rate": 1.7758658210532858e-07, "loss": 0.0984, "step": 105860 }, { "epoch": 2.468514470577949, "grad_norm": 0.9671632051467896, "learning_rate": 1.7750886028725984e-07, "loss": 0.0979, "step": 105870 }, { "epoch": 2.4687476319547668, "grad_norm": 2.456176519393921, "learning_rate": 1.7743113846919105e-07, "loss": 0.1102, "step": 
105880 }, { "epoch": 2.4689807933315846, "grad_norm": 1.4069212675094604, "learning_rate": 1.7735341665112231e-07, "loss": 0.0867, "step": 105890 }, { "epoch": 2.4692139547084024, "grad_norm": 2.767019510269165, "learning_rate": 1.7727569483305352e-07, "loss": 0.1005, "step": 105900 }, { "epoch": 2.4694471160852203, "grad_norm": 1.156982421875, "learning_rate": 1.7719797301498476e-07, "loss": 0.1048, "step": 105910 }, { "epoch": 2.4696802774620386, "grad_norm": 1.4516141414642334, "learning_rate": 1.77120251196916e-07, "loss": 0.0979, "step": 105920 }, { "epoch": 2.4699134388388564, "grad_norm": 1.3986307382583618, "learning_rate": 1.7704252937884723e-07, "loss": 0.1052, "step": 105930 }, { "epoch": 2.4701466002156742, "grad_norm": 1.8972347974777222, "learning_rate": 1.7696480756077844e-07, "loss": 0.1073, "step": 105940 }, { "epoch": 2.470379761592492, "grad_norm": 1.3640289306640625, "learning_rate": 1.768870857427097e-07, "loss": 0.117, "step": 105950 }, { "epoch": 2.47061292296931, "grad_norm": 1.8966865539550781, "learning_rate": 1.768093639246409e-07, "loss": 0.0939, "step": 105960 }, { "epoch": 2.470846084346128, "grad_norm": 1.3112255334854126, "learning_rate": 1.7673164210657215e-07, "loss": 0.1123, "step": 105970 }, { "epoch": 2.471079245722946, "grad_norm": 2.6634490489959717, "learning_rate": 1.7665392028850338e-07, "loss": 0.0939, "step": 105980 }, { "epoch": 2.471312407099764, "grad_norm": 2.519132137298584, "learning_rate": 1.7657619847043462e-07, "loss": 0.1035, "step": 105990 }, { "epoch": 2.4715455684765817, "grad_norm": 1.6047545671463013, "learning_rate": 1.7649847665236583e-07, "loss": 0.1131, "step": 106000 }, { "epoch": 2.4717787298534, "grad_norm": 1.2773016691207886, "learning_rate": 1.764207548342971e-07, "loss": 0.0961, "step": 106010 }, { "epoch": 2.472011891230218, "grad_norm": 2.312122106552124, "learning_rate": 1.763430330162283e-07, "loss": 0.0967, "step": 106020 }, { "epoch": 2.4722450526070356, "grad_norm": 1.3002179861068726, 
"learning_rate": 1.7626531119815953e-07, "loss": 0.104, "step": 106030 }, { "epoch": 2.4724782139838535, "grad_norm": 1.672440528869629, "learning_rate": 1.7618758938009077e-07, "loss": 0.1081, "step": 106040 }, { "epoch": 2.4727113753606713, "grad_norm": 1.2098253965377808, "learning_rate": 1.76109867562022e-07, "loss": 0.1055, "step": 106050 }, { "epoch": 2.4729445367374896, "grad_norm": 1.411571741104126, "learning_rate": 1.7603214574395321e-07, "loss": 0.1257, "step": 106060 }, { "epoch": 2.4731776981143074, "grad_norm": 1.0492759943008423, "learning_rate": 1.7595442392588448e-07, "loss": 0.1003, "step": 106070 }, { "epoch": 2.4734108594911253, "grad_norm": 1.615187168121338, "learning_rate": 1.7587670210781569e-07, "loss": 0.1202, "step": 106080 }, { "epoch": 2.473644020867943, "grad_norm": 4.670670509338379, "learning_rate": 1.7579898028974695e-07, "loss": 0.0956, "step": 106090 }, { "epoch": 2.4738771822447614, "grad_norm": 1.0785866975784302, "learning_rate": 1.7572125847167816e-07, "loss": 0.1085, "step": 106100 }, { "epoch": 2.4741103436215792, "grad_norm": 1.0461952686309814, "learning_rate": 1.756435366536094e-07, "loss": 0.1031, "step": 106110 }, { "epoch": 2.474343504998397, "grad_norm": 1.9457584619522095, "learning_rate": 1.7556581483554063e-07, "loss": 0.1029, "step": 106120 }, { "epoch": 2.474576666375215, "grad_norm": 1.58564031124115, "learning_rate": 1.7548809301747186e-07, "loss": 0.0955, "step": 106130 }, { "epoch": 2.4748098277520327, "grad_norm": 3.493197441101074, "learning_rate": 1.7541037119940307e-07, "loss": 0.1013, "step": 106140 }, { "epoch": 2.475042989128851, "grad_norm": 1.9085053205490112, "learning_rate": 1.7533264938133434e-07, "loss": 0.0889, "step": 106150 }, { "epoch": 2.475276150505669, "grad_norm": 1.2014886140823364, "learning_rate": 1.7525492756326554e-07, "loss": 0.0996, "step": 106160 }, { "epoch": 2.4755093118824867, "grad_norm": 1.3844213485717773, "learning_rate": 1.7517720574519678e-07, "loss": 0.0954, "step": 
106170 }, { "epoch": 2.4757424732593045, "grad_norm": 2.966609477996826, "learning_rate": 1.7509948392712802e-07, "loss": 0.0966, "step": 106180 }, { "epoch": 2.4759756346361224, "grad_norm": 3.715954542160034, "learning_rate": 1.7502176210905925e-07, "loss": 0.1023, "step": 106190 }, { "epoch": 2.4762087960129406, "grad_norm": 1.3124415874481201, "learning_rate": 1.7494404029099046e-07, "loss": 0.0968, "step": 106200 }, { "epoch": 2.4764419573897585, "grad_norm": 1.417288064956665, "learning_rate": 1.7486631847292172e-07, "loss": 0.1019, "step": 106210 }, { "epoch": 2.4766751187665763, "grad_norm": 1.8589988946914673, "learning_rate": 1.7478859665485293e-07, "loss": 0.1008, "step": 106220 }, { "epoch": 2.476908280143394, "grad_norm": 1.3921250104904175, "learning_rate": 1.7471087483678417e-07, "loss": 0.1045, "step": 106230 }, { "epoch": 2.477141441520212, "grad_norm": 1.3805890083312988, "learning_rate": 1.746331530187154e-07, "loss": 0.1093, "step": 106240 }, { "epoch": 2.4773746028970303, "grad_norm": 1.192719578742981, "learning_rate": 1.7455543120064664e-07, "loss": 0.1098, "step": 106250 }, { "epoch": 2.477607764273848, "grad_norm": 1.9886506795883179, "learning_rate": 1.7447770938257785e-07, "loss": 0.0989, "step": 106260 }, { "epoch": 2.477840925650666, "grad_norm": 1.7756110429763794, "learning_rate": 1.743999875645091e-07, "loss": 0.1007, "step": 106270 }, { "epoch": 2.4780740870274838, "grad_norm": 1.735573649406433, "learning_rate": 1.7432226574644032e-07, "loss": 0.1148, "step": 106280 }, { "epoch": 2.4783072484043016, "grad_norm": 1.1475833654403687, "learning_rate": 1.7424454392837156e-07, "loss": 0.1059, "step": 106290 }, { "epoch": 2.47854040978112, "grad_norm": 1.4174158573150635, "learning_rate": 1.741668221103028e-07, "loss": 0.1045, "step": 106300 }, { "epoch": 2.4787735711579377, "grad_norm": 3.2900824546813965, "learning_rate": 1.7408910029223403e-07, "loss": 0.0978, "step": 106310 }, { "epoch": 2.4790067325347556, "grad_norm": 
1.5334055423736572, "learning_rate": 1.7401137847416526e-07, "loss": 0.1002, "step": 106320 }, { "epoch": 2.4792398939115734, "grad_norm": 3.8423702716827393, "learning_rate": 1.739336566560965e-07, "loss": 0.1092, "step": 106330 }, { "epoch": 2.4794730552883912, "grad_norm": 1.1618479490280151, "learning_rate": 1.738559348380277e-07, "loss": 0.1234, "step": 106340 }, { "epoch": 2.4797062166652095, "grad_norm": 1.4245632886886597, "learning_rate": 1.7378598520176583e-07, "loss": 0.104, "step": 106350 }, { "epoch": 2.4799393780420274, "grad_norm": 1.4454690217971802, "learning_rate": 1.7370826338369707e-07, "loss": 0.106, "step": 106360 }, { "epoch": 2.480172539418845, "grad_norm": 1.2932544946670532, "learning_rate": 1.7363054156562828e-07, "loss": 0.1024, "step": 106370 }, { "epoch": 2.480405700795663, "grad_norm": 2.1183340549468994, "learning_rate": 1.7355281974755954e-07, "loss": 0.0891, "step": 106380 }, { "epoch": 2.4806388621724813, "grad_norm": 1.0796599388122559, "learning_rate": 1.7347509792949075e-07, "loss": 0.1031, "step": 106390 }, { "epoch": 2.480872023549299, "grad_norm": 2.780930757522583, "learning_rate": 1.7339737611142198e-07, "loss": 0.1072, "step": 106400 }, { "epoch": 2.481105184926117, "grad_norm": 2.5020551681518555, "learning_rate": 1.7331965429335322e-07, "loss": 0.1043, "step": 106410 }, { "epoch": 2.481338346302935, "grad_norm": 1.6260186433792114, "learning_rate": 1.7324193247528446e-07, "loss": 0.0919, "step": 106420 }, { "epoch": 2.481571507679753, "grad_norm": 1.1348323822021484, "learning_rate": 1.7316421065721567e-07, "loss": 0.093, "step": 106430 }, { "epoch": 2.481804669056571, "grad_norm": 1.554855227470398, "learning_rate": 1.7308648883914693e-07, "loss": 0.1079, "step": 106440 }, { "epoch": 2.4820378304333888, "grad_norm": 1.40323805809021, "learning_rate": 1.7300876702107814e-07, "loss": 0.1075, "step": 106450 }, { "epoch": 2.4822709918102066, "grad_norm": 1.6843278408050537, "learning_rate": 1.7293104520300937e-07, "loss": 
0.102, "step": 106460 }, { "epoch": 2.4825041531870244, "grad_norm": 2.2002530097961426, "learning_rate": 1.728533233849406e-07, "loss": 0.1094, "step": 106470 }, { "epoch": 2.4827373145638427, "grad_norm": 1.5324162244796753, "learning_rate": 1.7277560156687184e-07, "loss": 0.0946, "step": 106480 }, { "epoch": 2.4829704759406606, "grad_norm": 1.862510323524475, "learning_rate": 1.7269787974880305e-07, "loss": 0.1116, "step": 106490 }, { "epoch": 2.4832036373174784, "grad_norm": 1.6020463705062866, "learning_rate": 1.7262015793073431e-07, "loss": 0.1, "step": 106500 }, { "epoch": 2.4834367986942962, "grad_norm": 2.2978014945983887, "learning_rate": 1.7254243611266552e-07, "loss": 0.1016, "step": 106510 }, { "epoch": 2.483669960071114, "grad_norm": 1.3691329956054688, "learning_rate": 1.7246471429459679e-07, "loss": 0.1112, "step": 106520 }, { "epoch": 2.4839031214479323, "grad_norm": 1.4571187496185303, "learning_rate": 1.72386992476528e-07, "loss": 0.0978, "step": 106530 }, { "epoch": 2.48413628282475, "grad_norm": 1.0168718099594116, "learning_rate": 1.7230927065845923e-07, "loss": 0.1043, "step": 106540 }, { "epoch": 2.484369444201568, "grad_norm": 1.3018766641616821, "learning_rate": 1.7223154884039047e-07, "loss": 0.106, "step": 106550 }, { "epoch": 2.484602605578386, "grad_norm": 1.050080418586731, "learning_rate": 1.721538270223217e-07, "loss": 0.1114, "step": 106560 }, { "epoch": 2.4848357669552037, "grad_norm": 1.143310546875, "learning_rate": 1.720761052042529e-07, "loss": 0.1027, "step": 106570 }, { "epoch": 2.485068928332022, "grad_norm": 1.397138237953186, "learning_rate": 1.7199838338618417e-07, "loss": 0.0969, "step": 106580 }, { "epoch": 2.48530208970884, "grad_norm": 1.6085816621780396, "learning_rate": 1.7192066156811538e-07, "loss": 0.0946, "step": 106590 }, { "epoch": 2.4855352510856576, "grad_norm": 1.707897663116455, "learning_rate": 1.7184293975004662e-07, "loss": 0.1089, "step": 106600 }, { "epoch": 2.4857684124624755, "grad_norm": 
2.083698034286499, "learning_rate": 1.7176521793197785e-07, "loss": 0.1112, "step": 106610 }, { "epoch": 2.4860015738392933, "grad_norm": 1.4245061874389648, "learning_rate": 1.716874961139091e-07, "loss": 0.1018, "step": 106620 }, { "epoch": 2.4862347352161116, "grad_norm": 1.1647671461105347, "learning_rate": 1.716097742958403e-07, "loss": 0.1117, "step": 106630 }, { "epoch": 2.4864678965929294, "grad_norm": 2.340296745300293, "learning_rate": 1.7153205247777156e-07, "loss": 0.0987, "step": 106640 }, { "epoch": 2.4867010579697473, "grad_norm": 1.1574172973632812, "learning_rate": 1.7145433065970277e-07, "loss": 0.1101, "step": 106650 }, { "epoch": 2.486934219346565, "grad_norm": 1.767887830734253, "learning_rate": 1.71376608841634e-07, "loss": 0.1155, "step": 106660 }, { "epoch": 2.487167380723383, "grad_norm": 2.065972089767456, "learning_rate": 1.7129888702356524e-07, "loss": 0.1017, "step": 106670 }, { "epoch": 2.4874005421002012, "grad_norm": 2.0838565826416016, "learning_rate": 1.7122116520549648e-07, "loss": 0.1068, "step": 106680 }, { "epoch": 2.487633703477019, "grad_norm": 1.2657020092010498, "learning_rate": 1.711434433874277e-07, "loss": 0.0947, "step": 106690 }, { "epoch": 2.487866864853837, "grad_norm": 1.5054630041122437, "learning_rate": 1.7106572156935895e-07, "loss": 0.1037, "step": 106700 }, { "epoch": 2.4881000262306547, "grad_norm": 1.3079743385314941, "learning_rate": 1.7098799975129016e-07, "loss": 0.1194, "step": 106710 }, { "epoch": 2.4883331876074726, "grad_norm": 3.571280002593994, "learning_rate": 1.7091027793322142e-07, "loss": 0.0994, "step": 106720 }, { "epoch": 2.488566348984291, "grad_norm": 2.2212226390838623, "learning_rate": 1.7083255611515263e-07, "loss": 0.1069, "step": 106730 }, { "epoch": 2.4887995103611087, "grad_norm": 1.280059576034546, "learning_rate": 1.7075483429708387e-07, "loss": 0.0985, "step": 106740 }, { "epoch": 2.4890326717379265, "grad_norm": 1.3580416440963745, "learning_rate": 1.706771124790151e-07, "loss": 
0.107, "step": 106750 }, { "epoch": 2.4892658331147444, "grad_norm": 1.024038553237915, "learning_rate": 1.7059939066094634e-07, "loss": 0.0952, "step": 106760 }, { "epoch": 2.4894989944915626, "grad_norm": 1.3187170028686523, "learning_rate": 1.7052166884287755e-07, "loss": 0.1059, "step": 106770 }, { "epoch": 2.4897321558683805, "grad_norm": 1.2744014263153076, "learning_rate": 1.704439470248088e-07, "loss": 0.1, "step": 106780 }, { "epoch": 2.4899653172451983, "grad_norm": 1.459342360496521, "learning_rate": 1.7036622520674002e-07, "loss": 0.0989, "step": 106790 }, { "epoch": 2.490198478622016, "grad_norm": 1.9069830179214478, "learning_rate": 1.7028850338867125e-07, "loss": 0.1079, "step": 106800 }, { "epoch": 2.4904316399988344, "grad_norm": 1.3899251222610474, "learning_rate": 1.702107815706025e-07, "loss": 0.1104, "step": 106810 }, { "epoch": 2.4906648013756523, "grad_norm": 1.4435946941375732, "learning_rate": 1.7013305975253372e-07, "loss": 0.0967, "step": 106820 }, { "epoch": 2.49089796275247, "grad_norm": 2.1160950660705566, "learning_rate": 1.7005533793446493e-07, "loss": 0.1078, "step": 106830 }, { "epoch": 2.491131124129288, "grad_norm": 1.4320286512374878, "learning_rate": 1.699776161163962e-07, "loss": 0.1022, "step": 106840 }, { "epoch": 2.491364285506106, "grad_norm": 1.4391469955444336, "learning_rate": 1.698998942983274e-07, "loss": 0.0921, "step": 106850 }, { "epoch": 2.491597446882924, "grad_norm": 2.1052372455596924, "learning_rate": 1.6982217248025864e-07, "loss": 0.0975, "step": 106860 }, { "epoch": 2.491830608259742, "grad_norm": 1.3117469549179077, "learning_rate": 1.6974445066218988e-07, "loss": 0.1054, "step": 106870 }, { "epoch": 2.4920637696365597, "grad_norm": 1.2564704418182373, "learning_rate": 1.696667288441211e-07, "loss": 0.0908, "step": 106880 }, { "epoch": 2.4922969310133776, "grad_norm": 2.772144317626953, "learning_rate": 1.6958900702605232e-07, "loss": 0.0969, "step": 106890 }, { "epoch": 2.4925300923901954, "grad_norm": 
1.238731861114502, "learning_rate": 1.6951128520798358e-07, "loss": 0.1071, "step": 106900 }, { "epoch": 2.4927632537670137, "grad_norm": 1.8864327669143677, "learning_rate": 1.694335633899148e-07, "loss": 0.1022, "step": 106910 }, { "epoch": 2.4929964151438315, "grad_norm": 2.4714534282684326, "learning_rate": 1.6935584157184605e-07, "loss": 0.0962, "step": 106920 }, { "epoch": 2.4932295765206494, "grad_norm": 0.940628170967102, "learning_rate": 1.6927811975377726e-07, "loss": 0.1063, "step": 106930 }, { "epoch": 2.493462737897467, "grad_norm": 1.6145552396774292, "learning_rate": 1.692003979357085e-07, "loss": 0.0998, "step": 106940 }, { "epoch": 2.493695899274285, "grad_norm": 1.166817545890808, "learning_rate": 1.6912267611763974e-07, "loss": 0.0972, "step": 106950 }, { "epoch": 2.4939290606511033, "grad_norm": 2.3107893466949463, "learning_rate": 1.6904495429957097e-07, "loss": 0.1068, "step": 106960 }, { "epoch": 2.494162222027921, "grad_norm": 2.646486282348633, "learning_rate": 1.6896723248150218e-07, "loss": 0.106, "step": 106970 }, { "epoch": 2.494395383404739, "grad_norm": 1.3266246318817139, "learning_rate": 1.6888951066343344e-07, "loss": 0.1033, "step": 106980 }, { "epoch": 2.494628544781557, "grad_norm": 1.5334433317184448, "learning_rate": 1.6881178884536465e-07, "loss": 0.1054, "step": 106990 }, { "epoch": 2.4948617061583747, "grad_norm": 2.3781135082244873, "learning_rate": 1.687340670272959e-07, "loss": 0.1046, "step": 107000 }, { "epoch": 2.495094867535193, "grad_norm": 1.8033756017684937, "learning_rate": 1.6865634520922712e-07, "loss": 0.1151, "step": 107010 }, { "epoch": 2.4953280289120108, "grad_norm": 1.8347644805908203, "learning_rate": 1.6857862339115836e-07, "loss": 0.1048, "step": 107020 }, { "epoch": 2.4955611902888286, "grad_norm": 1.5293896198272705, "learning_rate": 1.6850090157308957e-07, "loss": 0.1013, "step": 107030 }, { "epoch": 2.4957943516656464, "grad_norm": 2.648118734359741, "learning_rate": 1.6842317975502083e-07, "loss": 
0.0929, "step": 107040 }, { "epoch": 2.4960275130424643, "grad_norm": 1.3165831565856934, "learning_rate": 1.6834545793695204e-07, "loss": 0.1122, "step": 107050 }, { "epoch": 2.4962606744192826, "grad_norm": 1.1485404968261719, "learning_rate": 1.6826773611888327e-07, "loss": 0.1004, "step": 107060 }, { "epoch": 2.4964938357961004, "grad_norm": 1.4003334045410156, "learning_rate": 1.681900143008145e-07, "loss": 0.1005, "step": 107070 }, { "epoch": 2.4967269971729182, "grad_norm": 1.308172583580017, "learning_rate": 1.6811229248274575e-07, "loss": 0.1069, "step": 107080 }, { "epoch": 2.496960158549736, "grad_norm": 2.4232442378997803, "learning_rate": 1.6803457066467696e-07, "loss": 0.0992, "step": 107090 }, { "epoch": 2.4971933199265544, "grad_norm": 2.719998598098755, "learning_rate": 1.6795684884660822e-07, "loss": 0.1077, "step": 107100 }, { "epoch": 2.497426481303372, "grad_norm": 1.317330002784729, "learning_rate": 1.6787912702853945e-07, "loss": 0.0977, "step": 107110 }, { "epoch": 2.49765964268019, "grad_norm": 1.989184021949768, "learning_rate": 1.678014052104707e-07, "loss": 0.1038, "step": 107120 }, { "epoch": 2.497892804057008, "grad_norm": 2.055741310119629, "learning_rate": 1.6772368339240192e-07, "loss": 0.0951, "step": 107130 }, { "epoch": 2.4981259654338257, "grad_norm": 1.242194414138794, "learning_rate": 1.6764596157433313e-07, "loss": 0.0995, "step": 107140 }, { "epoch": 2.498359126810644, "grad_norm": 1.3396214246749878, "learning_rate": 1.675682397562644e-07, "loss": 0.1096, "step": 107150 }, { "epoch": 2.498592288187462, "grad_norm": 1.4154491424560547, "learning_rate": 1.674905179381956e-07, "loss": 0.1114, "step": 107160 }, { "epoch": 2.4988254495642797, "grad_norm": 2.243303060531616, "learning_rate": 1.6741279612012684e-07, "loss": 0.1052, "step": 107170 }, { "epoch": 2.4990586109410975, "grad_norm": 2.689541816711426, "learning_rate": 1.6733507430205808e-07, "loss": 0.1043, "step": 107180 }, { "epoch": 2.4992917723179158, "grad_norm": 
1.2386469841003418, "learning_rate": 1.672573524839893e-07, "loss": 0.0913, "step": 107190 }, { "epoch": 2.4995249336947336, "grad_norm": 1.7248347997665405, "learning_rate": 1.6717963066592052e-07, "loss": 0.0966, "step": 107200 }, { "epoch": 2.4997580950715514, "grad_norm": 1.4323772192001343, "learning_rate": 1.6710190884785178e-07, "loss": 0.1053, "step": 107210 }, { "epoch": 2.4999912564483693, "grad_norm": 1.4287052154541016, "learning_rate": 1.67024187029783e-07, "loss": 0.0907, "step": 107220 }, { "epoch": 2.500224417825187, "grad_norm": 2.158101797103882, "learning_rate": 1.6694646521171423e-07, "loss": 0.0998, "step": 107230 }, { "epoch": 2.5004575792020054, "grad_norm": 2.9200093746185303, "learning_rate": 1.6686874339364546e-07, "loss": 0.1154, "step": 107240 }, { "epoch": 2.5006907405788232, "grad_norm": 3.0734801292419434, "learning_rate": 1.667910215755767e-07, "loss": 0.099, "step": 107250 }, { "epoch": 2.500923901955641, "grad_norm": 1.1197175979614258, "learning_rate": 1.667132997575079e-07, "loss": 0.0942, "step": 107260 }, { "epoch": 2.501157063332459, "grad_norm": 1.9786560535430908, "learning_rate": 1.6663557793943917e-07, "loss": 0.109, "step": 107270 }, { "epoch": 2.5013902247092767, "grad_norm": 1.1415083408355713, "learning_rate": 1.6655785612137038e-07, "loss": 0.0971, "step": 107280 }, { "epoch": 2.501623386086095, "grad_norm": 2.362069606781006, "learning_rate": 1.6648013430330162e-07, "loss": 0.0948, "step": 107290 }, { "epoch": 2.501856547462913, "grad_norm": 1.568804144859314, "learning_rate": 1.6640241248523285e-07, "loss": 0.1092, "step": 107300 }, { "epoch": 2.5020897088397307, "grad_norm": 2.018017292022705, "learning_rate": 1.663246906671641e-07, "loss": 0.1125, "step": 107310 }, { "epoch": 2.5023228702165485, "grad_norm": 1.0092530250549316, "learning_rate": 1.662469688490953e-07, "loss": 0.0916, "step": 107320 }, { "epoch": 2.5025560315933664, "grad_norm": 2.347391366958618, "learning_rate": 1.6616924703102656e-07, "loss": 
0.0945, "step": 107330 }, { "epoch": 2.5027891929701847, "grad_norm": 1.5457574129104614, "learning_rate": 1.6609152521295777e-07, "loss": 0.1107, "step": 107340 }, { "epoch": 2.5030223543470025, "grad_norm": 1.3526111841201782, "learning_rate": 1.6601380339488903e-07, "loss": 0.115, "step": 107350 }, { "epoch": 2.5032555157238203, "grad_norm": 1.2687461376190186, "learning_rate": 1.6593608157682024e-07, "loss": 0.0992, "step": 107360 }, { "epoch": 2.503488677100638, "grad_norm": 1.3310738801956177, "learning_rate": 1.6585835975875147e-07, "loss": 0.1082, "step": 107370 }, { "epoch": 2.503721838477456, "grad_norm": 2.042182445526123, "learning_rate": 1.657806379406827e-07, "loss": 0.1085, "step": 107380 }, { "epoch": 2.5039549998542743, "grad_norm": 2.5426995754241943, "learning_rate": 1.6570291612261395e-07, "loss": 0.097, "step": 107390 }, { "epoch": 2.504188161231092, "grad_norm": 1.2011067867279053, "learning_rate": 1.6562519430454516e-07, "loss": 0.0919, "step": 107400 }, { "epoch": 2.50442132260791, "grad_norm": 1.3288652896881104, "learning_rate": 1.6554747248647642e-07, "loss": 0.1066, "step": 107410 }, { "epoch": 2.504654483984728, "grad_norm": 1.5166929960250854, "learning_rate": 1.6546975066840763e-07, "loss": 0.1037, "step": 107420 }, { "epoch": 2.5048876453615456, "grad_norm": 1.372004747390747, "learning_rate": 1.6539202885033886e-07, "loss": 0.1016, "step": 107430 }, { "epoch": 2.505120806738364, "grad_norm": 1.4509308338165283, "learning_rate": 1.653143070322701e-07, "loss": 0.1113, "step": 107440 }, { "epoch": 2.5053539681151817, "grad_norm": 2.808438777923584, "learning_rate": 1.6523658521420133e-07, "loss": 0.1094, "step": 107450 }, { "epoch": 2.5055871294919996, "grad_norm": 2.9633662700653076, "learning_rate": 1.6515886339613254e-07, "loss": 0.1007, "step": 107460 }, { "epoch": 2.505820290868818, "grad_norm": 1.2804008722305298, "learning_rate": 1.650811415780638e-07, "loss": 0.1176, "step": 107470 }, { "epoch": 2.5060534522456352, "grad_norm": 
2.1450159549713135, "learning_rate": 1.6500341975999501e-07, "loss": 0.1002, "step": 107480 }, { "epoch": 2.5062866136224535, "grad_norm": 3.6695165634155273, "learning_rate": 1.6492569794192625e-07, "loss": 0.0998, "step": 107490 }, { "epoch": 2.5065197749992714, "grad_norm": 1.7344106435775757, "learning_rate": 1.6484797612385749e-07, "loss": 0.1079, "step": 107500 }, { "epoch": 2.506752936376089, "grad_norm": 1.6719889640808105, "learning_rate": 1.6477025430578872e-07, "loss": 0.1051, "step": 107510 }, { "epoch": 2.5069860977529075, "grad_norm": 1.3009659051895142, "learning_rate": 1.6469253248771993e-07, "loss": 0.0953, "step": 107520 }, { "epoch": 2.5072192591297253, "grad_norm": 1.5352368354797363, "learning_rate": 1.646148106696512e-07, "loss": 0.1042, "step": 107530 }, { "epoch": 2.507452420506543, "grad_norm": 1.563373327255249, "learning_rate": 1.645370888515824e-07, "loss": 0.1077, "step": 107540 }, { "epoch": 2.507685581883361, "grad_norm": 1.0603976249694824, "learning_rate": 1.6445936703351366e-07, "loss": 0.1028, "step": 107550 }, { "epoch": 2.507918743260179, "grad_norm": 2.3664138317108154, "learning_rate": 1.6438164521544487e-07, "loss": 0.1051, "step": 107560 }, { "epoch": 2.508151904636997, "grad_norm": 1.8363851308822632, "learning_rate": 1.643039233973761e-07, "loss": 0.1047, "step": 107570 }, { "epoch": 2.508385066013815, "grad_norm": 4.047418117523193, "learning_rate": 1.6422620157930734e-07, "loss": 0.1072, "step": 107580 }, { "epoch": 2.508618227390633, "grad_norm": 1.5339815616607666, "learning_rate": 1.6414847976123858e-07, "loss": 0.1048, "step": 107590 }, { "epoch": 2.5088513887674506, "grad_norm": 1.5817523002624512, "learning_rate": 1.640707579431698e-07, "loss": 0.108, "step": 107600 }, { "epoch": 2.5090845501442685, "grad_norm": 1.7279778718948364, "learning_rate": 1.6399303612510105e-07, "loss": 0.1026, "step": 107610 }, { "epoch": 2.5093177115210867, "grad_norm": 2.2235841751098633, "learning_rate": 1.6391531430703226e-07, 
"loss": 0.1019, "step": 107620 }, { "epoch": 2.5095508728979046, "grad_norm": 1.609326958656311, "learning_rate": 1.638375924889635e-07, "loss": 0.1135, "step": 107630 }, { "epoch": 2.5097840342747224, "grad_norm": 1.5052893161773682, "learning_rate": 1.6375987067089473e-07, "loss": 0.0921, "step": 107640 }, { "epoch": 2.5100171956515402, "grad_norm": 2.308223009109497, "learning_rate": 1.6368214885282597e-07, "loss": 0.094, "step": 107650 }, { "epoch": 2.510250357028358, "grad_norm": 1.0986086130142212, "learning_rate": 1.6360442703475718e-07, "loss": 0.1017, "step": 107660 }, { "epoch": 2.5104835184051764, "grad_norm": 1.619519591331482, "learning_rate": 1.6352670521668844e-07, "loss": 0.0999, "step": 107670 }, { "epoch": 2.510716679781994, "grad_norm": 1.3195358514785767, "learning_rate": 1.6344898339861965e-07, "loss": 0.1004, "step": 107680 }, { "epoch": 2.510949841158812, "grad_norm": 1.185507893562317, "learning_rate": 1.6337126158055088e-07, "loss": 0.1004, "step": 107690 }, { "epoch": 2.51118300253563, "grad_norm": 1.8644328117370605, "learning_rate": 1.6329353976248212e-07, "loss": 0.1028, "step": 107700 }, { "epoch": 2.5114161639124477, "grad_norm": 1.4724113941192627, "learning_rate": 1.6321581794441336e-07, "loss": 0.1095, "step": 107710 }, { "epoch": 2.511649325289266, "grad_norm": 1.4980807304382324, "learning_rate": 1.6313809612634456e-07, "loss": 0.1084, "step": 107720 }, { "epoch": 2.511882486666084, "grad_norm": 1.0088344812393188, "learning_rate": 1.6306037430827583e-07, "loss": 0.1109, "step": 107730 }, { "epoch": 2.5121156480429017, "grad_norm": 1.5244766473770142, "learning_rate": 1.6298265249020704e-07, "loss": 0.1035, "step": 107740 }, { "epoch": 2.5123488094197195, "grad_norm": 1.3720488548278809, "learning_rate": 1.629049306721383e-07, "loss": 0.0958, "step": 107750 }, { "epoch": 2.5125819707965373, "grad_norm": 1.3051412105560303, "learning_rate": 1.628272088540695e-07, "loss": 0.1058, "step": 107760 }, { "epoch": 2.5128151321733556, 
"grad_norm": 1.418270468711853, "learning_rate": 1.6274948703600074e-07, "loss": 0.1111, "step": 107770 }, { "epoch": 2.5130482935501735, "grad_norm": 2.085618019104004, "learning_rate": 1.6267176521793198e-07, "loss": 0.1094, "step": 107780 }, { "epoch": 2.5132814549269913, "grad_norm": 1.356042504310608, "learning_rate": 1.6259404339986321e-07, "loss": 0.1065, "step": 107790 }, { "epoch": 2.513514616303809, "grad_norm": 1.3488123416900635, "learning_rate": 1.6251632158179442e-07, "loss": 0.1141, "step": 107800 }, { "epoch": 2.513747777680627, "grad_norm": 2.095038890838623, "learning_rate": 1.6243859976372569e-07, "loss": 0.1079, "step": 107810 }, { "epoch": 2.5139809390574452, "grad_norm": 2.785930871963501, "learning_rate": 1.623608779456569e-07, "loss": 0.1005, "step": 107820 }, { "epoch": 2.514214100434263, "grad_norm": 1.5502543449401855, "learning_rate": 1.6228315612758813e-07, "loss": 0.1001, "step": 107830 }, { "epoch": 2.514447261811081, "grad_norm": 1.3664785623550415, "learning_rate": 1.6220543430951937e-07, "loss": 0.1011, "step": 107840 }, { "epoch": 2.514680423187899, "grad_norm": 2.807953357696533, "learning_rate": 1.621277124914506e-07, "loss": 0.1112, "step": 107850 }, { "epoch": 2.5149135845647166, "grad_norm": 2.1109094619750977, "learning_rate": 1.620499906733818e-07, "loss": 0.101, "step": 107860 }, { "epoch": 2.515146745941535, "grad_norm": 1.3277149200439453, "learning_rate": 1.6197226885531307e-07, "loss": 0.114, "step": 107870 }, { "epoch": 2.5153799073183527, "grad_norm": 1.6726031303405762, "learning_rate": 1.6189454703724428e-07, "loss": 0.1006, "step": 107880 }, { "epoch": 2.5156130686951705, "grad_norm": 1.0682315826416016, "learning_rate": 1.6181682521917552e-07, "loss": 0.0958, "step": 107890 }, { "epoch": 2.515846230071989, "grad_norm": 1.7828378677368164, "learning_rate": 1.6173910340110675e-07, "loss": 0.0979, "step": 107900 }, { "epoch": 2.5160793914488067, "grad_norm": 1.5713520050048828, "learning_rate": 1.61661381583038e-07, 
"loss": 0.1105, "step": 107910 }, { "epoch": 2.5163125528256245, "grad_norm": 1.1968187093734741, "learning_rate": 1.615836597649692e-07, "loss": 0.0946, "step": 107920 }, { "epoch": 2.5165457142024423, "grad_norm": 1.2226152420043945, "learning_rate": 1.6150593794690046e-07, "loss": 0.0968, "step": 107930 }, { "epoch": 2.51677887557926, "grad_norm": 1.4429538249969482, "learning_rate": 1.6142821612883167e-07, "loss": 0.1064, "step": 107940 }, { "epoch": 2.5170120369560784, "grad_norm": 1.1830635070800781, "learning_rate": 1.613504943107629e-07, "loss": 0.1045, "step": 107950 }, { "epoch": 2.5172451983328963, "grad_norm": 1.9345781803131104, "learning_rate": 1.6127277249269414e-07, "loss": 0.0998, "step": 107960 }, { "epoch": 2.517478359709714, "grad_norm": 1.8978649377822876, "learning_rate": 1.6119505067462538e-07, "loss": 0.1043, "step": 107970 }, { "epoch": 2.517711521086532, "grad_norm": 1.6767622232437134, "learning_rate": 1.611173288565566e-07, "loss": 0.1167, "step": 107980 }, { "epoch": 2.51794468246335, "grad_norm": 1.1867767572402954, "learning_rate": 1.6103960703848785e-07, "loss": 0.0956, "step": 107990 }, { "epoch": 2.518177843840168, "grad_norm": 1.25746488571167, "learning_rate": 1.6096188522041906e-07, "loss": 0.1055, "step": 108000 }, { "epoch": 2.518411005216986, "grad_norm": 2.3545281887054443, "learning_rate": 1.6088416340235032e-07, "loss": 0.096, "step": 108010 }, { "epoch": 2.5186441665938037, "grad_norm": 1.8296493291854858, "learning_rate": 1.6080644158428153e-07, "loss": 0.1022, "step": 108020 }, { "epoch": 2.5188773279706216, "grad_norm": 2.0047638416290283, "learning_rate": 1.6072871976621277e-07, "loss": 0.0914, "step": 108030 }, { "epoch": 2.5191104893474394, "grad_norm": 1.5616462230682373, "learning_rate": 1.60650997948144e-07, "loss": 0.104, "step": 108040 }, { "epoch": 2.5193436507242577, "grad_norm": 1.2560863494873047, "learning_rate": 1.6057327613007524e-07, "loss": 0.0955, "step": 108050 }, { "epoch": 2.5195768121010755, 
"grad_norm": 2.176622152328491, "learning_rate": 1.6049555431200645e-07, "loss": 0.1086, "step": 108060 }, { "epoch": 2.5198099734778934, "grad_norm": 2.4620516300201416, "learning_rate": 1.604178324939377e-07, "loss": 0.1053, "step": 108070 }, { "epoch": 2.520043134854711, "grad_norm": 1.413917899131775, "learning_rate": 1.6034011067586892e-07, "loss": 0.1124, "step": 108080 }, { "epoch": 2.520276296231529, "grad_norm": 1.3164888620376587, "learning_rate": 1.6026238885780015e-07, "loss": 0.1041, "step": 108090 }, { "epoch": 2.5205094576083473, "grad_norm": 1.5951625108718872, "learning_rate": 1.601846670397314e-07, "loss": 0.104, "step": 108100 }, { "epoch": 2.520742618985165, "grad_norm": 1.2345478534698486, "learning_rate": 1.6010694522166262e-07, "loss": 0.106, "step": 108110 }, { "epoch": 2.520975780361983, "grad_norm": 1.2521047592163086, "learning_rate": 1.6002922340359383e-07, "loss": 0.1164, "step": 108120 }, { "epoch": 2.521208941738801, "grad_norm": 2.2470953464508057, "learning_rate": 1.599515015855251e-07, "loss": 0.0961, "step": 108130 }, { "epoch": 2.5214421031156187, "grad_norm": 1.4276201725006104, "learning_rate": 1.598737797674563e-07, "loss": 0.0998, "step": 108140 }, { "epoch": 2.521675264492437, "grad_norm": 1.9160493612289429, "learning_rate": 1.5979605794938754e-07, "loss": 0.1023, "step": 108150 }, { "epoch": 2.521908425869255, "grad_norm": 1.2703076601028442, "learning_rate": 1.5971833613131878e-07, "loss": 0.1098, "step": 108160 }, { "epoch": 2.5221415872460726, "grad_norm": 2.133206367492676, "learning_rate": 1.5964061431325e-07, "loss": 0.1096, "step": 108170 }, { "epoch": 2.5223747486228905, "grad_norm": 1.8315048217773438, "learning_rate": 1.5956289249518122e-07, "loss": 0.1005, "step": 108180 }, { "epoch": 2.5226079099997083, "grad_norm": 1.2148123979568481, "learning_rate": 1.5948517067711248e-07, "loss": 0.1051, "step": 108190 }, { "epoch": 2.5228410713765266, "grad_norm": 2.991722822189331, "learning_rate": 1.594074488590437e-07, 
"loss": 0.1027, "step": 108200 }, { "epoch": 2.5230742327533444, "grad_norm": 1.6942486763000488, "learning_rate": 1.5932972704097495e-07, "loss": 0.1031, "step": 108210 }, { "epoch": 2.5233073941301623, "grad_norm": 1.8465054035186768, "learning_rate": 1.5925200522290616e-07, "loss": 0.0997, "step": 108220 }, { "epoch": 2.5235405555069805, "grad_norm": 2.54974102973938, "learning_rate": 1.591742834048374e-07, "loss": 0.0906, "step": 108230 }, { "epoch": 2.523773716883798, "grad_norm": 1.7746825218200684, "learning_rate": 1.5909656158676864e-07, "loss": 0.1068, "step": 108240 }, { "epoch": 2.524006878260616, "grad_norm": 1.6373552083969116, "learning_rate": 1.5901883976869987e-07, "loss": 0.1129, "step": 108250 }, { "epoch": 2.524240039637434, "grad_norm": 1.38238525390625, "learning_rate": 1.5894111795063108e-07, "loss": 0.1105, "step": 108260 }, { "epoch": 2.524473201014252, "grad_norm": 1.515162467956543, "learning_rate": 1.5886339613256234e-07, "loss": 0.1147, "step": 108270 }, { "epoch": 2.52470636239107, "grad_norm": 1.3499149084091187, "learning_rate": 1.5878567431449355e-07, "loss": 0.097, "step": 108280 }, { "epoch": 2.524939523767888, "grad_norm": 1.2435879707336426, "learning_rate": 1.587079524964248e-07, "loss": 0.1035, "step": 108290 }, { "epoch": 2.525172685144706, "grad_norm": 1.237954020500183, "learning_rate": 1.5863023067835602e-07, "loss": 0.0938, "step": 108300 }, { "epoch": 2.5254058465215237, "grad_norm": 1.3345999717712402, "learning_rate": 1.5855250886028726e-07, "loss": 0.0971, "step": 108310 }, { "epoch": 2.5256390078983415, "grad_norm": 1.324588656425476, "learning_rate": 1.5847478704221847e-07, "loss": 0.0978, "step": 108320 }, { "epoch": 2.52587216927516, "grad_norm": 1.1814764738082886, "learning_rate": 1.5839706522414973e-07, "loss": 0.1014, "step": 108330 }, { "epoch": 2.5261053306519776, "grad_norm": 1.5651317834854126, "learning_rate": 1.5831934340608094e-07, "loss": 0.1075, "step": 108340 }, { "epoch": 2.5263384920287955, 
"grad_norm": 1.3670625686645508, "learning_rate": 1.5824162158801217e-07, "loss": 0.1022, "step": 108350 }, { "epoch": 2.5265716534056133, "grad_norm": 3.020195484161377, "learning_rate": 1.581638997699434e-07, "loss": 0.1036, "step": 108360 }, { "epoch": 2.526804814782431, "grad_norm": 1.7283176183700562, "learning_rate": 1.5808617795187465e-07, "loss": 0.1031, "step": 108370 }, { "epoch": 2.5270379761592494, "grad_norm": 1.2136117219924927, "learning_rate": 1.5800845613380586e-07, "loss": 0.1054, "step": 108380 }, { "epoch": 2.5272711375360672, "grad_norm": 2.213261842727661, "learning_rate": 1.5793073431573712e-07, "loss": 0.0988, "step": 108390 }, { "epoch": 2.527504298912885, "grad_norm": 1.257859230041504, "learning_rate": 1.5785301249766833e-07, "loss": 0.0965, "step": 108400 }, { "epoch": 2.527737460289703, "grad_norm": 1.5476999282836914, "learning_rate": 1.577752906795996e-07, "loss": 0.0941, "step": 108410 }, { "epoch": 2.5279706216665208, "grad_norm": 1.1448991298675537, "learning_rate": 1.576975688615308e-07, "loss": 0.1014, "step": 108420 }, { "epoch": 2.528203783043339, "grad_norm": 1.9977527856826782, "learning_rate": 1.5761984704346203e-07, "loss": 0.1043, "step": 108430 }, { "epoch": 2.528436944420157, "grad_norm": 1.5536820888519287, "learning_rate": 1.5754212522539327e-07, "loss": 0.099, "step": 108440 }, { "epoch": 2.5286701057969747, "grad_norm": 1.6059070825576782, "learning_rate": 1.574644034073245e-07, "loss": 0.1056, "step": 108450 }, { "epoch": 2.5289032671737925, "grad_norm": 1.540440559387207, "learning_rate": 1.5738668158925571e-07, "loss": 0.1083, "step": 108460 }, { "epoch": 2.5291364285506104, "grad_norm": 1.3152104616165161, "learning_rate": 1.5730895977118698e-07, "loss": 0.1004, "step": 108470 }, { "epoch": 2.5293695899274287, "grad_norm": 1.0107420682907104, "learning_rate": 1.5723123795311819e-07, "loss": 0.1027, "step": 108480 }, { "epoch": 2.5296027513042465, "grad_norm": 1.9165817499160767, "learning_rate": 
1.5715351613504942e-07, "loss": 0.1149, "step": 108490 }, { "epoch": 2.5298359126810643, "grad_norm": 1.6394743919372559, "learning_rate": 1.5707579431698066e-07, "loss": 0.1052, "step": 108500 }, { "epoch": 2.530069074057882, "grad_norm": 1.2258273363113403, "learning_rate": 1.569980724989119e-07, "loss": 0.0941, "step": 108510 }, { "epoch": 2.5303022354347, "grad_norm": 1.451756477355957, "learning_rate": 1.569203506808431e-07, "loss": 0.1127, "step": 108520 }, { "epoch": 2.5305353968115183, "grad_norm": 1.2403898239135742, "learning_rate": 1.5684262886277436e-07, "loss": 0.1025, "step": 108530 }, { "epoch": 2.530768558188336, "grad_norm": 2.1505396366119385, "learning_rate": 1.5676490704470557e-07, "loss": 0.1084, "step": 108540 }, { "epoch": 2.531001719565154, "grad_norm": 2.009378671646118, "learning_rate": 1.566871852266368e-07, "loss": 0.1077, "step": 108550 }, { "epoch": 2.531234880941972, "grad_norm": 1.594220519065857, "learning_rate": 1.5660946340856804e-07, "loss": 0.1008, "step": 108560 }, { "epoch": 2.5314680423187896, "grad_norm": 2.9264562129974365, "learning_rate": 1.5653174159049928e-07, "loss": 0.1113, "step": 108570 }, { "epoch": 2.531701203695608, "grad_norm": 1.378010630607605, "learning_rate": 1.564540197724305e-07, "loss": 0.1109, "step": 108580 }, { "epoch": 2.5319343650724258, "grad_norm": 1.2358531951904297, "learning_rate": 1.5637629795436175e-07, "loss": 0.107, "step": 108590 }, { "epoch": 2.5321675264492436, "grad_norm": 1.5588269233703613, "learning_rate": 1.5629857613629296e-07, "loss": 0.1073, "step": 108600 }, { "epoch": 2.532400687826062, "grad_norm": 1.3835703134536743, "learning_rate": 1.5622085431822422e-07, "loss": 0.1073, "step": 108610 }, { "epoch": 2.5326338492028797, "grad_norm": 1.2804961204528809, "learning_rate": 1.5614313250015543e-07, "loss": 0.102, "step": 108620 }, { "epoch": 2.5328670105796975, "grad_norm": 2.0241641998291016, "learning_rate": 1.5606541068208667e-07, "loss": 0.0894, "step": 108630 }, { "epoch": 
2.5331001719565154, "grad_norm": 1.0567504167556763, "learning_rate": 1.559876888640179e-07, "loss": 0.1039, "step": 108640 }, { "epoch": 2.533333333333333, "grad_norm": 1.2792279720306396, "learning_rate": 1.5590996704594914e-07, "loss": 0.1079, "step": 108650 }, { "epoch": 2.5335664947101515, "grad_norm": 2.4037816524505615, "learning_rate": 1.5583224522788035e-07, "loss": 0.1166, "step": 108660 }, { "epoch": 2.5337996560869693, "grad_norm": 1.3546059131622314, "learning_rate": 1.557545234098116e-07, "loss": 0.113, "step": 108670 }, { "epoch": 2.534032817463787, "grad_norm": 1.173958659172058, "learning_rate": 1.5567680159174282e-07, "loss": 0.0992, "step": 108680 }, { "epoch": 2.534265978840605, "grad_norm": 1.479486346244812, "learning_rate": 1.5559907977367406e-07, "loss": 0.0994, "step": 108690 }, { "epoch": 2.534499140217423, "grad_norm": 1.4749780893325806, "learning_rate": 1.555213579556053e-07, "loss": 0.1021, "step": 108700 }, { "epoch": 2.534732301594241, "grad_norm": 2.0658271312713623, "learning_rate": 1.5544363613753653e-07, "loss": 0.0967, "step": 108710 }, { "epoch": 2.534965462971059, "grad_norm": 1.9056421518325806, "learning_rate": 1.5536591431946774e-07, "loss": 0.1002, "step": 108720 }, { "epoch": 2.535198624347877, "grad_norm": 2.415158987045288, "learning_rate": 1.55288192501399e-07, "loss": 0.0986, "step": 108730 }, { "epoch": 2.5354317857246946, "grad_norm": 1.6764453649520874, "learning_rate": 1.552104706833302e-07, "loss": 0.101, "step": 108740 }, { "epoch": 2.5356649471015125, "grad_norm": 2.795283079147339, "learning_rate": 1.5513274886526144e-07, "loss": 0.1048, "step": 108750 }, { "epoch": 2.5358981084783307, "grad_norm": 2.0985233783721924, "learning_rate": 1.5505502704719268e-07, "loss": 0.1027, "step": 108760 }, { "epoch": 2.5361312698551486, "grad_norm": 1.4237431287765503, "learning_rate": 1.5497730522912391e-07, "loss": 0.1086, "step": 108770 }, { "epoch": 2.5363644312319664, "grad_norm": 2.3110220432281494, "learning_rate": 
1.5489958341105512e-07, "loss": 0.1016, "step": 108780 }, { "epoch": 2.5365975926087843, "grad_norm": 1.5340933799743652, "learning_rate": 1.5482186159298639e-07, "loss": 0.1115, "step": 108790 }, { "epoch": 2.536830753985602, "grad_norm": 1.9108465909957886, "learning_rate": 1.547441397749176e-07, "loss": 0.0963, "step": 108800 }, { "epoch": 2.5370639153624204, "grad_norm": 3.2394661903381348, "learning_rate": 1.5466641795684883e-07, "loss": 0.104, "step": 108810 }, { "epoch": 2.537297076739238, "grad_norm": 2.4892337322235107, "learning_rate": 1.5458869613878007e-07, "loss": 0.099, "step": 108820 }, { "epoch": 2.537530238116056, "grad_norm": 1.750178575515747, "learning_rate": 1.545109743207113e-07, "loss": 0.1059, "step": 108830 }, { "epoch": 2.537763399492874, "grad_norm": 1.1351683139801025, "learning_rate": 1.5443325250264254e-07, "loss": 0.1077, "step": 108840 }, { "epoch": 2.5379965608696917, "grad_norm": 1.1097886562347412, "learning_rate": 1.5435553068457377e-07, "loss": 0.1099, "step": 108850 }, { "epoch": 2.53822972224651, "grad_norm": 4.27453088760376, "learning_rate": 1.5427780886650498e-07, "loss": 0.1088, "step": 108860 }, { "epoch": 2.538462883623328, "grad_norm": 1.48793363571167, "learning_rate": 1.5420008704843624e-07, "loss": 0.1017, "step": 108870 }, { "epoch": 2.5386960450001457, "grad_norm": 1.9693968296051025, "learning_rate": 1.5412236523036745e-07, "loss": 0.1095, "step": 108880 }, { "epoch": 2.5389292063769635, "grad_norm": 1.6994566917419434, "learning_rate": 1.540446434122987e-07, "loss": 0.1189, "step": 108890 }, { "epoch": 2.5391623677537813, "grad_norm": 4.919279098510742, "learning_rate": 1.5396692159422993e-07, "loss": 0.1171, "step": 108900 }, { "epoch": 2.5393955291305996, "grad_norm": 1.5925534963607788, "learning_rate": 1.5388919977616116e-07, "loss": 0.1011, "step": 108910 }, { "epoch": 2.5396286905074175, "grad_norm": 1.4099633693695068, "learning_rate": 1.5381147795809237e-07, "loss": 0.0937, "step": 108920 }, { "epoch": 
2.5398618518842353, "grad_norm": 1.7951089143753052, "learning_rate": 1.5373375614002363e-07, "loss": 0.1117, "step": 108930 }, { "epoch": 2.5400950132610536, "grad_norm": 1.454369068145752, "learning_rate": 1.5365603432195484e-07, "loss": 0.1069, "step": 108940 }, { "epoch": 2.540328174637871, "grad_norm": 2.2585859298706055, "learning_rate": 1.5357831250388608e-07, "loss": 0.1027, "step": 108950 }, { "epoch": 2.5405613360146893, "grad_norm": 2.5902822017669678, "learning_rate": 1.535005906858173e-07, "loss": 0.1032, "step": 108960 }, { "epoch": 2.540794497391507, "grad_norm": 1.1679478883743286, "learning_rate": 1.5342286886774855e-07, "loss": 0.0984, "step": 108970 }, { "epoch": 2.541027658768325, "grad_norm": 2.141535997390747, "learning_rate": 1.5334514704967976e-07, "loss": 0.0928, "step": 108980 }, { "epoch": 2.541260820145143, "grad_norm": 1.502985954284668, "learning_rate": 1.5326742523161102e-07, "loss": 0.0911, "step": 108990 }, { "epoch": 2.541493981521961, "grad_norm": 1.2031985521316528, "learning_rate": 1.5318970341354223e-07, "loss": 0.0923, "step": 109000 }, { "epoch": 2.541727142898779, "grad_norm": 4.505914688110352, "learning_rate": 1.5311198159547346e-07, "loss": 0.1042, "step": 109010 }, { "epoch": 2.5419603042755967, "grad_norm": 1.5744739770889282, "learning_rate": 1.530342597774047e-07, "loss": 0.1099, "step": 109020 }, { "epoch": 2.5421934656524146, "grad_norm": 1.6976141929626465, "learning_rate": 1.5295653795933594e-07, "loss": 0.1001, "step": 109030 }, { "epoch": 2.542426627029233, "grad_norm": 1.3924700021743774, "learning_rate": 1.5287881614126715e-07, "loss": 0.1051, "step": 109040 }, { "epoch": 2.5426597884060507, "grad_norm": 1.3947607278823853, "learning_rate": 1.528010943231984e-07, "loss": 0.1007, "step": 109050 }, { "epoch": 2.5428929497828685, "grad_norm": 2.019357442855835, "learning_rate": 1.5272337250512962e-07, "loss": 0.1077, "step": 109060 }, { "epoch": 2.5431261111596863, "grad_norm": 2.35209321975708, "learning_rate": 
1.5264565068706088e-07, "loss": 0.1116, "step": 109070 }, { "epoch": 2.543359272536504, "grad_norm": 1.2583218812942505, "learning_rate": 1.5257570105079898e-07, "loss": 0.109, "step": 109080 }, { "epoch": 2.5435924339133225, "grad_norm": 0.9057754278182983, "learning_rate": 1.524979792327302e-07, "loss": 0.1077, "step": 109090 }, { "epoch": 2.5438255952901403, "grad_norm": 2.8422887325286865, "learning_rate": 1.5242025741466145e-07, "loss": 0.1127, "step": 109100 }, { "epoch": 2.544058756666958, "grad_norm": 1.203284502029419, "learning_rate": 1.5234253559659266e-07, "loss": 0.109, "step": 109110 }, { "epoch": 2.544291918043776, "grad_norm": 3.085977077484131, "learning_rate": 1.522648137785239e-07, "loss": 0.1117, "step": 109120 }, { "epoch": 2.544525079420594, "grad_norm": 1.232015609741211, "learning_rate": 1.5218709196045513e-07, "loss": 0.1049, "step": 109130 }, { "epoch": 2.544758240797412, "grad_norm": 1.3099368810653687, "learning_rate": 1.5210937014238637e-07, "loss": 0.1008, "step": 109140 }, { "epoch": 2.54499140217423, "grad_norm": 1.6385507583618164, "learning_rate": 1.5203164832431757e-07, "loss": 0.1003, "step": 109150 }, { "epoch": 2.5452245635510478, "grad_norm": 1.2732515335083008, "learning_rate": 1.5195392650624884e-07, "loss": 0.1121, "step": 109160 }, { "epoch": 2.5454577249278656, "grad_norm": 1.8009002208709717, "learning_rate": 1.5187620468818005e-07, "loss": 0.1023, "step": 109170 }, { "epoch": 2.5456908863046834, "grad_norm": 1.6138020753860474, "learning_rate": 1.5179848287011128e-07, "loss": 0.1024, "step": 109180 }, { "epoch": 2.5459240476815017, "grad_norm": 1.8591039180755615, "learning_rate": 1.5172076105204252e-07, "loss": 0.1049, "step": 109190 }, { "epoch": 2.5461572090583195, "grad_norm": 1.7564431428909302, "learning_rate": 1.5164303923397375e-07, "loss": 0.1089, "step": 109200 }, { "epoch": 2.5463903704351374, "grad_norm": 1.7260030508041382, "learning_rate": 1.5156531741590496e-07, "loss": 0.1059, "step": 109210 }, { 
"epoch": 2.546623531811955, "grad_norm": 1.3319268226623535, "learning_rate": 1.5148759559783622e-07, "loss": 0.1, "step": 109220 }, { "epoch": 2.546856693188773, "grad_norm": 2.8928380012512207, "learning_rate": 1.5140987377976743e-07, "loss": 0.1108, "step": 109230 }, { "epoch": 2.5470898545655913, "grad_norm": 1.323212742805481, "learning_rate": 1.513321519616987e-07, "loss": 0.1028, "step": 109240 }, { "epoch": 2.547323015942409, "grad_norm": 1.192042350769043, "learning_rate": 1.512544301436299e-07, "loss": 0.1094, "step": 109250 }, { "epoch": 2.547556177319227, "grad_norm": 1.2915319204330444, "learning_rate": 1.5117670832556114e-07, "loss": 0.1062, "step": 109260 }, { "epoch": 2.547789338696045, "grad_norm": 3.1731762886047363, "learning_rate": 1.5109898650749238e-07, "loss": 0.0965, "step": 109270 }, { "epoch": 2.5480225000728627, "grad_norm": 1.4325335025787354, "learning_rate": 1.510212646894236e-07, "loss": 0.1033, "step": 109280 }, { "epoch": 2.548255661449681, "grad_norm": 1.3822147846221924, "learning_rate": 1.5094354287135482e-07, "loss": 0.1031, "step": 109290 }, { "epoch": 2.548488822826499, "grad_norm": 2.5380961894989014, "learning_rate": 1.5086582105328608e-07, "loss": 0.1018, "step": 109300 }, { "epoch": 2.5487219842033166, "grad_norm": 1.3654533624649048, "learning_rate": 1.507880992352173e-07, "loss": 0.099, "step": 109310 }, { "epoch": 2.548955145580135, "grad_norm": 1.3769723176956177, "learning_rate": 1.5071037741714853e-07, "loss": 0.1066, "step": 109320 }, { "epoch": 2.5491883069569523, "grad_norm": 2.5647966861724854, "learning_rate": 1.5063265559907976e-07, "loss": 0.1006, "step": 109330 }, { "epoch": 2.5494214683337706, "grad_norm": 3.2030110359191895, "learning_rate": 1.50554933781011e-07, "loss": 0.0852, "step": 109340 }, { "epoch": 2.5496546297105884, "grad_norm": 1.2389509677886963, "learning_rate": 1.504772119629422e-07, "loss": 0.0972, "step": 109350 }, { "epoch": 2.5498877910874063, "grad_norm": 1.2909883260726929, 
"learning_rate": 1.5039949014487347e-07, "loss": 0.1057, "step": 109360 }, { "epoch": 2.5501209524642245, "grad_norm": 2.170344829559326, "learning_rate": 1.5032176832680468e-07, "loss": 0.0938, "step": 109370 }, { "epoch": 2.5503541138410424, "grad_norm": 2.190645694732666, "learning_rate": 1.5024404650873592e-07, "loss": 0.1102, "step": 109380 }, { "epoch": 2.55058727521786, "grad_norm": 1.438286304473877, "learning_rate": 1.5016632469066715e-07, "loss": 0.1133, "step": 109390 }, { "epoch": 2.550820436594678, "grad_norm": 1.5470691919326782, "learning_rate": 1.500886028725984e-07, "loss": 0.1037, "step": 109400 }, { "epoch": 2.551053597971496, "grad_norm": 1.2600123882293701, "learning_rate": 1.500108810545296e-07, "loss": 0.0965, "step": 109410 }, { "epoch": 2.551286759348314, "grad_norm": 1.3372249603271484, "learning_rate": 1.4993315923646086e-07, "loss": 0.118, "step": 109420 }, { "epoch": 2.551519920725132, "grad_norm": 1.6659029722213745, "learning_rate": 1.4985543741839207e-07, "loss": 0.1182, "step": 109430 }, { "epoch": 2.55175308210195, "grad_norm": 1.0768508911132812, "learning_rate": 1.4977771560032333e-07, "loss": 0.1088, "step": 109440 }, { "epoch": 2.5519862434787677, "grad_norm": 2.0317299365997314, "learning_rate": 1.4969999378225454e-07, "loss": 0.1175, "step": 109450 }, { "epoch": 2.5522194048555855, "grad_norm": 1.4199631214141846, "learning_rate": 1.4962227196418577e-07, "loss": 0.0941, "step": 109460 }, { "epoch": 2.552452566232404, "grad_norm": 2.3499464988708496, "learning_rate": 1.49544550146117e-07, "loss": 0.1195, "step": 109470 }, { "epoch": 2.5526857276092216, "grad_norm": 0.9662525057792664, "learning_rate": 1.4946682832804825e-07, "loss": 0.1105, "step": 109480 }, { "epoch": 2.5529188889860395, "grad_norm": 1.4474709033966064, "learning_rate": 1.4938910650997946e-07, "loss": 0.1137, "step": 109490 }, { "epoch": 2.5531520503628573, "grad_norm": 1.070896029472351, "learning_rate": 1.4931138469191072e-07, "loss": 0.105, "step": 109500 
}, { "epoch": 2.553385211739675, "grad_norm": 1.1888054609298706, "learning_rate": 1.4923366287384193e-07, "loss": 0.0966, "step": 109510 }, { "epoch": 2.5536183731164934, "grad_norm": 1.6546785831451416, "learning_rate": 1.4915594105577316e-07, "loss": 0.1025, "step": 109520 }, { "epoch": 2.5538515344933113, "grad_norm": 1.41734778881073, "learning_rate": 1.490782192377044e-07, "loss": 0.0909, "step": 109530 }, { "epoch": 2.554084695870129, "grad_norm": 1.8359445333480835, "learning_rate": 1.4900049741963563e-07, "loss": 0.1076, "step": 109540 }, { "epoch": 2.554317857246947, "grad_norm": 2.287904739379883, "learning_rate": 1.4892277560156684e-07, "loss": 0.0932, "step": 109550 }, { "epoch": 2.5545510186237648, "grad_norm": 1.7187838554382324, "learning_rate": 1.488450537834981e-07, "loss": 0.0989, "step": 109560 }, { "epoch": 2.554784180000583, "grad_norm": 1.519388198852539, "learning_rate": 1.4876733196542931e-07, "loss": 0.1027, "step": 109570 }, { "epoch": 2.555017341377401, "grad_norm": 1.3775525093078613, "learning_rate": 1.4868961014736055e-07, "loss": 0.0984, "step": 109580 }, { "epoch": 2.5552505027542187, "grad_norm": 1.5030425786972046, "learning_rate": 1.4861188832929179e-07, "loss": 0.1067, "step": 109590 }, { "epoch": 2.5554836641310366, "grad_norm": 1.7733787298202515, "learning_rate": 1.4853416651122302e-07, "loss": 0.111, "step": 109600 }, { "epoch": 2.5557168255078544, "grad_norm": 1.7164194583892822, "learning_rate": 1.4845644469315423e-07, "loss": 0.0999, "step": 109610 }, { "epoch": 2.5559499868846727, "grad_norm": 2.209688901901245, "learning_rate": 1.483787228750855e-07, "loss": 0.1035, "step": 109620 }, { "epoch": 2.5561831482614905, "grad_norm": 1.9286266565322876, "learning_rate": 1.483010010570167e-07, "loss": 0.098, "step": 109630 }, { "epoch": 2.5564163096383083, "grad_norm": 1.3452112674713135, "learning_rate": 1.4822327923894796e-07, "loss": 0.1079, "step": 109640 }, { "epoch": 2.556649471015126, "grad_norm": 1.1362751722335815, 
"learning_rate": 1.4814555742087917e-07, "loss": 0.1059, "step": 109650 }, { "epoch": 2.556882632391944, "grad_norm": 1.2845008373260498, "learning_rate": 1.480678356028104e-07, "loss": 0.0988, "step": 109660 }, { "epoch": 2.5571157937687623, "grad_norm": 2.297564744949341, "learning_rate": 1.4799011378474164e-07, "loss": 0.1068, "step": 109670 }, { "epoch": 2.55734895514558, "grad_norm": 1.7802810668945312, "learning_rate": 1.4791239196667288e-07, "loss": 0.0944, "step": 109680 }, { "epoch": 2.557582116522398, "grad_norm": 2.1519320011138916, "learning_rate": 1.478346701486041e-07, "loss": 0.1045, "step": 109690 }, { "epoch": 2.5578152778992163, "grad_norm": 2.413235902786255, "learning_rate": 1.4775694833053535e-07, "loss": 0.1029, "step": 109700 }, { "epoch": 2.5580484392760336, "grad_norm": 3.1746129989624023, "learning_rate": 1.4767922651246656e-07, "loss": 0.0937, "step": 109710 }, { "epoch": 2.558281600652852, "grad_norm": 1.4623939990997314, "learning_rate": 1.476015046943978e-07, "loss": 0.1033, "step": 109720 }, { "epoch": 2.5585147620296698, "grad_norm": 2.1264994144439697, "learning_rate": 1.4752378287632903e-07, "loss": 0.104, "step": 109730 }, { "epoch": 2.5587479234064876, "grad_norm": 2.3428096771240234, "learning_rate": 1.4744606105826027e-07, "loss": 0.1105, "step": 109740 }, { "epoch": 2.558981084783306, "grad_norm": 1.210054636001587, "learning_rate": 1.473683392401915e-07, "loss": 0.1044, "step": 109750 }, { "epoch": 2.5592142461601237, "grad_norm": 2.9349215030670166, "learning_rate": 1.4729061742212274e-07, "loss": 0.1154, "step": 109760 }, { "epoch": 2.5594474075369416, "grad_norm": 2.2381410598754883, "learning_rate": 1.4721289560405398e-07, "loss": 0.0952, "step": 109770 }, { "epoch": 2.5596805689137594, "grad_norm": 1.3481779098510742, "learning_rate": 1.4713517378598518e-07, "loss": 0.0997, "step": 109780 }, { "epoch": 2.5599137302905772, "grad_norm": 1.7806357145309448, "learning_rate": 1.4705745196791645e-07, "loss": 0.087, "step": 
109790 }, { "epoch": 2.5601468916673955, "grad_norm": 1.3076050281524658, "learning_rate": 1.4697973014984766e-07, "loss": 0.1069, "step": 109800 }, { "epoch": 2.5603800530442133, "grad_norm": 1.580938458442688, "learning_rate": 1.469020083317789e-07, "loss": 0.1042, "step": 109810 }, { "epoch": 2.560613214421031, "grad_norm": 3.2288479804992676, "learning_rate": 1.4682428651371013e-07, "loss": 0.1083, "step": 109820 }, { "epoch": 2.560846375797849, "grad_norm": 1.2815145254135132, "learning_rate": 1.4674656469564136e-07, "loss": 0.1089, "step": 109830 }, { "epoch": 2.561079537174667, "grad_norm": 1.6024768352508545, "learning_rate": 1.4666884287757257e-07, "loss": 0.1126, "step": 109840 }, { "epoch": 2.561312698551485, "grad_norm": 2.1231069564819336, "learning_rate": 1.4659112105950383e-07, "loss": 0.0985, "step": 109850 }, { "epoch": 2.561545859928303, "grad_norm": 1.439245343208313, "learning_rate": 1.4651339924143504e-07, "loss": 0.0897, "step": 109860 }, { "epoch": 2.561779021305121, "grad_norm": 1.447434425354004, "learning_rate": 1.464356774233663e-07, "loss": 0.1055, "step": 109870 }, { "epoch": 2.5620121826819386, "grad_norm": 1.3105376958847046, "learning_rate": 1.4635795560529751e-07, "loss": 0.0935, "step": 109880 }, { "epoch": 2.5622453440587565, "grad_norm": 1.3527460098266602, "learning_rate": 1.4628023378722875e-07, "loss": 0.1266, "step": 109890 }, { "epoch": 2.5624785054355748, "grad_norm": 1.777950644493103, "learning_rate": 1.4620251196915999e-07, "loss": 0.1019, "step": 109900 }, { "epoch": 2.5627116668123926, "grad_norm": 1.9972617626190186, "learning_rate": 1.4612479015109122e-07, "loss": 0.109, "step": 109910 }, { "epoch": 2.5629448281892104, "grad_norm": 1.2512893676757812, "learning_rate": 1.4604706833302243e-07, "loss": 0.1119, "step": 109920 }, { "epoch": 2.5631779895660283, "grad_norm": 1.4100521802902222, "learning_rate": 1.459693465149537e-07, "loss": 0.1009, "step": 109930 }, { "epoch": 2.563411150942846, "grad_norm": 
1.384021520614624, "learning_rate": 1.458916246968849e-07, "loss": 0.0994, "step": 109940 }, { "epoch": 2.5636443123196644, "grad_norm": 1.7520643472671509, "learning_rate": 1.4581390287881614e-07, "loss": 0.1095, "step": 109950 }, { "epoch": 2.5638774736964822, "grad_norm": 1.6147428750991821, "learning_rate": 1.4573618106074737e-07, "loss": 0.1077, "step": 109960 }, { "epoch": 2.5641106350733, "grad_norm": 1.3746620416641235, "learning_rate": 1.456584592426786e-07, "loss": 0.1045, "step": 109970 }, { "epoch": 2.564343796450118, "grad_norm": 1.1488673686981201, "learning_rate": 1.4558073742460982e-07, "loss": 0.1014, "step": 109980 }, { "epoch": 2.5645769578269357, "grad_norm": 2.6866841316223145, "learning_rate": 1.4550301560654108e-07, "loss": 0.1069, "step": 109990 }, { "epoch": 2.564810119203754, "grad_norm": 1.0192116498947144, "learning_rate": 1.454252937884723e-07, "loss": 0.1003, "step": 110000 }, { "epoch": 2.564810119203754, "eval_accuracy": 0.9489917034533788, "eval_f1": 0.9635852287290507, "eval_loss": 0.13291335105895996, "eval_runtime": 3909.6927, "eval_samples_per_second": 468.047, "eval_steps_per_second": 58.506, "step": 110000 }, { "epoch": 2.565043280580572, "grad_norm": 2.1091229915618896, "learning_rate": 1.4534757197040353e-07, "loss": 0.1094, "step": 110010 }, { "epoch": 2.5652764419573897, "grad_norm": 1.3195877075195312, "learning_rate": 1.4526985015233476e-07, "loss": 0.1068, "step": 110020 }, { "epoch": 2.5655096033342075, "grad_norm": 1.2480733394622803, "learning_rate": 1.45192128334266e-07, "loss": 0.1015, "step": 110030 }, { "epoch": 2.5657427647110254, "grad_norm": 2.425626277923584, "learning_rate": 1.451144065161972e-07, "loss": 0.0976, "step": 110040 }, { "epoch": 2.5659759260878436, "grad_norm": 1.4553284645080566, "learning_rate": 1.4503668469812847e-07, "loss": 0.1075, "step": 110050 }, { "epoch": 2.5662090874646615, "grad_norm": 2.6621792316436768, "learning_rate": 1.4495896288005968e-07, "loss": 0.0969, "step": 110060 }, { 
"epoch": 2.5664422488414793, "grad_norm": 1.2922886610031128, "learning_rate": 1.4488124106199094e-07, "loss": 0.1162, "step": 110070 }, { "epoch": 2.5666754102182976, "grad_norm": 1.68051278591156, "learning_rate": 1.4480351924392215e-07, "loss": 0.1135, "step": 110080 }, { "epoch": 2.5669085715951154, "grad_norm": 1.302725076675415, "learning_rate": 1.4472579742585338e-07, "loss": 0.1099, "step": 110090 }, { "epoch": 2.5671417329719333, "grad_norm": 1.4775636196136475, "learning_rate": 1.4464807560778462e-07, "loss": 0.11, "step": 110100 }, { "epoch": 2.567374894348751, "grad_norm": 2.556851625442505, "learning_rate": 1.4457035378971586e-07, "loss": 0.1087, "step": 110110 }, { "epoch": 2.567608055725569, "grad_norm": 1.709062099456787, "learning_rate": 1.4449263197164707e-07, "loss": 0.104, "step": 110120 }, { "epoch": 2.567841217102387, "grad_norm": 1.9616649150848389, "learning_rate": 1.4441491015357833e-07, "loss": 0.106, "step": 110130 }, { "epoch": 2.568074378479205, "grad_norm": 1.1445777416229248, "learning_rate": 1.4433718833550954e-07, "loss": 0.0993, "step": 110140 }, { "epoch": 2.568307539856023, "grad_norm": 1.0783393383026123, "learning_rate": 1.4425946651744077e-07, "loss": 0.0982, "step": 110150 }, { "epoch": 2.5685407012328407, "grad_norm": 1.6562989950180054, "learning_rate": 1.44181744699372e-07, "loss": 0.097, "step": 110160 }, { "epoch": 2.5687738626096586, "grad_norm": 2.51743745803833, "learning_rate": 1.4410402288130324e-07, "loss": 0.0978, "step": 110170 }, { "epoch": 2.569007023986477, "grad_norm": 1.2888516187667847, "learning_rate": 1.4402630106323445e-07, "loss": 0.1102, "step": 110180 }, { "epoch": 2.5692401853632947, "grad_norm": 1.121195912361145, "learning_rate": 1.4394857924516571e-07, "loss": 0.0988, "step": 110190 }, { "epoch": 2.5694733467401125, "grad_norm": 2.4032504558563232, "learning_rate": 1.4387085742709692e-07, "loss": 0.0962, "step": 110200 }, { "epoch": 2.5697065081169304, "grad_norm": 1.3880313634872437, 
"learning_rate": 1.4379313560902816e-07, "loss": 0.0957, "step": 110210 }, { "epoch": 2.569939669493748, "grad_norm": 1.092873454093933, "learning_rate": 1.437154137909594e-07, "loss": 0.1116, "step": 110220 }, { "epoch": 2.5701728308705665, "grad_norm": 2.069528102874756, "learning_rate": 1.4363769197289063e-07, "loss": 0.1003, "step": 110230 }, { "epoch": 2.5704059922473843, "grad_norm": 1.7069882154464722, "learning_rate": 1.4355997015482184e-07, "loss": 0.1064, "step": 110240 }, { "epoch": 2.570639153624202, "grad_norm": 1.148699164390564, "learning_rate": 1.434822483367531e-07, "loss": 0.1071, "step": 110250 }, { "epoch": 2.57087231500102, "grad_norm": 1.3059359788894653, "learning_rate": 1.434045265186843e-07, "loss": 0.0992, "step": 110260 }, { "epoch": 2.571105476377838, "grad_norm": 2.94079327583313, "learning_rate": 1.4332680470061557e-07, "loss": 0.0977, "step": 110270 }, { "epoch": 2.571338637754656, "grad_norm": 2.4431934356689453, "learning_rate": 1.4324908288254678e-07, "loss": 0.1096, "step": 110280 }, { "epoch": 2.571571799131474, "grad_norm": 1.7116801738739014, "learning_rate": 1.4317136106447802e-07, "loss": 0.1027, "step": 110290 }, { "epoch": 2.5718049605082918, "grad_norm": 1.5848000049591064, "learning_rate": 1.4309363924640925e-07, "loss": 0.1052, "step": 110300 }, { "epoch": 2.5720381218851096, "grad_norm": 3.085376739501953, "learning_rate": 1.430159174283405e-07, "loss": 0.0997, "step": 110310 }, { "epoch": 2.5722712832619274, "grad_norm": 1.8223031759262085, "learning_rate": 1.429381956102717e-07, "loss": 0.1023, "step": 110320 }, { "epoch": 2.5725044446387457, "grad_norm": 1.8018109798431396, "learning_rate": 1.4286047379220296e-07, "loss": 0.0939, "step": 110330 }, { "epoch": 2.5727376060155636, "grad_norm": 2.1994593143463135, "learning_rate": 1.4278275197413417e-07, "loss": 0.1052, "step": 110340 }, { "epoch": 2.5729707673923814, "grad_norm": 1.3738539218902588, "learning_rate": 1.427050301560654e-07, "loss": 0.0979, "step": 110350 
}, { "epoch": 2.5732039287691992, "grad_norm": 1.8497222661972046, "learning_rate": 1.4262730833799664e-07, "loss": 0.1162, "step": 110360 }, { "epoch": 2.573437090146017, "grad_norm": 1.3525229692459106, "learning_rate": 1.4254958651992788e-07, "loss": 0.1032, "step": 110370 }, { "epoch": 2.5736702515228354, "grad_norm": 1.7367123365402222, "learning_rate": 1.424718647018591e-07, "loss": 0.0888, "step": 110380 }, { "epoch": 2.573903412899653, "grad_norm": 1.6569029092788696, "learning_rate": 1.4239414288379035e-07, "loss": 0.0958, "step": 110390 }, { "epoch": 2.574136574276471, "grad_norm": 1.797039270401001, "learning_rate": 1.4231642106572156e-07, "loss": 0.1079, "step": 110400 }, { "epoch": 2.5743697356532893, "grad_norm": 2.1512460708618164, "learning_rate": 1.422386992476528e-07, "loss": 0.1021, "step": 110410 }, { "epoch": 2.5746028970301067, "grad_norm": 2.5820047855377197, "learning_rate": 1.4216097742958403e-07, "loss": 0.0977, "step": 110420 }, { "epoch": 2.574836058406925, "grad_norm": 2.151188373565674, "learning_rate": 1.4208325561151527e-07, "loss": 0.112, "step": 110430 }, { "epoch": 2.575069219783743, "grad_norm": 2.9736104011535645, "learning_rate": 1.4200553379344647e-07, "loss": 0.1041, "step": 110440 }, { "epoch": 2.5753023811605606, "grad_norm": 3.143780469894409, "learning_rate": 1.4192781197537774e-07, "loss": 0.1083, "step": 110450 }, { "epoch": 2.575535542537379, "grad_norm": 1.3829214572906494, "learning_rate": 1.4185009015730895e-07, "loss": 0.1025, "step": 110460 }, { "epoch": 2.5757687039141968, "grad_norm": 2.8174707889556885, "learning_rate": 1.417723683392402e-07, "loss": 0.1116, "step": 110470 }, { "epoch": 2.5760018652910146, "grad_norm": 1.9128526449203491, "learning_rate": 1.4169464652117142e-07, "loss": 0.105, "step": 110480 }, { "epoch": 2.5762350266678324, "grad_norm": 1.9891115427017212, "learning_rate": 1.4161692470310265e-07, "loss": 0.0985, "step": 110490 }, { "epoch": 2.5764681880446503, "grad_norm": 1.2774015665054321, 
"learning_rate": 1.415392028850339e-07, "loss": 0.1077, "step": 110500 }, { "epoch": 2.5767013494214686, "grad_norm": 2.709892511367798, "learning_rate": 1.4146148106696512e-07, "loss": 0.104, "step": 110510 }, { "epoch": 2.5769345107982864, "grad_norm": 1.6565933227539062, "learning_rate": 1.4138375924889633e-07, "loss": 0.1125, "step": 110520 }, { "epoch": 2.5771676721751042, "grad_norm": 1.8836252689361572, "learning_rate": 1.413060374308276e-07, "loss": 0.0986, "step": 110530 }, { "epoch": 2.577400833551922, "grad_norm": 1.6250163316726685, "learning_rate": 1.412283156127588e-07, "loss": 0.0928, "step": 110540 }, { "epoch": 2.57763399492874, "grad_norm": 1.5099111795425415, "learning_rate": 1.4115059379469004e-07, "loss": 0.1017, "step": 110550 }, { "epoch": 2.577867156305558, "grad_norm": 1.1929607391357422, "learning_rate": 1.4107287197662128e-07, "loss": 0.0969, "step": 110560 }, { "epoch": 2.578100317682376, "grad_norm": 1.7229655981063843, "learning_rate": 1.409951501585525e-07, "loss": 0.1064, "step": 110570 }, { "epoch": 2.578333479059194, "grad_norm": 1.788468599319458, "learning_rate": 1.4091742834048372e-07, "loss": 0.0989, "step": 110580 }, { "epoch": 2.5785666404360117, "grad_norm": 1.6793292760849, "learning_rate": 1.4083970652241498e-07, "loss": 0.1045, "step": 110590 }, { "epoch": 2.5787998018128295, "grad_norm": 1.1809028387069702, "learning_rate": 1.407619847043462e-07, "loss": 0.1104, "step": 110600 }, { "epoch": 2.579032963189648, "grad_norm": 1.5077526569366455, "learning_rate": 1.4068426288627743e-07, "loss": 0.0864, "step": 110610 }, { "epoch": 2.5792661245664656, "grad_norm": 3.320603847503662, "learning_rate": 1.4060654106820866e-07, "loss": 0.0902, "step": 110620 }, { "epoch": 2.5794992859432835, "grad_norm": 1.7170664072036743, "learning_rate": 1.405288192501399e-07, "loss": 0.1109, "step": 110630 }, { "epoch": 2.5797324473201013, "grad_norm": 0.8732591271400452, "learning_rate": 1.404510974320711e-07, "loss": 0.0986, "step": 110640 }, 
{ "epoch": 2.579965608696919, "grad_norm": 1.7252223491668701, "learning_rate": 1.4037337561400237e-07, "loss": 0.1131, "step": 110650 }, { "epoch": 2.5801987700737374, "grad_norm": 2.740997552871704, "learning_rate": 1.4029565379593358e-07, "loss": 0.1142, "step": 110660 }, { "epoch": 2.5804319314505553, "grad_norm": 1.412933349609375, "learning_rate": 1.4021793197786482e-07, "loss": 0.0998, "step": 110670 }, { "epoch": 2.580665092827373, "grad_norm": 1.1593753099441528, "learning_rate": 1.4014021015979605e-07, "loss": 0.0949, "step": 110680 }, { "epoch": 2.580898254204191, "grad_norm": 1.9951858520507812, "learning_rate": 1.400624883417273e-07, "loss": 0.1033, "step": 110690 }, { "epoch": 2.581131415581009, "grad_norm": 2.2306647300720215, "learning_rate": 1.3998476652365852e-07, "loss": 0.1005, "step": 110700 }, { "epoch": 2.581364576957827, "grad_norm": 1.9527575969696045, "learning_rate": 1.3990704470558976e-07, "loss": 0.1022, "step": 110710 }, { "epoch": 2.581597738334645, "grad_norm": 1.5187097787857056, "learning_rate": 1.3982932288752097e-07, "loss": 0.1088, "step": 110720 }, { "epoch": 2.5818308997114627, "grad_norm": 2.823207139968872, "learning_rate": 1.3975160106945223e-07, "loss": 0.1057, "step": 110730 }, { "epoch": 2.5820640610882806, "grad_norm": 1.3664982318878174, "learning_rate": 1.3967387925138344e-07, "loss": 0.1093, "step": 110740 }, { "epoch": 2.5822972224650984, "grad_norm": 1.2455755472183228, "learning_rate": 1.3959615743331467e-07, "loss": 0.1003, "step": 110750 }, { "epoch": 2.5825303838419167, "grad_norm": 1.8656877279281616, "learning_rate": 1.395184356152459e-07, "loss": 0.1099, "step": 110760 }, { "epoch": 2.5827635452187345, "grad_norm": 2.4522016048431396, "learning_rate": 1.3944071379717715e-07, "loss": 0.1003, "step": 110770 }, { "epoch": 2.5829967065955524, "grad_norm": 1.4978687763214111, "learning_rate": 1.3936299197910836e-07, "loss": 0.1073, "step": 110780 }, { "epoch": 2.5832298679723706, "grad_norm": 1.9795600175857544, 
"learning_rate": 1.3928527016103962e-07, "loss": 0.1063, "step": 110790 }, { "epoch": 2.583463029349188, "grad_norm": 1.4791115522384644, "learning_rate": 1.3920754834297083e-07, "loss": 0.1073, "step": 110800 }, { "epoch": 2.5836961907260063, "grad_norm": 2.407123327255249, "learning_rate": 1.3912982652490206e-07, "loss": 0.1019, "step": 110810 }, { "epoch": 2.583929352102824, "grad_norm": 1.1846470832824707, "learning_rate": 1.390521047068333e-07, "loss": 0.1021, "step": 110820 }, { "epoch": 2.584162513479642, "grad_norm": 2.7006876468658447, "learning_rate": 1.3897438288876453e-07, "loss": 0.1054, "step": 110830 }, { "epoch": 2.5843956748564603, "grad_norm": 1.3497707843780518, "learning_rate": 1.3889666107069574e-07, "loss": 0.0993, "step": 110840 }, { "epoch": 2.584628836233278, "grad_norm": 1.647120714187622, "learning_rate": 1.38818939252627e-07, "loss": 0.0973, "step": 110850 }, { "epoch": 2.584861997610096, "grad_norm": 1.8322447538375854, "learning_rate": 1.3874121743455821e-07, "loss": 0.1014, "step": 110860 }, { "epoch": 2.5850951589869138, "grad_norm": 1.5433640480041504, "learning_rate": 1.3866349561648945e-07, "loss": 0.0987, "step": 110870 }, { "epoch": 2.5853283203637316, "grad_norm": 1.1698638200759888, "learning_rate": 1.3858577379842069e-07, "loss": 0.1053, "step": 110880 }, { "epoch": 2.58556148174055, "grad_norm": 1.8656479120254517, "learning_rate": 1.3850805198035192e-07, "loss": 0.1104, "step": 110890 }, { "epoch": 2.5857946431173677, "grad_norm": 1.317184567451477, "learning_rate": 1.3843033016228313e-07, "loss": 0.1028, "step": 110900 }, { "epoch": 2.5860278044941856, "grad_norm": 1.6822677850723267, "learning_rate": 1.383526083442144e-07, "loss": 0.1081, "step": 110910 }, { "epoch": 2.5862609658710034, "grad_norm": 2.2706308364868164, "learning_rate": 1.382748865261456e-07, "loss": 0.1044, "step": 110920 }, { "epoch": 2.5864941272478212, "grad_norm": 2.520956039428711, "learning_rate": 1.3819716470807686e-07, "loss": 0.1074, "step": 
110930 }, { "epoch": 2.5867272886246395, "grad_norm": 2.0913078784942627, "learning_rate": 1.3811944289000807e-07, "loss": 0.1093, "step": 110940 }, { "epoch": 2.5869604500014574, "grad_norm": 1.2209193706512451, "learning_rate": 1.380417210719393e-07, "loss": 0.1148, "step": 110950 }, { "epoch": 2.587193611378275, "grad_norm": 2.6814873218536377, "learning_rate": 1.3796399925387054e-07, "loss": 0.1027, "step": 110960 }, { "epoch": 2.587426772755093, "grad_norm": 1.0797975063323975, "learning_rate": 1.3788627743580178e-07, "loss": 0.0952, "step": 110970 }, { "epoch": 2.587659934131911, "grad_norm": 2.1599249839782715, "learning_rate": 1.37808555617733e-07, "loss": 0.0981, "step": 110980 }, { "epoch": 2.587893095508729, "grad_norm": 1.8783814907073975, "learning_rate": 1.3773083379966425e-07, "loss": 0.1013, "step": 110990 }, { "epoch": 2.588126256885547, "grad_norm": 1.1751012802124023, "learning_rate": 1.3765311198159546e-07, "loss": 0.1068, "step": 111000 }, { "epoch": 2.588359418262365, "grad_norm": 1.8573482036590576, "learning_rate": 1.375753901635267e-07, "loss": 0.0994, "step": 111010 }, { "epoch": 2.5885925796391827, "grad_norm": 1.3201534748077393, "learning_rate": 1.3749766834545793e-07, "loss": 0.1122, "step": 111020 }, { "epoch": 2.5888257410160005, "grad_norm": 1.5170495510101318, "learning_rate": 1.3741994652738917e-07, "loss": 0.095, "step": 111030 }, { "epoch": 2.5890589023928188, "grad_norm": 1.2662320137023926, "learning_rate": 1.3734222470932038e-07, "loss": 0.1065, "step": 111040 }, { "epoch": 2.5892920637696366, "grad_norm": 4.1307501792907715, "learning_rate": 1.3726450289125164e-07, "loss": 0.1005, "step": 111050 }, { "epoch": 2.5895252251464544, "grad_norm": 1.7904609441757202, "learning_rate": 1.3718678107318285e-07, "loss": 0.0941, "step": 111060 }, { "epoch": 2.5897583865232723, "grad_norm": 2.137153148651123, "learning_rate": 1.3710905925511408e-07, "loss": 0.1047, "step": 111070 }, { "epoch": 2.58999154790009, "grad_norm": 
1.389341115951538, "learning_rate": 1.3703133743704532e-07, "loss": 0.0844, "step": 111080 }, { "epoch": 2.5902247092769084, "grad_norm": 1.7730035781860352, "learning_rate": 1.3695361561897656e-07, "loss": 0.099, "step": 111090 }, { "epoch": 2.5904578706537262, "grad_norm": 1.5626906156539917, "learning_rate": 1.3687589380090776e-07, "loss": 0.1064, "step": 111100 }, { "epoch": 2.590691032030544, "grad_norm": 1.323441743850708, "learning_rate": 1.3679817198283903e-07, "loss": 0.1046, "step": 111110 }, { "epoch": 2.590924193407362, "grad_norm": 1.1942602396011353, "learning_rate": 1.3672045016477024e-07, "loss": 0.1108, "step": 111120 }, { "epoch": 2.5911573547841797, "grad_norm": 2.4508609771728516, "learning_rate": 1.366427283467015e-07, "loss": 0.106, "step": 111130 }, { "epoch": 2.591390516160998, "grad_norm": 1.6203622817993164, "learning_rate": 1.365650065286327e-07, "loss": 0.0924, "step": 111140 }, { "epoch": 2.591623677537816, "grad_norm": 1.761730670928955, "learning_rate": 1.3648728471056394e-07, "loss": 0.1076, "step": 111150 }, { "epoch": 2.5918568389146337, "grad_norm": 1.952412486076355, "learning_rate": 1.3640956289249518e-07, "loss": 0.0955, "step": 111160 }, { "epoch": 2.592090000291452, "grad_norm": 1.6180775165557861, "learning_rate": 1.3633184107442641e-07, "loss": 0.1084, "step": 111170 }, { "epoch": 2.59232316166827, "grad_norm": 1.1375254392623901, "learning_rate": 1.3625411925635762e-07, "loss": 0.1019, "step": 111180 }, { "epoch": 2.5925563230450877, "grad_norm": 1.746276617050171, "learning_rate": 1.3617639743828889e-07, "loss": 0.1117, "step": 111190 }, { "epoch": 2.5927894844219055, "grad_norm": 2.3571455478668213, "learning_rate": 1.360986756202201e-07, "loss": 0.0995, "step": 111200 }, { "epoch": 2.5930226457987233, "grad_norm": 2.1874911785125732, "learning_rate": 1.3602095380215133e-07, "loss": 0.0912, "step": 111210 }, { "epoch": 2.5932558071755416, "grad_norm": 2.3847827911376953, "learning_rate": 1.3594323198408257e-07, "loss": 
0.1064, "step": 111220 }, { "epoch": 2.5934889685523594, "grad_norm": 1.620090126991272, "learning_rate": 1.358655101660138e-07, "loss": 0.1077, "step": 111230 }, { "epoch": 2.5937221299291773, "grad_norm": 1.8056327104568481, "learning_rate": 1.35787788347945e-07, "loss": 0.0911, "step": 111240 }, { "epoch": 2.593955291305995, "grad_norm": 1.3226643800735474, "learning_rate": 1.3571006652987627e-07, "loss": 0.1011, "step": 111250 }, { "epoch": 2.594188452682813, "grad_norm": 2.334462881088257, "learning_rate": 1.3563234471180748e-07, "loss": 0.1166, "step": 111260 }, { "epoch": 2.5944216140596312, "grad_norm": 1.6962445974349976, "learning_rate": 1.3555462289373872e-07, "loss": 0.1037, "step": 111270 }, { "epoch": 2.594654775436449, "grad_norm": 1.3103657960891724, "learning_rate": 1.3547690107566995e-07, "loss": 0.1009, "step": 111280 }, { "epoch": 2.594887936813267, "grad_norm": 1.4839515686035156, "learning_rate": 1.353991792576012e-07, "loss": 0.1059, "step": 111290 }, { "epoch": 2.5951210981900847, "grad_norm": 1.3721909523010254, "learning_rate": 1.353214574395324e-07, "loss": 0.0968, "step": 111300 }, { "epoch": 2.5953542595669026, "grad_norm": 2.094139575958252, "learning_rate": 1.3524373562146366e-07, "loss": 0.0952, "step": 111310 }, { "epoch": 2.595587420943721, "grad_norm": 1.2171180248260498, "learning_rate": 1.3516601380339487e-07, "loss": 0.1111, "step": 111320 }, { "epoch": 2.5958205823205387, "grad_norm": 1.3204212188720703, "learning_rate": 1.3508829198532613e-07, "loss": 0.1034, "step": 111330 }, { "epoch": 2.5960537436973565, "grad_norm": 1.3710192441940308, "learning_rate": 1.3501057016725734e-07, "loss": 0.1026, "step": 111340 }, { "epoch": 2.5962869050741744, "grad_norm": 1.400743007659912, "learning_rate": 1.3493284834918858e-07, "loss": 0.1022, "step": 111350 }, { "epoch": 2.596520066450992, "grad_norm": 4.653966903686523, "learning_rate": 1.348551265311198e-07, "loss": 0.1153, "step": 111360 }, { "epoch": 2.5967532278278105, "grad_norm": 
1.3532220125198364, "learning_rate": 1.3477740471305105e-07, "loss": 0.0997, "step": 111370 }, { "epoch": 2.5969863892046283, "grad_norm": 1.156098484992981, "learning_rate": 1.3469968289498226e-07, "loss": 0.1049, "step": 111380 }, { "epoch": 2.597219550581446, "grad_norm": 2.6031532287597656, "learning_rate": 1.3462196107691352e-07, "loss": 0.0997, "step": 111390 }, { "epoch": 2.597452711958264, "grad_norm": 2.5804696083068848, "learning_rate": 1.3454423925884473e-07, "loss": 0.0979, "step": 111400 }, { "epoch": 2.597685873335082, "grad_norm": 1.0354965925216675, "learning_rate": 1.3446651744077596e-07, "loss": 0.0997, "step": 111410 }, { "epoch": 2.5979190347119, "grad_norm": 1.1910887956619263, "learning_rate": 1.343887956227072e-07, "loss": 0.1088, "step": 111420 }, { "epoch": 2.598152196088718, "grad_norm": 3.0507125854492188, "learning_rate": 1.3431107380463844e-07, "loss": 0.1077, "step": 111430 }, { "epoch": 2.598385357465536, "grad_norm": 2.8137357234954834, "learning_rate": 1.3423335198656965e-07, "loss": 0.0996, "step": 111440 }, { "epoch": 2.5986185188423536, "grad_norm": 2.6182024478912354, "learning_rate": 1.341556301685009e-07, "loss": 0.0992, "step": 111450 }, { "epoch": 2.5988516802191715, "grad_norm": 1.6546709537506104, "learning_rate": 1.3407790835043212e-07, "loss": 0.1067, "step": 111460 }, { "epoch": 2.5990848415959897, "grad_norm": 1.9567747116088867, "learning_rate": 1.3400018653236335e-07, "loss": 0.1076, "step": 111470 }, { "epoch": 2.5993180029728076, "grad_norm": 1.6811389923095703, "learning_rate": 1.339224647142946e-07, "loss": 0.114, "step": 111480 }, { "epoch": 2.5995511643496254, "grad_norm": 1.7860568761825562, "learning_rate": 1.3384474289622582e-07, "loss": 0.0978, "step": 111490 }, { "epoch": 2.5997843257264437, "grad_norm": 1.1466463804244995, "learning_rate": 1.3376702107815703e-07, "loss": 0.1098, "step": 111500 }, { "epoch": 2.600017487103261, "grad_norm": 1.849417805671692, "learning_rate": 1.336892992600883e-07, "loss": 
0.0951, "step": 111510 }, { "epoch": 2.6002506484800794, "grad_norm": 1.1400361061096191, "learning_rate": 1.336115774420195e-07, "loss": 0.0968, "step": 111520 }, { "epoch": 2.600483809856897, "grad_norm": 3.3256399631500244, "learning_rate": 1.3353385562395074e-07, "loss": 0.1106, "step": 111530 }, { "epoch": 2.600716971233715, "grad_norm": 1.4941574335098267, "learning_rate": 1.3345613380588198e-07, "loss": 0.0989, "step": 111540 }, { "epoch": 2.6009501326105333, "grad_norm": 1.2005239725112915, "learning_rate": 1.333784119878132e-07, "loss": 0.0852, "step": 111550 }, { "epoch": 2.601183293987351, "grad_norm": 1.1571706533432007, "learning_rate": 1.3330069016974445e-07, "loss": 0.105, "step": 111560 }, { "epoch": 2.601416455364169, "grad_norm": 1.8445998430252075, "learning_rate": 1.3322296835167568e-07, "loss": 0.1127, "step": 111570 }, { "epoch": 2.601649616740987, "grad_norm": 1.173627495765686, "learning_rate": 1.331452465336069e-07, "loss": 0.1046, "step": 111580 }, { "epoch": 2.6018827781178047, "grad_norm": 1.4265427589416504, "learning_rate": 1.3306752471553815e-07, "loss": 0.1024, "step": 111590 }, { "epoch": 2.602115939494623, "grad_norm": 2.104398488998413, "learning_rate": 1.3298980289746936e-07, "loss": 0.1056, "step": 111600 }, { "epoch": 2.602349100871441, "grad_norm": 1.5923634767532349, "learning_rate": 1.329120810794006e-07, "loss": 0.0954, "step": 111610 }, { "epoch": 2.6025822622482586, "grad_norm": 1.6227539777755737, "learning_rate": 1.3283435926133183e-07, "loss": 0.1031, "step": 111620 }, { "epoch": 2.6028154236250765, "grad_norm": 1.3880332708358765, "learning_rate": 1.3275663744326307e-07, "loss": 0.1019, "step": 111630 }, { "epoch": 2.6030485850018943, "grad_norm": 2.4728362560272217, "learning_rate": 1.3267891562519428e-07, "loss": 0.0951, "step": 111640 }, { "epoch": 2.6032817463787126, "grad_norm": 1.3578108549118042, "learning_rate": 1.3260119380712554e-07, "loss": 0.099, "step": 111650 }, { "epoch": 2.6035149077555304, 
"grad_norm": 2.1262214183807373, "learning_rate": 1.3252347198905675e-07, "loss": 0.1195, "step": 111660 }, { "epoch": 2.6037480691323482, "grad_norm": 1.2526105642318726, "learning_rate": 1.3244575017098799e-07, "loss": 0.1067, "step": 111670 }, { "epoch": 2.603981230509166, "grad_norm": 1.3344346284866333, "learning_rate": 1.3236802835291922e-07, "loss": 0.1159, "step": 111680 }, { "epoch": 2.604214391885984, "grad_norm": 1.317545771598816, "learning_rate": 1.3229030653485046e-07, "loss": 0.0952, "step": 111690 }, { "epoch": 2.604447553262802, "grad_norm": 1.3689476251602173, "learning_rate": 1.3221258471678167e-07, "loss": 0.1012, "step": 111700 }, { "epoch": 2.60468071463962, "grad_norm": 1.731671929359436, "learning_rate": 1.3213486289871293e-07, "loss": 0.0939, "step": 111710 }, { "epoch": 2.604913876016438, "grad_norm": 1.2904731035232544, "learning_rate": 1.3205714108064414e-07, "loss": 0.1014, "step": 111720 }, { "epoch": 2.6051470373932557, "grad_norm": 3.345367431640625, "learning_rate": 1.3197941926257537e-07, "loss": 0.0917, "step": 111730 }, { "epoch": 2.6053801987700735, "grad_norm": 1.731690764427185, "learning_rate": 1.319016974445066e-07, "loss": 0.1007, "step": 111740 }, { "epoch": 2.605613360146892, "grad_norm": 1.7189425230026245, "learning_rate": 1.3182397562643785e-07, "loss": 0.1124, "step": 111750 }, { "epoch": 2.6058465215237097, "grad_norm": 1.2704670429229736, "learning_rate": 1.3174625380836905e-07, "loss": 0.0954, "step": 111760 }, { "epoch": 2.6060796829005275, "grad_norm": 3.2982616424560547, "learning_rate": 1.3166853199030032e-07, "loss": 0.096, "step": 111770 }, { "epoch": 2.6063128442773453, "grad_norm": 2.014630079269409, "learning_rate": 1.3159081017223153e-07, "loss": 0.1027, "step": 111780 }, { "epoch": 2.606546005654163, "grad_norm": 1.2832366228103638, "learning_rate": 1.315130883541628e-07, "loss": 0.1038, "step": 111790 }, { "epoch": 2.6067791670309814, "grad_norm": 1.1983911991119385, "learning_rate": 
1.31435366536094e-07, "loss": 0.1264, "step": 111800 }, { "epoch": 2.6070123284077993, "grad_norm": 1.2462880611419678, "learning_rate": 1.3135764471802523e-07, "loss": 0.1, "step": 111810 }, { "epoch": 2.607245489784617, "grad_norm": 1.9817854166030884, "learning_rate": 1.3127992289995647e-07, "loss": 0.0924, "step": 111820 }, { "epoch": 2.607478651161435, "grad_norm": 1.3979182243347168, "learning_rate": 1.312022010818877e-07, "loss": 0.1023, "step": 111830 }, { "epoch": 2.607711812538253, "grad_norm": 1.4379457235336304, "learning_rate": 1.3112447926381891e-07, "loss": 0.1165, "step": 111840 }, { "epoch": 2.607944973915071, "grad_norm": 1.6259359121322632, "learning_rate": 1.3104675744575018e-07, "loss": 0.0998, "step": 111850 }, { "epoch": 2.608178135291889, "grad_norm": 2.1084792613983154, "learning_rate": 1.3096903562768139e-07, "loss": 0.0991, "step": 111860 }, { "epoch": 2.6084112966687067, "grad_norm": 4.270814895629883, "learning_rate": 1.3089131380961262e-07, "loss": 0.1036, "step": 111870 }, { "epoch": 2.608644458045525, "grad_norm": 2.3139090538024902, "learning_rate": 1.3081359199154386e-07, "loss": 0.1031, "step": 111880 }, { "epoch": 2.6088776194223424, "grad_norm": 1.6482986211776733, "learning_rate": 1.307358701734751e-07, "loss": 0.1049, "step": 111890 }, { "epoch": 2.6091107807991607, "grad_norm": 3.3337032794952393, "learning_rate": 1.306581483554063e-07, "loss": 0.1152, "step": 111900 }, { "epoch": 2.6093439421759785, "grad_norm": 1.2877354621887207, "learning_rate": 1.3058042653733756e-07, "loss": 0.0946, "step": 111910 }, { "epoch": 2.6095771035527964, "grad_norm": 1.6410733461380005, "learning_rate": 1.3050270471926877e-07, "loss": 0.1139, "step": 111920 }, { "epoch": 2.6098102649296147, "grad_norm": 2.891162157058716, "learning_rate": 1.304249829012e-07, "loss": 0.1032, "step": 111930 }, { "epoch": 2.6100434263064325, "grad_norm": 1.2980034351348877, "learning_rate": 1.3034726108313124e-07, "loss": 0.0912, "step": 111940 }, { "epoch": 
2.6102765876832503, "grad_norm": 1.3672831058502197, "learning_rate": 1.3026953926506248e-07, "loss": 0.1158, "step": 111950 }, { "epoch": 2.610509749060068, "grad_norm": 2.1862716674804688, "learning_rate": 1.301918174469937e-07, "loss": 0.099, "step": 111960 }, { "epoch": 2.610742910436886, "grad_norm": 3.1159186363220215, "learning_rate": 1.3011409562892495e-07, "loss": 0.1098, "step": 111970 }, { "epoch": 2.6109760718137043, "grad_norm": 2.303208351135254, "learning_rate": 1.3003637381085616e-07, "loss": 0.1036, "step": 111980 }, { "epoch": 2.611209233190522, "grad_norm": 3.722343921661377, "learning_rate": 1.2995865199278742e-07, "loss": 0.1021, "step": 111990 }, { "epoch": 2.61144239456734, "grad_norm": 1.5309603214263916, "learning_rate": 1.2988093017471863e-07, "loss": 0.1038, "step": 112000 }, { "epoch": 2.611675555944158, "grad_norm": 1.2343664169311523, "learning_rate": 1.2980320835664987e-07, "loss": 0.0989, "step": 112010 }, { "epoch": 2.6119087173209756, "grad_norm": 2.276027202606201, "learning_rate": 1.297254865385811e-07, "loss": 0.0937, "step": 112020 }, { "epoch": 2.612141878697794, "grad_norm": 1.6413984298706055, "learning_rate": 1.2964776472051234e-07, "loss": 0.0922, "step": 112030 }, { "epoch": 2.6123750400746117, "grad_norm": 2.4291346073150635, "learning_rate": 1.2957004290244355e-07, "loss": 0.1082, "step": 112040 }, { "epoch": 2.6126082014514296, "grad_norm": 1.6420127153396606, "learning_rate": 1.294923210843748e-07, "loss": 0.1112, "step": 112050 }, { "epoch": 2.6128413628282474, "grad_norm": 1.3639065027236938, "learning_rate": 1.2941459926630602e-07, "loss": 0.1014, "step": 112060 }, { "epoch": 2.6130745242050653, "grad_norm": 1.3836619853973389, "learning_rate": 1.2933687744823726e-07, "loss": 0.0834, "step": 112070 }, { "epoch": 2.6133076855818835, "grad_norm": 1.3738638162612915, "learning_rate": 1.292591556301685e-07, "loss": 0.1088, "step": 112080 }, { "epoch": 2.6135408469587014, "grad_norm": 2.940187692642212, "learning_rate": 
1.2918143381209973e-07, "loss": 0.1003, "step": 112090 }, { "epoch": 2.613774008335519, "grad_norm": 1.190896987915039, "learning_rate": 1.2910371199403094e-07, "loss": 0.1045, "step": 112100 }, { "epoch": 2.614007169712337, "grad_norm": 4.402031898498535, "learning_rate": 1.290259901759622e-07, "loss": 0.1061, "step": 112110 }, { "epoch": 2.614240331089155, "grad_norm": 2.34655499458313, "learning_rate": 1.289482683578934e-07, "loss": 0.1117, "step": 112120 }, { "epoch": 2.614473492465973, "grad_norm": 1.310596227645874, "learning_rate": 1.2887054653982464e-07, "loss": 0.0984, "step": 112130 }, { "epoch": 2.614706653842791, "grad_norm": 1.2475818395614624, "learning_rate": 1.2879282472175588e-07, "loss": 0.1096, "step": 112140 }, { "epoch": 2.614939815219609, "grad_norm": 1.4081878662109375, "learning_rate": 1.2871510290368711e-07, "loss": 0.1002, "step": 112150 }, { "epoch": 2.6151729765964267, "grad_norm": 1.758087158203125, "learning_rate": 1.2863738108561835e-07, "loss": 0.1083, "step": 112160 }, { "epoch": 2.6154061379732445, "grad_norm": 0.9085190892219543, "learning_rate": 1.2855965926754959e-07, "loss": 0.1053, "step": 112170 }, { "epoch": 2.615639299350063, "grad_norm": 3.46134614944458, "learning_rate": 1.2848193744948082e-07, "loss": 0.1161, "step": 112180 }, { "epoch": 2.6158724607268806, "grad_norm": 1.18251633644104, "learning_rate": 1.2840421563141206e-07, "loss": 0.1018, "step": 112190 }, { "epoch": 2.6161056221036985, "grad_norm": 2.3530008792877197, "learning_rate": 1.283264938133433e-07, "loss": 0.1063, "step": 112200 }, { "epoch": 2.6163387834805163, "grad_norm": 1.4103106260299683, "learning_rate": 1.282487719952745e-07, "loss": 0.1128, "step": 112210 }, { "epoch": 2.616571944857334, "grad_norm": 1.7931524515151978, "learning_rate": 1.2817105017720576e-07, "loss": 0.1066, "step": 112220 }, { "epoch": 2.6168051062341524, "grad_norm": 1.717052936553955, "learning_rate": 1.2809332835913697e-07, "loss": 0.1164, "step": 112230 }, { "epoch": 
2.6170382676109702, "grad_norm": 2.0805652141571045, "learning_rate": 1.280156065410682e-07, "loss": 0.0962, "step": 112240 }, { "epoch": 2.617271428987788, "grad_norm": 1.3405681848526, "learning_rate": 1.2793788472299944e-07, "loss": 0.0993, "step": 112250 }, { "epoch": 2.6175045903646064, "grad_norm": 1.9645885229110718, "learning_rate": 1.2786016290493068e-07, "loss": 0.102, "step": 112260 }, { "epoch": 2.6177377517414238, "grad_norm": 1.4787434339523315, "learning_rate": 1.277824410868619e-07, "loss": 0.0967, "step": 112270 }, { "epoch": 2.617970913118242, "grad_norm": 2.257563591003418, "learning_rate": 1.2770471926879315e-07, "loss": 0.1045, "step": 112280 }, { "epoch": 2.61820407449506, "grad_norm": 1.4217004776000977, "learning_rate": 1.2762699745072436e-07, "loss": 0.1022, "step": 112290 }, { "epoch": 2.6184372358718777, "grad_norm": 1.4378420114517212, "learning_rate": 1.275492756326556e-07, "loss": 0.1034, "step": 112300 }, { "epoch": 2.618670397248696, "grad_norm": 1.3155803680419922, "learning_rate": 1.2747155381458683e-07, "loss": 0.1137, "step": 112310 }, { "epoch": 2.618903558625514, "grad_norm": 1.2877832651138306, "learning_rate": 1.2739383199651807e-07, "loss": 0.1119, "step": 112320 }, { "epoch": 2.6191367200023317, "grad_norm": 1.3195990324020386, "learning_rate": 1.2731611017844928e-07, "loss": 0.1016, "step": 112330 }, { "epoch": 2.6193698813791495, "grad_norm": 1.220952033996582, "learning_rate": 1.2723838836038054e-07, "loss": 0.1005, "step": 112340 }, { "epoch": 2.6196030427559673, "grad_norm": 1.3989036083221436, "learning_rate": 1.2716066654231175e-07, "loss": 0.1091, "step": 112350 }, { "epoch": 2.6198362041327856, "grad_norm": 2.209023952484131, "learning_rate": 1.2708294472424298e-07, "loss": 0.1162, "step": 112360 }, { "epoch": 2.6200693655096035, "grad_norm": 1.204715371131897, "learning_rate": 1.2700522290617422e-07, "loss": 0.1026, "step": 112370 }, { "epoch": 2.6203025268864213, "grad_norm": 1.446114182472229, "learning_rate": 
1.2692750108810546e-07, "loss": 0.0964, "step": 112380 }, { "epoch": 2.620535688263239, "grad_norm": 1.3690482378005981, "learning_rate": 1.2684977927003666e-07, "loss": 0.1016, "step": 112390 }, { "epoch": 2.620768849640057, "grad_norm": 2.0810546875, "learning_rate": 1.2677205745196793e-07, "loss": 0.0983, "step": 112400 }, { "epoch": 2.6210020110168752, "grad_norm": 1.6596598625183105, "learning_rate": 1.2669433563389914e-07, "loss": 0.1227, "step": 112410 }, { "epoch": 2.621235172393693, "grad_norm": 2.7712745666503906, "learning_rate": 1.266166138158304e-07, "loss": 0.1117, "step": 112420 }, { "epoch": 2.621468333770511, "grad_norm": 1.6692874431610107, "learning_rate": 1.265388919977616e-07, "loss": 0.1082, "step": 112430 }, { "epoch": 2.6217014951473288, "grad_norm": 2.2866530418395996, "learning_rate": 1.2646117017969284e-07, "loss": 0.0916, "step": 112440 }, { "epoch": 2.6219346565241466, "grad_norm": 1.6444135904312134, "learning_rate": 1.2638344836162408e-07, "loss": 0.0906, "step": 112450 }, { "epoch": 2.622167817900965, "grad_norm": 1.861290693283081, "learning_rate": 1.2630572654355531e-07, "loss": 0.0949, "step": 112460 }, { "epoch": 2.6224009792777827, "grad_norm": 1.5807520151138306, "learning_rate": 1.2622800472548652e-07, "loss": 0.1075, "step": 112470 }, { "epoch": 2.6226341406546005, "grad_norm": 1.4895775318145752, "learning_rate": 1.2615028290741779e-07, "loss": 0.0991, "step": 112480 }, { "epoch": 2.6228673020314184, "grad_norm": 2.0752146244049072, "learning_rate": 1.26072561089349e-07, "loss": 0.0991, "step": 112490 }, { "epoch": 2.623100463408236, "grad_norm": 1.4353523254394531, "learning_rate": 1.2599483927128023e-07, "loss": 0.0991, "step": 112500 }, { "epoch": 2.6233336247850545, "grad_norm": 2.2148244380950928, "learning_rate": 1.2591711745321147e-07, "loss": 0.0981, "step": 112510 }, { "epoch": 2.6235667861618723, "grad_norm": 1.4261521100997925, "learning_rate": 1.258393956351427e-07, "loss": 0.1041, "step": 112520 }, { "epoch": 
2.62379994753869, "grad_norm": 1.734910249710083, "learning_rate": 1.257616738170739e-07, "loss": 0.1008, "step": 112530 }, { "epoch": 2.624033108915508, "grad_norm": 1.430145025253296, "learning_rate": 1.2568395199900517e-07, "loss": 0.1033, "step": 112540 }, { "epoch": 2.624266270292326, "grad_norm": 1.4062044620513916, "learning_rate": 1.2560623018093638e-07, "loss": 0.0929, "step": 112550 }, { "epoch": 2.624499431669144, "grad_norm": 3.062778949737549, "learning_rate": 1.2552850836286762e-07, "loss": 0.0963, "step": 112560 }, { "epoch": 2.624732593045962, "grad_norm": 1.2460787296295166, "learning_rate": 1.2545078654479885e-07, "loss": 0.1042, "step": 112570 }, { "epoch": 2.62496575442278, "grad_norm": 1.3600454330444336, "learning_rate": 1.253730647267301e-07, "loss": 0.1063, "step": 112580 }, { "epoch": 2.6251989157995976, "grad_norm": 2.6971774101257324, "learning_rate": 1.252953429086613e-07, "loss": 0.0985, "step": 112590 }, { "epoch": 2.6254320771764155, "grad_norm": 1.2543776035308838, "learning_rate": 1.2521762109059256e-07, "loss": 0.1068, "step": 112600 }, { "epoch": 2.6256652385532337, "grad_norm": 1.6487420797348022, "learning_rate": 1.2513989927252377e-07, "loss": 0.1118, "step": 112610 }, { "epoch": 2.6258983999300516, "grad_norm": 1.2779525518417358, "learning_rate": 1.2506217745445503e-07, "loss": 0.0992, "step": 112620 }, { "epoch": 2.6261315613068694, "grad_norm": 1.687424659729004, "learning_rate": 1.2498445563638624e-07, "loss": 0.098, "step": 112630 }, { "epoch": 2.6263647226836877, "grad_norm": 1.2431031465530396, "learning_rate": 1.2490673381831748e-07, "loss": 0.098, "step": 112640 }, { "epoch": 2.6265978840605055, "grad_norm": 1.2010825872421265, "learning_rate": 1.248290120002487e-07, "loss": 0.0972, "step": 112650 }, { "epoch": 2.6268310454373234, "grad_norm": 1.350855827331543, "learning_rate": 1.2475129018217992e-07, "loss": 0.1008, "step": 112660 }, { "epoch": 2.627064206814141, "grad_norm": 1.4560511112213135, "learning_rate": 
1.2467356836411116e-07, "loss": 0.1004, "step": 112670 }, { "epoch": 2.627297368190959, "grad_norm": 1.637428641319275, "learning_rate": 1.245958465460424e-07, "loss": 0.1026, "step": 112680 }, { "epoch": 2.6275305295677773, "grad_norm": 1.3758845329284668, "learning_rate": 1.2451812472797363e-07, "loss": 0.1118, "step": 112690 }, { "epoch": 2.627763690944595, "grad_norm": 1.2154321670532227, "learning_rate": 1.2444040290990486e-07, "loss": 0.0959, "step": 112700 }, { "epoch": 2.627996852321413, "grad_norm": 1.1297086477279663, "learning_rate": 1.243626810918361e-07, "loss": 0.0944, "step": 112710 }, { "epoch": 2.628230013698231, "grad_norm": 1.130084753036499, "learning_rate": 1.2428495927376734e-07, "loss": 0.1024, "step": 112720 }, { "epoch": 2.6284631750750487, "grad_norm": 1.907798409461975, "learning_rate": 1.2420723745569855e-07, "loss": 0.0946, "step": 112730 }, { "epoch": 2.628696336451867, "grad_norm": 2.5470471382141113, "learning_rate": 1.2412951563762978e-07, "loss": 0.0948, "step": 112740 }, { "epoch": 2.628929497828685, "grad_norm": 1.5321502685546875, "learning_rate": 1.2405179381956102e-07, "loss": 0.1098, "step": 112750 }, { "epoch": 2.6291626592055026, "grad_norm": 1.5416550636291504, "learning_rate": 1.2397407200149225e-07, "loss": 0.101, "step": 112760 }, { "epoch": 2.6293958205823205, "grad_norm": 3.280447244644165, "learning_rate": 1.238963501834235e-07, "loss": 0.0943, "step": 112770 }, { "epoch": 2.6296289819591383, "grad_norm": 1.08127760887146, "learning_rate": 1.2381862836535472e-07, "loss": 0.0956, "step": 112780 }, { "epoch": 2.6298621433359566, "grad_norm": 1.6948884725570679, "learning_rate": 1.2374090654728593e-07, "loss": 0.1142, "step": 112790 }, { "epoch": 2.6300953047127744, "grad_norm": 1.325183629989624, "learning_rate": 1.2366318472921717e-07, "loss": 0.1051, "step": 112800 }, { "epoch": 2.6303284660895923, "grad_norm": 1.5574432611465454, "learning_rate": 1.235854629111484e-07, "loss": 0.1007, "step": 112810 }, { "epoch": 
2.63056162746641, "grad_norm": 1.6463580131530762, "learning_rate": 1.2350774109307964e-07, "loss": 0.1054, "step": 112820 }, { "epoch": 2.630794788843228, "grad_norm": 1.3154211044311523, "learning_rate": 1.2343001927501088e-07, "loss": 0.1, "step": 112830 }, { "epoch": 2.631027950220046, "grad_norm": 3.6571836471557617, "learning_rate": 1.233522974569421e-07, "loss": 0.1029, "step": 112840 }, { "epoch": 2.631261111596864, "grad_norm": 1.630454182624817, "learning_rate": 1.2327457563887335e-07, "loss": 0.1043, "step": 112850 }, { "epoch": 2.631494272973682, "grad_norm": 1.5519566535949707, "learning_rate": 1.2319685382080456e-07, "loss": 0.1047, "step": 112860 }, { "epoch": 2.6317274343504997, "grad_norm": 1.45756196975708, "learning_rate": 1.231191320027358e-07, "loss": 0.0973, "step": 112870 }, { "epoch": 2.6319605957273176, "grad_norm": 2.568368434906006, "learning_rate": 1.2304141018466703e-07, "loss": 0.1145, "step": 112880 }, { "epoch": 2.632193757104136, "grad_norm": 1.8960498571395874, "learning_rate": 1.2296368836659826e-07, "loss": 0.1031, "step": 112890 }, { "epoch": 2.6324269184809537, "grad_norm": 1.8185561895370483, "learning_rate": 1.228859665485295e-07, "loss": 0.0994, "step": 112900 }, { "epoch": 2.6326600798577715, "grad_norm": 1.507053017616272, "learning_rate": 1.2280824473046073e-07, "loss": 0.1133, "step": 112910 }, { "epoch": 2.6328932412345893, "grad_norm": 1.5610604286193848, "learning_rate": 1.2273052291239194e-07, "loss": 0.1075, "step": 112920 }, { "epoch": 2.633126402611407, "grad_norm": 2.436309576034546, "learning_rate": 1.2265280109432318e-07, "loss": 0.103, "step": 112930 }, { "epoch": 2.6333595639882255, "grad_norm": 0.945306658744812, "learning_rate": 1.2257507927625442e-07, "loss": 0.0976, "step": 112940 }, { "epoch": 2.6335927253650433, "grad_norm": 2.551158905029297, "learning_rate": 1.2249735745818565e-07, "loss": 0.1118, "step": 112950 }, { "epoch": 2.633825886741861, "grad_norm": 1.606138825416565, "learning_rate": 
1.2241963564011689e-07, "loss": 0.1053, "step": 112960 }, { "epoch": 2.6340590481186794, "grad_norm": 1.6758943796157837, "learning_rate": 1.2234191382204812e-07, "loss": 0.1004, "step": 112970 }, { "epoch": 2.634292209495497, "grad_norm": 1.4527901411056519, "learning_rate": 1.2226419200397936e-07, "loss": 0.1038, "step": 112980 }, { "epoch": 2.634525370872315, "grad_norm": 1.050073266029358, "learning_rate": 1.2218647018591057e-07, "loss": 0.1054, "step": 112990 }, { "epoch": 2.634758532249133, "grad_norm": 1.8070532083511353, "learning_rate": 1.221087483678418e-07, "loss": 0.1143, "step": 113000 }, { "epoch": 2.6349916936259508, "grad_norm": 2.962803602218628, "learning_rate": 1.2203102654977304e-07, "loss": 0.1031, "step": 113010 }, { "epoch": 2.635224855002769, "grad_norm": 1.6603339910507202, "learning_rate": 1.2195330473170427e-07, "loss": 0.1113, "step": 113020 }, { "epoch": 2.635458016379587, "grad_norm": 1.3644464015960693, "learning_rate": 1.218755829136355e-07, "loss": 0.1004, "step": 113030 }, { "epoch": 2.6356911777564047, "grad_norm": 3.515462875366211, "learning_rate": 1.2179786109556675e-07, "loss": 0.1044, "step": 113040 }, { "epoch": 2.6359243391332225, "grad_norm": 1.8087356090545654, "learning_rate": 1.2172013927749798e-07, "loss": 0.1052, "step": 113050 }, { "epoch": 2.6361575005100404, "grad_norm": 1.7895138263702393, "learning_rate": 1.216424174594292e-07, "loss": 0.0967, "step": 113060 }, { "epoch": 2.6363906618868587, "grad_norm": 1.350508213043213, "learning_rate": 1.2156469564136043e-07, "loss": 0.0855, "step": 113070 }, { "epoch": 2.6366238232636765, "grad_norm": 1.6960091590881348, "learning_rate": 1.2148697382329166e-07, "loss": 0.1041, "step": 113080 }, { "epoch": 2.6368569846404943, "grad_norm": 3.5625219345092773, "learning_rate": 1.2141702418702976e-07, "loss": 0.1062, "step": 113090 }, { "epoch": 2.637090146017312, "grad_norm": 3.170208215713501, "learning_rate": 1.21339302368961e-07, "loss": 0.099, "step": 113100 }, { "epoch": 
2.63732330739413, "grad_norm": 1.8729071617126465, "learning_rate": 1.2126158055089223e-07, "loss": 0.1, "step": 113110 }, { "epoch": 2.6375564687709483, "grad_norm": 1.7354398965835571, "learning_rate": 1.2118385873282347e-07, "loss": 0.1016, "step": 113120 }, { "epoch": 2.637789630147766, "grad_norm": 1.699614405632019, "learning_rate": 1.211061369147547e-07, "loss": 0.0983, "step": 113130 }, { "epoch": 2.638022791524584, "grad_norm": 1.7439223527908325, "learning_rate": 1.2102841509668594e-07, "loss": 0.1113, "step": 113140 }, { "epoch": 2.638255952901402, "grad_norm": 2.184187650680542, "learning_rate": 1.2095069327861717e-07, "loss": 0.105, "step": 113150 }, { "epoch": 2.6384891142782196, "grad_norm": 1.5386714935302734, "learning_rate": 1.2087297146054838e-07, "loss": 0.1074, "step": 113160 }, { "epoch": 2.638722275655038, "grad_norm": 1.2844287157058716, "learning_rate": 1.2079524964247962e-07, "loss": 0.0877, "step": 113170 }, { "epoch": 2.6389554370318558, "grad_norm": 3.3515231609344482, "learning_rate": 1.2072530000621775e-07, "loss": 0.1028, "step": 113180 }, { "epoch": 2.6391885984086736, "grad_norm": 1.175418496131897, "learning_rate": 1.2064757818814898e-07, "loss": 0.1001, "step": 113190 }, { "epoch": 2.6394217597854914, "grad_norm": 3.3830440044403076, "learning_rate": 1.205698563700802e-07, "loss": 0.0999, "step": 113200 }, { "epoch": 2.6396549211623093, "grad_norm": 2.4774367809295654, "learning_rate": 1.2049213455201143e-07, "loss": 0.0935, "step": 113210 }, { "epoch": 2.6398880825391275, "grad_norm": 1.1923818588256836, "learning_rate": 1.2041441273394266e-07, "loss": 0.1062, "step": 113220 }, { "epoch": 2.6401212439159454, "grad_norm": 1.5359011888504028, "learning_rate": 1.203366909158739e-07, "loss": 0.1075, "step": 113230 }, { "epoch": 2.640354405292763, "grad_norm": 1.0380343198776245, "learning_rate": 1.2025896909780513e-07, "loss": 0.0984, "step": 113240 }, { "epoch": 2.640587566669581, "grad_norm": 1.5295796394348145, "learning_rate": 
1.2018124727973637e-07, "loss": 0.11, "step": 113250 }, { "epoch": 2.640820728046399, "grad_norm": 1.3737255334854126, "learning_rate": 1.2010352546166758e-07, "loss": 0.113, "step": 113260 }, { "epoch": 2.641053889423217, "grad_norm": 2.8920624256134033, "learning_rate": 1.200258036435988e-07, "loss": 0.0986, "step": 113270 }, { "epoch": 2.641287050800035, "grad_norm": 1.3389452695846558, "learning_rate": 1.1994808182553005e-07, "loss": 0.1032, "step": 113280 }, { "epoch": 2.641520212176853, "grad_norm": 1.4243777990341187, "learning_rate": 1.1987036000746128e-07, "loss": 0.1054, "step": 113290 }, { "epoch": 2.6417533735536707, "grad_norm": 2.0193426609039307, "learning_rate": 1.1979263818939252e-07, "loss": 0.1042, "step": 113300 }, { "epoch": 2.6419865349304885, "grad_norm": 1.3117645978927612, "learning_rate": 1.1971491637132376e-07, "loss": 0.1035, "step": 113310 }, { "epoch": 2.642219696307307, "grad_norm": 2.0054702758789062, "learning_rate": 1.19637194553255e-07, "loss": 0.1007, "step": 113320 }, { "epoch": 2.6424528576841246, "grad_norm": 1.4285030364990234, "learning_rate": 1.195594727351862e-07, "loss": 0.1088, "step": 113330 }, { "epoch": 2.6426860190609425, "grad_norm": 1.7472189664840698, "learning_rate": 1.1948175091711744e-07, "loss": 0.1058, "step": 113340 }, { "epoch": 2.6429191804377608, "grad_norm": 2.1199193000793457, "learning_rate": 1.1940402909904867e-07, "loss": 0.0941, "step": 113350 }, { "epoch": 2.643152341814578, "grad_norm": 1.906668782234192, "learning_rate": 1.193263072809799e-07, "loss": 0.1019, "step": 113360 }, { "epoch": 2.6433855031913964, "grad_norm": 1.6353920698165894, "learning_rate": 1.1924858546291114e-07, "loss": 0.1043, "step": 113370 }, { "epoch": 2.6436186645682143, "grad_norm": 3.76737380027771, "learning_rate": 1.1917086364484237e-07, "loss": 0.1025, "step": 113380 }, { "epoch": 2.643851825945032, "grad_norm": 2.1553971767425537, "learning_rate": 1.190931418267736e-07, "loss": 0.1026, "step": 113390 }, { "epoch": 
2.6440849873218504, "grad_norm": 1.3061329126358032, "learning_rate": 1.1901542000870484e-07, "loss": 0.1114, "step": 113400 }, { "epoch": 2.644318148698668, "grad_norm": 1.3535168170928955, "learning_rate": 1.1893769819063606e-07, "loss": 0.1029, "step": 113410 }, { "epoch": 2.644551310075486, "grad_norm": 1.2356834411621094, "learning_rate": 1.188599763725673e-07, "loss": 0.1106, "step": 113420 }, { "epoch": 2.644784471452304, "grad_norm": 1.161731481552124, "learning_rate": 1.1878225455449853e-07, "loss": 0.0857, "step": 113430 }, { "epoch": 2.6450176328291217, "grad_norm": 1.4350546598434448, "learning_rate": 1.1870453273642977e-07, "loss": 0.0962, "step": 113440 }, { "epoch": 2.64525079420594, "grad_norm": 2.181447744369507, "learning_rate": 1.1862681091836099e-07, "loss": 0.1031, "step": 113450 }, { "epoch": 2.645483955582758, "grad_norm": 1.2071819305419922, "learning_rate": 1.1854908910029222e-07, "loss": 0.0966, "step": 113460 }, { "epoch": 2.6457171169595757, "grad_norm": 1.2566014528274536, "learning_rate": 1.1847136728222346e-07, "loss": 0.0982, "step": 113470 }, { "epoch": 2.6459502783363935, "grad_norm": 2.7930781841278076, "learning_rate": 1.1839364546415468e-07, "loss": 0.1033, "step": 113480 }, { "epoch": 2.6461834397132113, "grad_norm": 1.761504888534546, "learning_rate": 1.1831592364608592e-07, "loss": 0.1069, "step": 113490 }, { "epoch": 2.6464166010900296, "grad_norm": 1.6723358631134033, "learning_rate": 1.1823820182801715e-07, "loss": 0.0967, "step": 113500 }, { "epoch": 2.6466497624668475, "grad_norm": 1.6107820272445679, "learning_rate": 1.1816048000994838e-07, "loss": 0.1012, "step": 113510 }, { "epoch": 2.6468829238436653, "grad_norm": 1.7716504335403442, "learning_rate": 1.1808275819187961e-07, "loss": 0.097, "step": 113520 }, { "epoch": 2.647116085220483, "grad_norm": 1.6493192911148071, "learning_rate": 1.1800503637381085e-07, "loss": 0.105, "step": 113530 }, { "epoch": 2.647349246597301, "grad_norm": 1.9589829444885254, 
"learning_rate": 1.1792731455574207e-07, "loss": 0.1008, "step": 113540 }, { "epoch": 2.6475824079741193, "grad_norm": 1.049735188484192, "learning_rate": 1.178495927376733e-07, "loss": 0.1042, "step": 113550 }, { "epoch": 2.647815569350937, "grad_norm": 1.1701730489730835, "learning_rate": 1.1777187091960454e-07, "loss": 0.0881, "step": 113560 }, { "epoch": 2.648048730727755, "grad_norm": 3.038384199142456, "learning_rate": 1.1769414910153578e-07, "loss": 0.1048, "step": 113570 }, { "epoch": 2.6482818921045728, "grad_norm": 2.457817792892456, "learning_rate": 1.17616427283467e-07, "loss": 0.0981, "step": 113580 }, { "epoch": 2.6485150534813906, "grad_norm": 1.8232228755950928, "learning_rate": 1.1753870546539824e-07, "loss": 0.0999, "step": 113590 }, { "epoch": 2.648748214858209, "grad_norm": 1.6202633380889893, "learning_rate": 1.1746098364732947e-07, "loss": 0.0923, "step": 113600 }, { "epoch": 2.6489813762350267, "grad_norm": 2.847529649734497, "learning_rate": 1.173832618292607e-07, "loss": 0.1027, "step": 113610 }, { "epoch": 2.6492145376118446, "grad_norm": 1.730877161026001, "learning_rate": 1.1730554001119193e-07, "loss": 0.1067, "step": 113620 }, { "epoch": 2.6494476989886624, "grad_norm": 1.188609004020691, "learning_rate": 1.1722781819312317e-07, "loss": 0.1137, "step": 113630 }, { "epoch": 2.6496808603654802, "grad_norm": 1.566346526145935, "learning_rate": 1.1715009637505439e-07, "loss": 0.1035, "step": 113640 }, { "epoch": 2.6499140217422985, "grad_norm": 3.2685585021972656, "learning_rate": 1.1707237455698562e-07, "loss": 0.109, "step": 113650 }, { "epoch": 2.6501471831191163, "grad_norm": 1.14082670211792, "learning_rate": 1.1699465273891686e-07, "loss": 0.1057, "step": 113660 }, { "epoch": 2.650380344495934, "grad_norm": 1.44966721534729, "learning_rate": 1.169169309208481e-07, "loss": 0.0939, "step": 113670 }, { "epoch": 2.650613505872752, "grad_norm": 1.4712601900100708, "learning_rate": 1.1683920910277932e-07, "loss": 0.0946, "step": 113680 }, 
{ "epoch": 2.65084666724957, "grad_norm": 2.825632333755493, "learning_rate": 1.1676148728471055e-07, "loss": 0.1025, "step": 113690 }, { "epoch": 2.651079828626388, "grad_norm": 2.2089316844940186, "learning_rate": 1.1668376546664179e-07, "loss": 0.1069, "step": 113700 }, { "epoch": 2.651312990003206, "grad_norm": 2.5804004669189453, "learning_rate": 1.1660604364857301e-07, "loss": 0.096, "step": 113710 }, { "epoch": 2.651546151380024, "grad_norm": 2.036271095275879, "learning_rate": 1.1652832183050425e-07, "loss": 0.11, "step": 113720 }, { "epoch": 2.651779312756842, "grad_norm": 1.0654456615447998, "learning_rate": 1.1645060001243548e-07, "loss": 0.099, "step": 113730 }, { "epoch": 2.65201247413366, "grad_norm": 1.659115195274353, "learning_rate": 1.163728781943667e-07, "loss": 0.0881, "step": 113740 }, { "epoch": 2.6522456355104778, "grad_norm": 1.8418176174163818, "learning_rate": 1.1629515637629794e-07, "loss": 0.1088, "step": 113750 }, { "epoch": 2.6524787968872956, "grad_norm": 1.6656451225280762, "learning_rate": 1.1621743455822918e-07, "loss": 0.1112, "step": 113760 }, { "epoch": 2.6527119582641134, "grad_norm": 1.363006353378296, "learning_rate": 1.1613971274016041e-07, "loss": 0.1179, "step": 113770 }, { "epoch": 2.6529451196409317, "grad_norm": 1.697909951210022, "learning_rate": 1.1606199092209163e-07, "loss": 0.1024, "step": 113780 }, { "epoch": 2.6531782810177496, "grad_norm": 2.486633777618408, "learning_rate": 1.1598426910402287e-07, "loss": 0.0946, "step": 113790 }, { "epoch": 2.6534114423945674, "grad_norm": 1.354451060295105, "learning_rate": 1.159065472859541e-07, "loss": 0.1001, "step": 113800 }, { "epoch": 2.6536446037713852, "grad_norm": 1.5877684354782104, "learning_rate": 1.1582882546788533e-07, "loss": 0.113, "step": 113810 }, { "epoch": 2.653877765148203, "grad_norm": 1.2424702644348145, "learning_rate": 1.1575110364981656e-07, "loss": 0.1035, "step": 113820 }, { "epoch": 2.6541109265250213, "grad_norm": 1.19199538230896, 
"learning_rate": 1.156733818317478e-07, "loss": 0.1067, "step": 113830 }, { "epoch": 2.654344087901839, "grad_norm": 1.7077399492263794, "learning_rate": 1.1559566001367904e-07, "loss": 0.1034, "step": 113840 }, { "epoch": 2.654577249278657, "grad_norm": 1.3569364547729492, "learning_rate": 1.1551793819561027e-07, "loss": 0.1012, "step": 113850 }, { "epoch": 2.654810410655475, "grad_norm": 2.1133227348327637, "learning_rate": 1.154402163775415e-07, "loss": 0.1062, "step": 113860 }, { "epoch": 2.6550435720322927, "grad_norm": 2.085597038269043, "learning_rate": 1.1536249455947274e-07, "loss": 0.1074, "step": 113870 }, { "epoch": 2.655276733409111, "grad_norm": 1.3968459367752075, "learning_rate": 1.1528477274140396e-07, "loss": 0.1021, "step": 113880 }, { "epoch": 2.655509894785929, "grad_norm": 2.216546058654785, "learning_rate": 1.152070509233352e-07, "loss": 0.1025, "step": 113890 }, { "epoch": 2.6557430561627466, "grad_norm": 1.2064106464385986, "learning_rate": 1.1512932910526644e-07, "loss": 0.0998, "step": 113900 }, { "epoch": 2.6559762175395645, "grad_norm": 1.2482459545135498, "learning_rate": 1.1505160728719766e-07, "loss": 0.104, "step": 113910 }, { "epoch": 2.6562093789163823, "grad_norm": 3.956207036972046, "learning_rate": 1.149738854691289e-07, "loss": 0.1103, "step": 113920 }, { "epoch": 2.6564425402932006, "grad_norm": 1.0935757160186768, "learning_rate": 1.1489616365106013e-07, "loss": 0.1004, "step": 113930 }, { "epoch": 2.6566757016700184, "grad_norm": 2.2330739498138428, "learning_rate": 1.1481844183299135e-07, "loss": 0.1081, "step": 113940 }, { "epoch": 2.6569088630468363, "grad_norm": 1.3783440589904785, "learning_rate": 1.1474072001492259e-07, "loss": 0.1135, "step": 113950 }, { "epoch": 2.657142024423654, "grad_norm": 2.6007204055786133, "learning_rate": 1.1466299819685382e-07, "loss": 0.108, "step": 113960 }, { "epoch": 2.657375185800472, "grad_norm": 2.067258358001709, "learning_rate": 1.1458527637878506e-07, "loss": 0.0914, "step": 
113970 }, { "epoch": 2.65760834717729, "grad_norm": 1.7173137664794922, "learning_rate": 1.1450755456071628e-07, "loss": 0.0942, "step": 113980 }, { "epoch": 2.657841508554108, "grad_norm": 2.314034938812256, "learning_rate": 1.1442983274264752e-07, "loss": 0.0928, "step": 113990 }, { "epoch": 2.658074669930926, "grad_norm": 2.9179115295410156, "learning_rate": 1.1435211092457875e-07, "loss": 0.1091, "step": 114000 }, { "epoch": 2.6583078313077437, "grad_norm": 2.15610408782959, "learning_rate": 1.1427438910650998e-07, "loss": 0.1023, "step": 114010 }, { "epoch": 2.6585409926845616, "grad_norm": 1.483035922050476, "learning_rate": 1.1419666728844121e-07, "loss": 0.1075, "step": 114020 }, { "epoch": 2.65877415406138, "grad_norm": 1.178507685661316, "learning_rate": 1.1411894547037245e-07, "loss": 0.1014, "step": 114030 }, { "epoch": 2.6590073154381977, "grad_norm": 1.502898931503296, "learning_rate": 1.1404122365230367e-07, "loss": 0.0986, "step": 114040 }, { "epoch": 2.6592404768150155, "grad_norm": 3.5413126945495605, "learning_rate": 1.139635018342349e-07, "loss": 0.1091, "step": 114050 }, { "epoch": 2.659473638191834, "grad_norm": 2.2694835662841797, "learning_rate": 1.1388578001616614e-07, "loss": 0.111, "step": 114060 }, { "epoch": 2.659706799568651, "grad_norm": 1.401226282119751, "learning_rate": 1.1380805819809738e-07, "loss": 0.0946, "step": 114070 }, { "epoch": 2.6599399609454695, "grad_norm": 2.552462100982666, "learning_rate": 1.137303363800286e-07, "loss": 0.1091, "step": 114080 }, { "epoch": 2.6601731223222873, "grad_norm": 1.6425353288650513, "learning_rate": 1.1365261456195983e-07, "loss": 0.1069, "step": 114090 }, { "epoch": 2.660406283699105, "grad_norm": 1.9120651483535767, "learning_rate": 1.1357489274389107e-07, "loss": 0.1011, "step": 114100 }, { "epoch": 2.6606394450759234, "grad_norm": 2.2492125034332275, "learning_rate": 1.1349717092582229e-07, "loss": 0.1089, "step": 114110 }, { "epoch": 2.6608726064527413, "grad_norm": 1.1884398460388184, 
"learning_rate": 1.1341944910775353e-07, "loss": 0.0936, "step": 114120 }, { "epoch": 2.661105767829559, "grad_norm": 1.462022066116333, "learning_rate": 1.1334172728968476e-07, "loss": 0.0994, "step": 114130 }, { "epoch": 2.661338929206377, "grad_norm": 1.8942179679870605, "learning_rate": 1.1326400547161599e-07, "loss": 0.1008, "step": 114140 }, { "epoch": 2.6615720905831948, "grad_norm": 1.5436700582504272, "learning_rate": 1.1318628365354722e-07, "loss": 0.1026, "step": 114150 }, { "epoch": 2.661805251960013, "grad_norm": 1.0572080612182617, "learning_rate": 1.1310856183547846e-07, "loss": 0.1093, "step": 114160 }, { "epoch": 2.662038413336831, "grad_norm": 1.527656078338623, "learning_rate": 1.130308400174097e-07, "loss": 0.1039, "step": 114170 }, { "epoch": 2.6622715747136487, "grad_norm": 1.1102161407470703, "learning_rate": 1.1295311819934092e-07, "loss": 0.1065, "step": 114180 }, { "epoch": 2.6625047360904666, "grad_norm": 1.324817180633545, "learning_rate": 1.1287539638127215e-07, "loss": 0.1029, "step": 114190 }, { "epoch": 2.6627378974672844, "grad_norm": 1.5223809480667114, "learning_rate": 1.1279767456320339e-07, "loss": 0.0979, "step": 114200 }, { "epoch": 2.6629710588441027, "grad_norm": 1.8553690910339355, "learning_rate": 1.1271995274513461e-07, "loss": 0.1067, "step": 114210 }, { "epoch": 2.6632042202209205, "grad_norm": 2.669212579727173, "learning_rate": 1.1264223092706585e-07, "loss": 0.1172, "step": 114220 }, { "epoch": 2.6634373815977384, "grad_norm": 1.5172817707061768, "learning_rate": 1.1256450910899708e-07, "loss": 0.0963, "step": 114230 }, { "epoch": 2.663670542974556, "grad_norm": 2.8289341926574707, "learning_rate": 1.124867872909283e-07, "loss": 0.0997, "step": 114240 }, { "epoch": 2.663903704351374, "grad_norm": 2.233863115310669, "learning_rate": 1.1240906547285954e-07, "loss": 0.0912, "step": 114250 }, { "epoch": 2.6641368657281923, "grad_norm": 1.8431824445724487, "learning_rate": 1.1233134365479078e-07, "loss": 0.0954, "step": 
114260 }, { "epoch": 2.66437002710501, "grad_norm": 1.6624529361724854, "learning_rate": 1.12253621836722e-07, "loss": 0.111, "step": 114270 }, { "epoch": 2.664603188481828, "grad_norm": 1.6953693628311157, "learning_rate": 1.1217590001865323e-07, "loss": 0.1093, "step": 114280 }, { "epoch": 2.664836349858646, "grad_norm": 2.2230048179626465, "learning_rate": 1.1209817820058447e-07, "loss": 0.1099, "step": 114290 }, { "epoch": 2.6650695112354637, "grad_norm": 1.7626830339431763, "learning_rate": 1.120204563825157e-07, "loss": 0.1052, "step": 114300 }, { "epoch": 2.665302672612282, "grad_norm": 1.4728336334228516, "learning_rate": 1.1194273456444693e-07, "loss": 0.1113, "step": 114310 }, { "epoch": 2.6655358339890998, "grad_norm": 1.2178595066070557, "learning_rate": 1.1186501274637816e-07, "loss": 0.1123, "step": 114320 }, { "epoch": 2.6657689953659176, "grad_norm": 2.016934394836426, "learning_rate": 1.117872909283094e-07, "loss": 0.1103, "step": 114330 }, { "epoch": 2.6660021567427354, "grad_norm": 1.618549108505249, "learning_rate": 1.1170956911024062e-07, "loss": 0.1016, "step": 114340 }, { "epoch": 2.6662353181195533, "grad_norm": 1.6238280534744263, "learning_rate": 1.1163184729217186e-07, "loss": 0.0991, "step": 114350 }, { "epoch": 2.6664684794963716, "grad_norm": 3.4783718585968018, "learning_rate": 1.1155412547410309e-07, "loss": 0.1014, "step": 114360 }, { "epoch": 2.6667016408731894, "grad_norm": 1.792203664779663, "learning_rate": 1.1147640365603431e-07, "loss": 0.095, "step": 114370 }, { "epoch": 2.6669348022500072, "grad_norm": 1.6072794198989868, "learning_rate": 1.1139868183796555e-07, "loss": 0.0959, "step": 114380 }, { "epoch": 2.667167963626825, "grad_norm": 1.560250163078308, "learning_rate": 1.1132096001989679e-07, "loss": 0.1029, "step": 114390 }, { "epoch": 2.667401125003643, "grad_norm": 1.9700736999511719, "learning_rate": 1.1124323820182802e-07, "loss": 0.101, "step": 114400 }, { "epoch": 2.667634286380461, "grad_norm": 2.581894636154175, 
"learning_rate": 1.1116551638375924e-07, "loss": 0.0981, "step": 114410 }, { "epoch": 2.667867447757279, "grad_norm": 1.72541081905365, "learning_rate": 1.1108779456569048e-07, "loss": 0.1074, "step": 114420 }, { "epoch": 2.668100609134097, "grad_norm": 1.189252257347107, "learning_rate": 1.1101007274762172e-07, "loss": 0.0863, "step": 114430 }, { "epoch": 2.668333770510915, "grad_norm": 2.5330615043640137, "learning_rate": 1.1093235092955294e-07, "loss": 0.1074, "step": 114440 }, { "epoch": 2.6685669318877325, "grad_norm": 1.5763293504714966, "learning_rate": 1.1085462911148417e-07, "loss": 0.1033, "step": 114450 }, { "epoch": 2.668800093264551, "grad_norm": 1.1033649444580078, "learning_rate": 1.1077690729341541e-07, "loss": 0.0879, "step": 114460 }, { "epoch": 2.6690332546413686, "grad_norm": 2.2235522270202637, "learning_rate": 1.1069918547534663e-07, "loss": 0.0998, "step": 114470 }, { "epoch": 2.6692664160181865, "grad_norm": 1.0543133020401, "learning_rate": 1.1062146365727787e-07, "loss": 0.1035, "step": 114480 }, { "epoch": 2.6694995773950048, "grad_norm": 2.147421360015869, "learning_rate": 1.105437418392091e-07, "loss": 0.0893, "step": 114490 }, { "epoch": 2.6697327387718226, "grad_norm": 1.4033316373825073, "learning_rate": 1.1046602002114034e-07, "loss": 0.1066, "step": 114500 }, { "epoch": 2.6699659001486404, "grad_norm": 1.262558937072754, "learning_rate": 1.1038829820307156e-07, "loss": 0.1181, "step": 114510 }, { "epoch": 2.6701990615254583, "grad_norm": 2.8250815868377686, "learning_rate": 1.103105763850028e-07, "loss": 0.1069, "step": 114520 }, { "epoch": 2.670432222902276, "grad_norm": 2.9840452671051025, "learning_rate": 1.1023285456693403e-07, "loss": 0.1072, "step": 114530 }, { "epoch": 2.6706653842790944, "grad_norm": 1.2769464254379272, "learning_rate": 1.1015513274886525e-07, "loss": 0.0932, "step": 114540 }, { "epoch": 2.6708985456559122, "grad_norm": 1.0407177209854126, "learning_rate": 1.1007741093079649e-07, "loss": 0.0935, "step": 
114550 }, { "epoch": 2.67113170703273, "grad_norm": 1.8218942880630493, "learning_rate": 1.0999968911272773e-07, "loss": 0.1034, "step": 114560 }, { "epoch": 2.671364868409548, "grad_norm": 2.1962618827819824, "learning_rate": 1.0992196729465895e-07, "loss": 0.1049, "step": 114570 }, { "epoch": 2.6715980297863657, "grad_norm": 1.281095266342163, "learning_rate": 1.0984424547659018e-07, "loss": 0.1029, "step": 114580 }, { "epoch": 2.671831191163184, "grad_norm": 1.3734952211380005, "learning_rate": 1.0976652365852142e-07, "loss": 0.0989, "step": 114590 }, { "epoch": 2.672064352540002, "grad_norm": 1.189916968345642, "learning_rate": 1.0968880184045266e-07, "loss": 0.1094, "step": 114600 }, { "epoch": 2.6722975139168197, "grad_norm": 1.6368991136550903, "learning_rate": 1.0961108002238388e-07, "loss": 0.1054, "step": 114610 }, { "epoch": 2.6725306752936375, "grad_norm": 1.8844232559204102, "learning_rate": 1.0953335820431511e-07, "loss": 0.1012, "step": 114620 }, { "epoch": 2.6727638366704554, "grad_norm": 1.7536253929138184, "learning_rate": 1.0945563638624635e-07, "loss": 0.1052, "step": 114630 }, { "epoch": 2.6729969980472736, "grad_norm": 2.1192495822906494, "learning_rate": 1.0937791456817757e-07, "loss": 0.1052, "step": 114640 }, { "epoch": 2.6732301594240915, "grad_norm": 1.129441738128662, "learning_rate": 1.0930019275010881e-07, "loss": 0.1112, "step": 114650 }, { "epoch": 2.6734633208009093, "grad_norm": 1.5979646444320679, "learning_rate": 1.0922247093204004e-07, "loss": 0.1, "step": 114660 }, { "epoch": 2.673696482177727, "grad_norm": 1.5871851444244385, "learning_rate": 1.0914474911397127e-07, "loss": 0.1024, "step": 114670 }, { "epoch": 2.673929643554545, "grad_norm": 2.7221882343292236, "learning_rate": 1.090670272959025e-07, "loss": 0.1052, "step": 114680 }, { "epoch": 2.6741628049313633, "grad_norm": 1.3913780450820923, "learning_rate": 1.0898930547783374e-07, "loss": 0.1036, "step": 114690 }, { "epoch": 2.674395966308181, "grad_norm": 
1.3545724153518677, "learning_rate": 1.0891158365976496e-07, "loss": 0.0953, "step": 114700 }, { "epoch": 2.674629127684999, "grad_norm": 1.5813140869140625, "learning_rate": 1.088338618416962e-07, "loss": 0.1019, "step": 114710 }, { "epoch": 2.674862289061817, "grad_norm": 2.3705365657806396, "learning_rate": 1.0875614002362743e-07, "loss": 0.1081, "step": 114720 }, { "epoch": 2.6750954504386346, "grad_norm": 2.577394485473633, "learning_rate": 1.0867841820555867e-07, "loss": 0.1067, "step": 114730 }, { "epoch": 2.675328611815453, "grad_norm": 3.5033206939697266, "learning_rate": 1.0860069638748989e-07, "loss": 0.1089, "step": 114740 }, { "epoch": 2.6755617731922707, "grad_norm": 1.849941611289978, "learning_rate": 1.0852297456942112e-07, "loss": 0.1144, "step": 114750 }, { "epoch": 2.6757949345690886, "grad_norm": 2.081819772720337, "learning_rate": 1.0844525275135236e-07, "loss": 0.0986, "step": 114760 }, { "epoch": 2.6760280959459064, "grad_norm": 1.4923534393310547, "learning_rate": 1.0836753093328358e-07, "loss": 0.1071, "step": 114770 }, { "epoch": 2.6762612573227242, "grad_norm": 1.2424224615097046, "learning_rate": 1.0828980911521482e-07, "loss": 0.1107, "step": 114780 }, { "epoch": 2.6764944186995425, "grad_norm": 1.1043291091918945, "learning_rate": 1.0821208729714605e-07, "loss": 0.1052, "step": 114790 }, { "epoch": 2.6767275800763604, "grad_norm": 1.6883612871170044, "learning_rate": 1.0813436547907728e-07, "loss": 0.1044, "step": 114800 }, { "epoch": 2.676960741453178, "grad_norm": 1.6181604862213135, "learning_rate": 1.0805664366100851e-07, "loss": 0.1007, "step": 114810 }, { "epoch": 2.6771939028299965, "grad_norm": 1.5253809690475464, "learning_rate": 1.0797892184293975e-07, "loss": 0.0985, "step": 114820 }, { "epoch": 2.677427064206814, "grad_norm": 1.2895770072937012, "learning_rate": 1.0790120002487098e-07, "loss": 0.1028, "step": 114830 }, { "epoch": 2.677660225583632, "grad_norm": 1.099980354309082, "learning_rate": 1.078234782068022e-07, 
"loss": 0.0907, "step": 114840 }, { "epoch": 2.67789338696045, "grad_norm": 1.5954784154891968, "learning_rate": 1.0774575638873344e-07, "loss": 0.0963, "step": 114850 }, { "epoch": 2.678126548337268, "grad_norm": 1.8798410892486572, "learning_rate": 1.0766803457066468e-07, "loss": 0.1084, "step": 114860 }, { "epoch": 2.678359709714086, "grad_norm": 2.734501838684082, "learning_rate": 1.075903127525959e-07, "loss": 0.0955, "step": 114870 }, { "epoch": 2.678592871090904, "grad_norm": 1.484148621559143, "learning_rate": 1.0751259093452714e-07, "loss": 0.0973, "step": 114880 }, { "epoch": 2.6788260324677218, "grad_norm": 1.5491567850112915, "learning_rate": 1.0743486911645837e-07, "loss": 0.0997, "step": 114890 }, { "epoch": 2.6790591938445396, "grad_norm": 1.3648492097854614, "learning_rate": 1.073571472983896e-07, "loss": 0.1133, "step": 114900 }, { "epoch": 2.6792923552213574, "grad_norm": 1.3215463161468506, "learning_rate": 1.0727942548032083e-07, "loss": 0.094, "step": 114910 }, { "epoch": 2.6795255165981757, "grad_norm": 1.4419302940368652, "learning_rate": 1.0720170366225207e-07, "loss": 0.1004, "step": 114920 }, { "epoch": 2.6797586779749936, "grad_norm": 1.5424811840057373, "learning_rate": 1.071239818441833e-07, "loss": 0.11, "step": 114930 }, { "epoch": 2.6799918393518114, "grad_norm": 2.1494696140289307, "learning_rate": 1.0704626002611452e-07, "loss": 0.0916, "step": 114940 }, { "epoch": 2.6802250007286292, "grad_norm": 1.8731549978256226, "learning_rate": 1.0696853820804576e-07, "loss": 0.0914, "step": 114950 }, { "epoch": 2.680458162105447, "grad_norm": 1.0187609195709229, "learning_rate": 1.06890816389977e-07, "loss": 0.0914, "step": 114960 }, { "epoch": 2.6806913234822654, "grad_norm": 1.2967731952667236, "learning_rate": 1.0681309457190822e-07, "loss": 0.1075, "step": 114970 }, { "epoch": 2.680924484859083, "grad_norm": 1.145356297492981, "learning_rate": 1.0673537275383945e-07, "loss": 0.091, "step": 114980 }, { "epoch": 2.681157646235901, 
"grad_norm": 1.2639909982681274, "learning_rate": 1.0665765093577069e-07, "loss": 0.0958, "step": 114990 }, { "epoch": 2.681390807612719, "grad_norm": 1.4922070503234863, "learning_rate": 1.0657992911770191e-07, "loss": 0.1013, "step": 115000 }, { "epoch": 2.6816239689895367, "grad_norm": 1.7659647464752197, "learning_rate": 1.0650220729963315e-07, "loss": 0.1012, "step": 115010 }, { "epoch": 2.681857130366355, "grad_norm": 1.9564474821090698, "learning_rate": 1.0642448548156438e-07, "loss": 0.0976, "step": 115020 }, { "epoch": 2.682090291743173, "grad_norm": 3.3267242908477783, "learning_rate": 1.0634676366349562e-07, "loss": 0.1091, "step": 115030 }, { "epoch": 2.6823234531199907, "grad_norm": 1.6529783010482788, "learning_rate": 1.0626904184542684e-07, "loss": 0.1027, "step": 115040 }, { "epoch": 2.6825566144968085, "grad_norm": 1.3460512161254883, "learning_rate": 1.0619132002735808e-07, "loss": 0.1052, "step": 115050 }, { "epoch": 2.6827897758736263, "grad_norm": 4.545507907867432, "learning_rate": 1.0611359820928931e-07, "loss": 0.1128, "step": 115060 }, { "epoch": 2.6830229372504446, "grad_norm": 1.1387767791748047, "learning_rate": 1.0603587639122053e-07, "loss": 0.098, "step": 115070 }, { "epoch": 2.6832560986272624, "grad_norm": 2.121661424636841, "learning_rate": 1.0595815457315177e-07, "loss": 0.1007, "step": 115080 }, { "epoch": 2.6834892600040803, "grad_norm": 1.555941104888916, "learning_rate": 1.05880432755083e-07, "loss": 0.0947, "step": 115090 }, { "epoch": 2.683722421380898, "grad_norm": 2.599560499191284, "learning_rate": 1.0580271093701423e-07, "loss": 0.1014, "step": 115100 }, { "epoch": 2.683955582757716, "grad_norm": 1.2866541147232056, "learning_rate": 1.0572498911894546e-07, "loss": 0.1107, "step": 115110 }, { "epoch": 2.6841887441345342, "grad_norm": 1.470394253730774, "learning_rate": 1.056472673008767e-07, "loss": 0.1037, "step": 115120 }, { "epoch": 2.684421905511352, "grad_norm": 1.2809245586395264, "learning_rate": 
1.0556954548280792e-07, "loss": 0.1175, "step": 115130 }, { "epoch": 2.68465506688817, "grad_norm": 2.656090259552002, "learning_rate": 1.0549182366473916e-07, "loss": 0.1116, "step": 115140 }, { "epoch": 2.6848882282649877, "grad_norm": 1.5120246410369873, "learning_rate": 1.0541410184667039e-07, "loss": 0.0935, "step": 115150 }, { "epoch": 2.6851213896418056, "grad_norm": 1.9571444988250732, "learning_rate": 1.0533638002860163e-07, "loss": 0.1187, "step": 115160 }, { "epoch": 2.685354551018624, "grad_norm": 2.503018379211426, "learning_rate": 1.0525865821053285e-07, "loss": 0.1071, "step": 115170 }, { "epoch": 2.6855877123954417, "grad_norm": 1.0099071264266968, "learning_rate": 1.0518093639246409e-07, "loss": 0.1013, "step": 115180 }, { "epoch": 2.6858208737722595, "grad_norm": 1.8902924060821533, "learning_rate": 1.0510321457439532e-07, "loss": 0.096, "step": 115190 }, { "epoch": 2.686054035149078, "grad_norm": 1.4281065464019775, "learning_rate": 1.0502549275632655e-07, "loss": 0.1073, "step": 115200 }, { "epoch": 2.6862871965258956, "grad_norm": 1.924679160118103, "learning_rate": 1.0494777093825778e-07, "loss": 0.1004, "step": 115210 }, { "epoch": 2.6865203579027135, "grad_norm": 1.6247133016586304, "learning_rate": 1.0487004912018902e-07, "loss": 0.1055, "step": 115220 }, { "epoch": 2.6867535192795313, "grad_norm": 1.317758321762085, "learning_rate": 1.0479232730212024e-07, "loss": 0.0958, "step": 115230 }, { "epoch": 2.686986680656349, "grad_norm": 2.1112618446350098, "learning_rate": 1.0471460548405147e-07, "loss": 0.1096, "step": 115240 }, { "epoch": 2.6872198420331674, "grad_norm": 1.4622913599014282, "learning_rate": 1.0463688366598271e-07, "loss": 0.0979, "step": 115250 }, { "epoch": 2.6874530034099853, "grad_norm": 2.0772736072540283, "learning_rate": 1.0455916184791395e-07, "loss": 0.0891, "step": 115260 }, { "epoch": 2.687686164786803, "grad_norm": 1.137076497077942, "learning_rate": 1.0448144002984517e-07, "loss": 0.0969, "step": 115270 }, { 
"epoch": 2.687919326163621, "grad_norm": 2.1423614025115967, "learning_rate": 1.044037182117764e-07, "loss": 0.1046, "step": 115280 }, { "epoch": 2.688152487540439, "grad_norm": 1.9028820991516113, "learning_rate": 1.0432599639370764e-07, "loss": 0.0926, "step": 115290 }, { "epoch": 2.688385648917257, "grad_norm": 1.5011272430419922, "learning_rate": 1.0424827457563886e-07, "loss": 0.0955, "step": 115300 }, { "epoch": 2.688618810294075, "grad_norm": 2.895847797393799, "learning_rate": 1.041705527575701e-07, "loss": 0.1133, "step": 115310 }, { "epoch": 2.6888519716708927, "grad_norm": 1.9159984588623047, "learning_rate": 1.0409283093950133e-07, "loss": 0.109, "step": 115320 }, { "epoch": 2.6890851330477106, "grad_norm": 2.0785858631134033, "learning_rate": 1.0401510912143256e-07, "loss": 0.1055, "step": 115330 }, { "epoch": 2.6893182944245284, "grad_norm": 1.9122518301010132, "learning_rate": 1.0393738730336379e-07, "loss": 0.0973, "step": 115340 }, { "epoch": 2.6895514558013467, "grad_norm": 2.4806811809539795, "learning_rate": 1.0385966548529503e-07, "loss": 0.1097, "step": 115350 }, { "epoch": 2.6897846171781645, "grad_norm": 1.361956238746643, "learning_rate": 1.0378194366722626e-07, "loss": 0.0968, "step": 115360 }, { "epoch": 2.6900177785549824, "grad_norm": 1.400754690170288, "learning_rate": 1.0370422184915749e-07, "loss": 0.0949, "step": 115370 }, { "epoch": 2.6902509399318, "grad_norm": 1.7135387659072876, "learning_rate": 1.0362650003108872e-07, "loss": 0.1051, "step": 115380 }, { "epoch": 2.690484101308618, "grad_norm": 1.428924798965454, "learning_rate": 1.0354877821301996e-07, "loss": 0.1124, "step": 115390 }, { "epoch": 2.6907172626854363, "grad_norm": 1.7551506757736206, "learning_rate": 1.0347105639495118e-07, "loss": 0.1035, "step": 115400 }, { "epoch": 2.690950424062254, "grad_norm": 1.337466835975647, "learning_rate": 1.0339333457688242e-07, "loss": 0.0961, "step": 115410 }, { "epoch": 2.691183585439072, "grad_norm": 1.4701204299926758, 
"learning_rate": 1.0331561275881365e-07, "loss": 0.1089, "step": 115420 }, { "epoch": 2.69141674681589, "grad_norm": 1.481571912765503, "learning_rate": 1.0323789094074487e-07, "loss": 0.0968, "step": 115430 }, { "epoch": 2.6916499081927077, "grad_norm": 1.4632488489151, "learning_rate": 1.0316016912267611e-07, "loss": 0.1038, "step": 115440 }, { "epoch": 2.691883069569526, "grad_norm": 1.3344662189483643, "learning_rate": 1.0308244730460734e-07, "loss": 0.0901, "step": 115450 }, { "epoch": 2.692116230946344, "grad_norm": 1.4373842477798462, "learning_rate": 1.0300472548653858e-07, "loss": 0.1046, "step": 115460 }, { "epoch": 2.6923493923231616, "grad_norm": 1.6936943531036377, "learning_rate": 1.029270036684698e-07, "loss": 0.0867, "step": 115470 }, { "epoch": 2.6925825536999795, "grad_norm": 3.3455562591552734, "learning_rate": 1.0284928185040104e-07, "loss": 0.1116, "step": 115480 }, { "epoch": 2.6928157150767973, "grad_norm": 1.0576918125152588, "learning_rate": 1.0277156003233227e-07, "loss": 0.0924, "step": 115490 }, { "epoch": 2.6930488764536156, "grad_norm": 2.6729443073272705, "learning_rate": 1.026938382142635e-07, "loss": 0.0966, "step": 115500 }, { "epoch": 2.6932820378304334, "grad_norm": 2.2169408798217773, "learning_rate": 1.0261611639619473e-07, "loss": 0.0969, "step": 115510 }, { "epoch": 2.6935151992072512, "grad_norm": 2.9878931045532227, "learning_rate": 1.0253839457812597e-07, "loss": 0.0899, "step": 115520 }, { "epoch": 2.6937483605840695, "grad_norm": 1.4723280668258667, "learning_rate": 1.0246067276005719e-07, "loss": 0.1083, "step": 115530 }, { "epoch": 2.693981521960887, "grad_norm": 1.2894819974899292, "learning_rate": 1.0238295094198843e-07, "loss": 0.0917, "step": 115540 }, { "epoch": 2.694214683337705, "grad_norm": 1.8059628009796143, "learning_rate": 1.0230522912391966e-07, "loss": 0.0977, "step": 115550 }, { "epoch": 2.694447844714523, "grad_norm": 1.0587788820266724, "learning_rate": 1.022275073058509e-07, "loss": 0.0952, "step": 
115560 }, { "epoch": 2.694681006091341, "grad_norm": 1.327831506729126, "learning_rate": 1.0214978548778212e-07, "loss": 0.106, "step": 115570 }, { "epoch": 2.694914167468159, "grad_norm": 1.8276786804199219, "learning_rate": 1.0207206366971336e-07, "loss": 0.0886, "step": 115580 }, { "epoch": 2.695147328844977, "grad_norm": 3.044304132461548, "learning_rate": 1.0199434185164459e-07, "loss": 0.1085, "step": 115590 }, { "epoch": 2.695380490221795, "grad_norm": 1.5983214378356934, "learning_rate": 1.0191662003357581e-07, "loss": 0.1054, "step": 115600 }, { "epoch": 2.6956136515986127, "grad_norm": 1.508171796798706, "learning_rate": 1.0183889821550705e-07, "loss": 0.101, "step": 115610 }, { "epoch": 2.6958468129754305, "grad_norm": 1.5651113986968994, "learning_rate": 1.0176117639743828e-07, "loss": 0.1142, "step": 115620 }, { "epoch": 2.6960799743522488, "grad_norm": 1.4301831722259521, "learning_rate": 1.0168345457936951e-07, "loss": 0.1064, "step": 115630 }, { "epoch": 2.6963131357290666, "grad_norm": 2.7082481384277344, "learning_rate": 1.0160573276130074e-07, "loss": 0.1076, "step": 115640 }, { "epoch": 2.6965462971058844, "grad_norm": 2.158154249191284, "learning_rate": 1.0152801094323198e-07, "loss": 0.1038, "step": 115650 }, { "epoch": 2.6967794584827023, "grad_norm": 1.5644198656082153, "learning_rate": 1.014502891251632e-07, "loss": 0.1057, "step": 115660 }, { "epoch": 2.69701261985952, "grad_norm": 1.8655192852020264, "learning_rate": 1.0137256730709444e-07, "loss": 0.0984, "step": 115670 }, { "epoch": 2.6972457812363384, "grad_norm": 1.2009992599487305, "learning_rate": 1.0129484548902567e-07, "loss": 0.0962, "step": 115680 }, { "epoch": 2.6974789426131562, "grad_norm": 4.113626003265381, "learning_rate": 1.0121712367095691e-07, "loss": 0.1101, "step": 115690 }, { "epoch": 2.697712103989974, "grad_norm": 2.0702784061431885, "learning_rate": 1.0113940185288813e-07, "loss": 0.1079, "step": 115700 }, { "epoch": 2.697945265366792, "grad_norm": 
1.6089898347854614, "learning_rate": 1.0106168003481937e-07, "loss": 0.1003, "step": 115710 }, { "epoch": 2.6981784267436097, "grad_norm": 1.4662060737609863, "learning_rate": 1.009839582167506e-07, "loss": 0.1102, "step": 115720 }, { "epoch": 2.698411588120428, "grad_norm": 1.21469247341156, "learning_rate": 1.0090623639868182e-07, "loss": 0.0991, "step": 115730 }, { "epoch": 2.698644749497246, "grad_norm": 2.178823947906494, "learning_rate": 1.0082851458061306e-07, "loss": 0.1074, "step": 115740 }, { "epoch": 2.6988779108740637, "grad_norm": 1.7513597011566162, "learning_rate": 1.007507927625443e-07, "loss": 0.0997, "step": 115750 }, { "epoch": 2.6991110722508815, "grad_norm": 1.528531551361084, "learning_rate": 1.0067307094447552e-07, "loss": 0.1154, "step": 115760 }, { "epoch": 2.6993442336276994, "grad_norm": 3.384457588195801, "learning_rate": 1.0059534912640675e-07, "loss": 0.0981, "step": 115770 }, { "epoch": 2.6995773950045177, "grad_norm": 2.423207998275757, "learning_rate": 1.0051762730833799e-07, "loss": 0.1146, "step": 115780 }, { "epoch": 2.6998105563813355, "grad_norm": 1.7658857107162476, "learning_rate": 1.0043990549026923e-07, "loss": 0.1104, "step": 115790 }, { "epoch": 2.7000437177581533, "grad_norm": 1.4612587690353394, "learning_rate": 1.0036218367220045e-07, "loss": 0.0996, "step": 115800 }, { "epoch": 2.700276879134971, "grad_norm": 3.1302261352539062, "learning_rate": 1.0028446185413168e-07, "loss": 0.1006, "step": 115810 }, { "epoch": 2.700510040511789, "grad_norm": 1.134676456451416, "learning_rate": 1.0020674003606292e-07, "loss": 0.1053, "step": 115820 }, { "epoch": 2.7007432018886073, "grad_norm": 1.7701992988586426, "learning_rate": 1.0012901821799414e-07, "loss": 0.1059, "step": 115830 }, { "epoch": 2.700976363265425, "grad_norm": 1.508619785308838, "learning_rate": 1.0005129639992538e-07, "loss": 0.0984, "step": 115840 }, { "epoch": 2.701209524642243, "grad_norm": 1.8710784912109375, "learning_rate": 9.997357458185661e-08, "loss": 
0.0924, "step": 115850 }, { "epoch": 2.701442686019061, "grad_norm": 1.5393106937408447, "learning_rate": 9.989585276378784e-08, "loss": 0.1122, "step": 115860 }, { "epoch": 2.7016758473958786, "grad_norm": 1.2387490272521973, "learning_rate": 9.981813094571907e-08, "loss": 0.0894, "step": 115870 }, { "epoch": 2.701909008772697, "grad_norm": 1.3838118314743042, "learning_rate": 9.974040912765031e-08, "loss": 0.1005, "step": 115880 }, { "epoch": 2.7021421701495147, "grad_norm": 1.4396559000015259, "learning_rate": 9.966268730958154e-08, "loss": 0.0957, "step": 115890 }, { "epoch": 2.7023753315263326, "grad_norm": 1.2079097032546997, "learning_rate": 9.958496549151276e-08, "loss": 0.1008, "step": 115900 }, { "epoch": 2.702608492903151, "grad_norm": 1.0502667427062988, "learning_rate": 9.9507243673444e-08, "loss": 0.0943, "step": 115910 }, { "epoch": 2.7028416542799683, "grad_norm": 1.1418261528015137, "learning_rate": 9.942952185537524e-08, "loss": 0.0927, "step": 115920 }, { "epoch": 2.7030748156567865, "grad_norm": 1.5238237380981445, "learning_rate": 9.935180003730646e-08, "loss": 0.0911, "step": 115930 }, { "epoch": 2.7033079770336044, "grad_norm": 4.738273620605469, "learning_rate": 9.92740782192377e-08, "loss": 0.1141, "step": 115940 }, { "epoch": 2.703541138410422, "grad_norm": 1.7464208602905273, "learning_rate": 9.919635640116893e-08, "loss": 0.1173, "step": 115950 }, { "epoch": 2.7037742997872405, "grad_norm": 1.6110576391220093, "learning_rate": 9.911863458310015e-08, "loss": 0.0905, "step": 115960 }, { "epoch": 2.7040074611640583, "grad_norm": 1.666809320449829, "learning_rate": 9.904091276503139e-08, "loss": 0.1092, "step": 115970 }, { "epoch": 2.704240622540876, "grad_norm": 1.1948966979980469, "learning_rate": 9.896319094696262e-08, "loss": 0.0988, "step": 115980 }, { "epoch": 2.704473783917694, "grad_norm": 1.2923296689987183, "learning_rate": 9.888546912889386e-08, "loss": 0.1009, "step": 115990 }, { "epoch": 2.704706945294512, "grad_norm": 
1.6406742334365845, "learning_rate": 9.880774731082508e-08, "loss": 0.0989, "step": 116000 }, { "epoch": 2.70494010667133, "grad_norm": 1.4037526845932007, "learning_rate": 9.873002549275632e-08, "loss": 0.1066, "step": 116010 }, { "epoch": 2.705173268048148, "grad_norm": 1.3587695360183716, "learning_rate": 9.865230367468755e-08, "loss": 0.1056, "step": 116020 }, { "epoch": 2.705406429424966, "grad_norm": 1.4646127223968506, "learning_rate": 9.857458185661878e-08, "loss": 0.1077, "step": 116030 }, { "epoch": 2.7056395908017836, "grad_norm": 2.470629930496216, "learning_rate": 9.849686003855001e-08, "loss": 0.097, "step": 116040 }, { "epoch": 2.7058727521786015, "grad_norm": 1.0709030628204346, "learning_rate": 9.841913822048125e-08, "loss": 0.1043, "step": 116050 }, { "epoch": 2.7061059135554197, "grad_norm": 2.718069314956665, "learning_rate": 9.834141640241247e-08, "loss": 0.1081, "step": 116060 }, { "epoch": 2.7063390749322376, "grad_norm": 2.677733898162842, "learning_rate": 9.82636945843437e-08, "loss": 0.0984, "step": 116070 }, { "epoch": 2.7065722363090554, "grad_norm": 1.3968071937561035, "learning_rate": 9.818597276627494e-08, "loss": 0.1056, "step": 116080 }, { "epoch": 2.7068053976858732, "grad_norm": 1.187321662902832, "learning_rate": 9.810825094820616e-08, "loss": 0.0995, "step": 116090 }, { "epoch": 2.707038559062691, "grad_norm": 1.3425283432006836, "learning_rate": 9.80305291301374e-08, "loss": 0.1096, "step": 116100 }, { "epoch": 2.7072717204395094, "grad_norm": 1.4075777530670166, "learning_rate": 9.795280731206863e-08, "loss": 0.0911, "step": 116110 }, { "epoch": 2.707504881816327, "grad_norm": 1.1647639274597168, "learning_rate": 9.787508549399987e-08, "loss": 0.1012, "step": 116120 }, { "epoch": 2.707738043193145, "grad_norm": 1.79256010055542, "learning_rate": 9.779736367593109e-08, "loss": 0.0995, "step": 116130 }, { "epoch": 2.707971204569963, "grad_norm": 1.3164281845092773, "learning_rate": 9.771964185786233e-08, "loss": 0.1054, "step": 
116140 }, { "epoch": 2.7082043659467807, "grad_norm": 1.3350207805633545, "learning_rate": 9.764192003979356e-08, "loss": 0.0998, "step": 116150 }, { "epoch": 2.708437527323599, "grad_norm": 1.5612167119979858, "learning_rate": 9.756419822172479e-08, "loss": 0.0998, "step": 116160 }, { "epoch": 2.708670688700417, "grad_norm": 1.6453778743743896, "learning_rate": 9.748647640365602e-08, "loss": 0.1116, "step": 116170 }, { "epoch": 2.7089038500772347, "grad_norm": 1.2152683734893799, "learning_rate": 9.740875458558726e-08, "loss": 0.099, "step": 116180 }, { "epoch": 2.7091370114540525, "grad_norm": 1.2282626628875732, "learning_rate": 9.733103276751848e-08, "loss": 0.0904, "step": 116190 }, { "epoch": 2.7093701728308703, "grad_norm": 1.505347490310669, "learning_rate": 9.725331094944972e-08, "loss": 0.108, "step": 116200 }, { "epoch": 2.7096033342076886, "grad_norm": 1.7261037826538086, "learning_rate": 9.717558913138095e-08, "loss": 0.0968, "step": 116210 }, { "epoch": 2.7098364955845065, "grad_norm": 1.1936843395233154, "learning_rate": 9.709786731331219e-08, "loss": 0.1005, "step": 116220 }, { "epoch": 2.7100696569613243, "grad_norm": 1.9464473724365234, "learning_rate": 9.702014549524341e-08, "loss": 0.1015, "step": 116230 }, { "epoch": 2.710302818338142, "grad_norm": 2.4731180667877197, "learning_rate": 9.694242367717466e-08, "loss": 0.0961, "step": 116240 }, { "epoch": 2.71053597971496, "grad_norm": 1.4866541624069214, "learning_rate": 9.68647018591059e-08, "loss": 0.0972, "step": 116250 }, { "epoch": 2.7107691410917782, "grad_norm": 3.0319106578826904, "learning_rate": 9.678698004103712e-08, "loss": 0.1056, "step": 116260 }, { "epoch": 2.711002302468596, "grad_norm": 1.8753842115402222, "learning_rate": 9.670925822296835e-08, "loss": 0.1053, "step": 116270 }, { "epoch": 2.711235463845414, "grad_norm": 2.620272159576416, "learning_rate": 9.663153640489959e-08, "loss": 0.0943, "step": 116280 }, { "epoch": 2.711468625222232, "grad_norm": 1.9019349813461304, 
"learning_rate": 9.655381458683081e-08, "loss": 0.1001, "step": 116290 }, { "epoch": 2.7117017865990496, "grad_norm": 1.9719574451446533, "learning_rate": 9.647609276876205e-08, "loss": 0.1063, "step": 116300 }, { "epoch": 2.711934947975868, "grad_norm": 1.586198091506958, "learning_rate": 9.639837095069328e-08, "loss": 0.0994, "step": 116310 }, { "epoch": 2.7121681093526857, "grad_norm": 1.8814806938171387, "learning_rate": 9.632064913262452e-08, "loss": 0.1036, "step": 116320 }, { "epoch": 2.7124012707295035, "grad_norm": 1.144533634185791, "learning_rate": 9.624292731455574e-08, "loss": 0.1059, "step": 116330 }, { "epoch": 2.712634432106322, "grad_norm": 1.8825901746749878, "learning_rate": 9.616520549648698e-08, "loss": 0.1012, "step": 116340 }, { "epoch": 2.7128675934831397, "grad_norm": 1.7095372676849365, "learning_rate": 9.608748367841821e-08, "loss": 0.1099, "step": 116350 }, { "epoch": 2.7131007548599575, "grad_norm": 2.083179473876953, "learning_rate": 9.600976186034943e-08, "loss": 0.107, "step": 116360 }, { "epoch": 2.7133339162367753, "grad_norm": 2.200047492980957, "learning_rate": 9.593204004228067e-08, "loss": 0.0981, "step": 116370 }, { "epoch": 2.713567077613593, "grad_norm": 2.5208849906921387, "learning_rate": 9.58543182242119e-08, "loss": 0.1056, "step": 116380 }, { "epoch": 2.7138002389904115, "grad_norm": 2.121511459350586, "learning_rate": 9.577659640614313e-08, "loss": 0.1033, "step": 116390 }, { "epoch": 2.7140334003672293, "grad_norm": 2.5974903106689453, "learning_rate": 9.569887458807436e-08, "loss": 0.1091, "step": 116400 }, { "epoch": 2.714266561744047, "grad_norm": 2.1130759716033936, "learning_rate": 9.56211527700056e-08, "loss": 0.1013, "step": 116410 }, { "epoch": 2.714499723120865, "grad_norm": 2.1584794521331787, "learning_rate": 9.554343095193683e-08, "loss": 0.1063, "step": 116420 }, { "epoch": 2.714732884497683, "grad_norm": 2.692927837371826, "learning_rate": 9.546570913386806e-08, "loss": 0.1147, "step": 116430 }, { 
"epoch": 2.714966045874501, "grad_norm": 1.1682968139648438, "learning_rate": 9.538798731579929e-08, "loss": 0.0951, "step": 116440 }, { "epoch": 2.715199207251319, "grad_norm": 1.1385533809661865, "learning_rate": 9.531026549773053e-08, "loss": 0.1008, "step": 116450 }, { "epoch": 2.7154323686281368, "grad_norm": 1.661484956741333, "learning_rate": 9.523254367966175e-08, "loss": 0.1, "step": 116460 }, { "epoch": 2.7156655300049546, "grad_norm": 3.32328724861145, "learning_rate": 9.515482186159299e-08, "loss": 0.1053, "step": 116470 }, { "epoch": 2.7158986913817724, "grad_norm": 1.1169863939285278, "learning_rate": 9.507710004352422e-08, "loss": 0.0989, "step": 116480 }, { "epoch": 2.7161318527585907, "grad_norm": 1.9383223056793213, "learning_rate": 9.499937822545545e-08, "loss": 0.1093, "step": 116490 }, { "epoch": 2.7163650141354085, "grad_norm": 2.3644044399261475, "learning_rate": 9.492165640738668e-08, "loss": 0.1091, "step": 116500 }, { "epoch": 2.7165981755122264, "grad_norm": 3.8820807933807373, "learning_rate": 9.484393458931792e-08, "loss": 0.1125, "step": 116510 }, { "epoch": 2.716831336889044, "grad_norm": 2.8022847175598145, "learning_rate": 9.476621277124915e-08, "loss": 0.1048, "step": 116520 }, { "epoch": 2.717064498265862, "grad_norm": 1.8599321842193604, "learning_rate": 9.468849095318037e-08, "loss": 0.1023, "step": 116530 }, { "epoch": 2.7172976596426803, "grad_norm": 1.1930338144302368, "learning_rate": 9.461076913511161e-08, "loss": 0.1049, "step": 116540 }, { "epoch": 2.717530821019498, "grad_norm": 1.5175557136535645, "learning_rate": 9.453304731704285e-08, "loss": 0.1134, "step": 116550 }, { "epoch": 2.717763982396316, "grad_norm": 1.2600799798965454, "learning_rate": 9.445532549897407e-08, "loss": 0.0954, "step": 116560 }, { "epoch": 2.717997143773134, "grad_norm": 1.5117957592010498, "learning_rate": 9.43776036809053e-08, "loss": 0.1079, "step": 116570 }, { "epoch": 2.7182303051499517, "grad_norm": 1.0466336011886597, "learning_rate": 
9.429988186283654e-08, "loss": 0.1043, "step": 116580 }, { "epoch": 2.71846346652677, "grad_norm": 2.4308829307556152, "learning_rate": 9.422216004476776e-08, "loss": 0.1083, "step": 116590 }, { "epoch": 2.718696627903588, "grad_norm": 2.881804943084717, "learning_rate": 9.4144438226699e-08, "loss": 0.1034, "step": 116600 }, { "epoch": 2.7189297892804056, "grad_norm": 1.5967193841934204, "learning_rate": 9.406671640863023e-08, "loss": 0.1025, "step": 116610 }, { "epoch": 2.7191629506572235, "grad_norm": 2.476684093475342, "learning_rate": 9.398899459056147e-08, "loss": 0.1131, "step": 116620 }, { "epoch": 2.7193961120340413, "grad_norm": 1.53436279296875, "learning_rate": 9.391127277249269e-08, "loss": 0.0921, "step": 116630 }, { "epoch": 2.7196292734108596, "grad_norm": 2.5355606079101562, "learning_rate": 9.383355095442393e-08, "loss": 0.1121, "step": 116640 }, { "epoch": 2.7198624347876774, "grad_norm": 1.7573145627975464, "learning_rate": 9.375582913635516e-08, "loss": 0.091, "step": 116650 }, { "epoch": 2.7200955961644953, "grad_norm": 1.248391032218933, "learning_rate": 9.367810731828639e-08, "loss": 0.1008, "step": 116660 }, { "epoch": 2.7203287575413135, "grad_norm": 2.006657123565674, "learning_rate": 9.360038550021762e-08, "loss": 0.1056, "step": 116670 }, { "epoch": 2.7205619189181314, "grad_norm": 1.988938808441162, "learning_rate": 9.352266368214886e-08, "loss": 0.1024, "step": 116680 }, { "epoch": 2.720795080294949, "grad_norm": 2.6984658241271973, "learning_rate": 9.344494186408008e-08, "loss": 0.0991, "step": 116690 }, { "epoch": 2.721028241671767, "grad_norm": 1.1426146030426025, "learning_rate": 9.336722004601131e-08, "loss": 0.0987, "step": 116700 }, { "epoch": 2.721261403048585, "grad_norm": 1.5600470304489136, "learning_rate": 9.328949822794255e-08, "loss": 0.1013, "step": 116710 }, { "epoch": 2.721494564425403, "grad_norm": 2.029031753540039, "learning_rate": 9.321177640987377e-08, "loss": 0.0986, "step": 116720 }, { "epoch": 
2.721727725802221, "grad_norm": 2.1665172576904297, "learning_rate": 9.313405459180501e-08, "loss": 0.1117, "step": 116730 }, { "epoch": 2.721960887179039, "grad_norm": 1.5273339748382568, "learning_rate": 9.305633277373624e-08, "loss": 0.1012, "step": 116740 }, { "epoch": 2.7221940485558567, "grad_norm": 1.1971523761749268, "learning_rate": 9.297861095566748e-08, "loss": 0.1039, "step": 116750 }, { "epoch": 2.7224272099326745, "grad_norm": 1.4817038774490356, "learning_rate": 9.29008891375987e-08, "loss": 0.1024, "step": 116760 }, { "epoch": 2.722660371309493, "grad_norm": 1.3426522016525269, "learning_rate": 9.282316731952994e-08, "loss": 0.1059, "step": 116770 }, { "epoch": 2.7228935326863106, "grad_norm": 1.8533340692520142, "learning_rate": 9.274544550146117e-08, "loss": 0.112, "step": 116780 }, { "epoch": 2.7231266940631285, "grad_norm": 1.209120512008667, "learning_rate": 9.26677236833924e-08, "loss": 0.1089, "step": 116790 }, { "epoch": 2.7233598554399463, "grad_norm": 1.31103515625, "learning_rate": 9.259000186532363e-08, "loss": 0.1003, "step": 116800 }, { "epoch": 2.723593016816764, "grad_norm": 1.0293736457824707, "learning_rate": 9.251228004725487e-08, "loss": 0.103, "step": 116810 }, { "epoch": 2.7238261781935824, "grad_norm": 1.444524884223938, "learning_rate": 9.243455822918609e-08, "loss": 0.1214, "step": 116820 }, { "epoch": 2.7240593395704003, "grad_norm": 1.4681665897369385, "learning_rate": 9.235683641111733e-08, "loss": 0.1066, "step": 116830 }, { "epoch": 2.724292500947218, "grad_norm": 2.4219248294830322, "learning_rate": 9.227911459304856e-08, "loss": 0.1093, "step": 116840 }, { "epoch": 2.724525662324036, "grad_norm": 1.096635103225708, "learning_rate": 9.22013927749798e-08, "loss": 0.1078, "step": 116850 }, { "epoch": 2.7247588237008538, "grad_norm": 1.3034820556640625, "learning_rate": 9.212367095691102e-08, "loss": 0.0973, "step": 116860 }, { "epoch": 2.724991985077672, "grad_norm": 1.978171944618225, "learning_rate": 
9.204594913884226e-08, "loss": 0.1034, "step": 116870 }, { "epoch": 2.72522514645449, "grad_norm": 1.6005244255065918, "learning_rate": 9.196822732077349e-08, "loss": 0.1155, "step": 116880 }, { "epoch": 2.7254583078313077, "grad_norm": 1.7613064050674438, "learning_rate": 9.189050550270471e-08, "loss": 0.105, "step": 116890 }, { "epoch": 2.7256914692081256, "grad_norm": 1.183750867843628, "learning_rate": 9.181278368463595e-08, "loss": 0.1011, "step": 116900 }, { "epoch": 2.7259246305849434, "grad_norm": 2.210158348083496, "learning_rate": 9.173506186656718e-08, "loss": 0.1009, "step": 116910 }, { "epoch": 2.7261577919617617, "grad_norm": 1.7548959255218506, "learning_rate": 9.165734004849841e-08, "loss": 0.1072, "step": 116920 }, { "epoch": 2.7263909533385795, "grad_norm": 1.568292498588562, "learning_rate": 9.157961823042964e-08, "loss": 0.0939, "step": 116930 }, { "epoch": 2.7266241147153973, "grad_norm": 1.040225863456726, "learning_rate": 9.150189641236088e-08, "loss": 0.1005, "step": 116940 }, { "epoch": 2.726857276092215, "grad_norm": 1.44286048412323, "learning_rate": 9.142417459429211e-08, "loss": 0.1114, "step": 116950 }, { "epoch": 2.727090437469033, "grad_norm": 1.399444341659546, "learning_rate": 9.134645277622334e-08, "loss": 0.1055, "step": 116960 }, { "epoch": 2.7273235988458513, "grad_norm": 1.2554503679275513, "learning_rate": 9.126873095815457e-08, "loss": 0.1091, "step": 116970 }, { "epoch": 2.727556760222669, "grad_norm": 1.1003665924072266, "learning_rate": 9.119100914008581e-08, "loss": 0.0991, "step": 116980 }, { "epoch": 2.727789921599487, "grad_norm": 1.4886912107467651, "learning_rate": 9.111328732201703e-08, "loss": 0.101, "step": 116990 }, { "epoch": 2.7280230829763052, "grad_norm": 2.335259199142456, "learning_rate": 9.103556550394827e-08, "loss": 0.1104, "step": 117000 }, { "epoch": 2.7282562443531226, "grad_norm": 1.1863139867782593, "learning_rate": 9.09578436858795e-08, "loss": 0.1041, "step": 117010 }, { "epoch": 
2.728489405729941, "grad_norm": 2.79545259475708, "learning_rate": 9.088012186781072e-08, "loss": 0.0862, "step": 117020 }, { "epoch": 2.7287225671067588, "grad_norm": 1.5281267166137695, "learning_rate": 9.080240004974196e-08, "loss": 0.0958, "step": 117030 }, { "epoch": 2.7289557284835766, "grad_norm": 1.1324957609176636, "learning_rate": 9.07246782316732e-08, "loss": 0.0965, "step": 117040 }, { "epoch": 2.729188889860395, "grad_norm": 2.9101150035858154, "learning_rate": 9.064695641360443e-08, "loss": 0.1063, "step": 117050 }, { "epoch": 2.7294220512372127, "grad_norm": 1.206672191619873, "learning_rate": 9.056923459553565e-08, "loss": 0.1022, "step": 117060 }, { "epoch": 2.7296552126140305, "grad_norm": 1.0621839761734009, "learning_rate": 9.049151277746689e-08, "loss": 0.0915, "step": 117070 }, { "epoch": 2.7298883739908484, "grad_norm": 1.9077378511428833, "learning_rate": 9.041379095939813e-08, "loss": 0.1103, "step": 117080 }, { "epoch": 2.730121535367666, "grad_norm": 1.4973983764648438, "learning_rate": 9.033606914132935e-08, "loss": 0.0957, "step": 117090 }, { "epoch": 2.7303546967444845, "grad_norm": 2.620654344558716, "learning_rate": 9.025834732326058e-08, "loss": 0.1083, "step": 117100 }, { "epoch": 2.7305878581213023, "grad_norm": 1.4439918994903564, "learning_rate": 9.018062550519182e-08, "loss": 0.1089, "step": 117110 }, { "epoch": 2.73082101949812, "grad_norm": 1.1628292798995972, "learning_rate": 9.010290368712304e-08, "loss": 0.1047, "step": 117120 }, { "epoch": 2.731054180874938, "grad_norm": 2.1179659366607666, "learning_rate": 9.002518186905428e-08, "loss": 0.1087, "step": 117130 }, { "epoch": 2.731287342251756, "grad_norm": 1.1830686330795288, "learning_rate": 8.994746005098551e-08, "loss": 0.0967, "step": 117140 }, { "epoch": 2.731520503628574, "grad_norm": 3.17484712600708, "learning_rate": 8.986973823291675e-08, "loss": 0.1059, "step": 117150 }, { "epoch": 2.731753665005392, "grad_norm": 2.544031858444214, "learning_rate": 
8.979201641484797e-08, "loss": 0.0994, "step": 117160 }, { "epoch": 2.73198682638221, "grad_norm": 1.3864498138427734, "learning_rate": 8.971429459677921e-08, "loss": 0.0948, "step": 117170 }, { "epoch": 2.7322199877590276, "grad_norm": 2.0742177963256836, "learning_rate": 8.964434496051732e-08, "loss": 0.0984, "step": 117180 }, { "epoch": 2.7324531491358455, "grad_norm": 1.9564505815505981, "learning_rate": 8.956662314244854e-08, "loss": 0.0901, "step": 117190 }, { "epoch": 2.7326863105126638, "grad_norm": 1.1691645383834839, "learning_rate": 8.948890132437978e-08, "loss": 0.1063, "step": 117200 }, { "epoch": 2.7329194718894816, "grad_norm": 2.546367883682251, "learning_rate": 8.941117950631101e-08, "loss": 0.1064, "step": 117210 }, { "epoch": 2.7331526332662994, "grad_norm": 1.2570745944976807, "learning_rate": 8.933345768824225e-08, "loss": 0.0942, "step": 117220 }, { "epoch": 2.7333857946431173, "grad_norm": 1.0471768379211426, "learning_rate": 8.925573587017347e-08, "loss": 0.0962, "step": 117230 }, { "epoch": 2.733618956019935, "grad_norm": 1.485144019126892, "learning_rate": 8.91780140521047e-08, "loss": 0.1051, "step": 117240 }, { "epoch": 2.7338521173967534, "grad_norm": 1.6654928922653198, "learning_rate": 8.910029223403594e-08, "loss": 0.1083, "step": 117250 }, { "epoch": 2.734085278773571, "grad_norm": 1.7474205493927002, "learning_rate": 8.902257041596716e-08, "loss": 0.1008, "step": 117260 }, { "epoch": 2.734318440150389, "grad_norm": 1.6271342039108276, "learning_rate": 8.89448485978984e-08, "loss": 0.1154, "step": 117270 }, { "epoch": 2.734551601527207, "grad_norm": 1.7401084899902344, "learning_rate": 8.886712677982964e-08, "loss": 0.1051, "step": 117280 }, { "epoch": 2.7347847629040247, "grad_norm": 1.4086666107177734, "learning_rate": 8.878940496176086e-08, "loss": 0.0981, "step": 117290 }, { "epoch": 2.735017924280843, "grad_norm": 0.9472995400428772, "learning_rate": 8.87116831436921e-08, "loss": 0.0999, "step": 117300 }, { "epoch": 
2.735251085657661, "grad_norm": 1.4173541069030762, "learning_rate": 8.863396132562333e-08, "loss": 0.0987, "step": 117310 }, { "epoch": 2.7354842470344787, "grad_norm": 1.1096930503845215, "learning_rate": 8.855623950755457e-08, "loss": 0.0983, "step": 117320 }, { "epoch": 2.7357174084112965, "grad_norm": 1.2478140592575073, "learning_rate": 8.847851768948579e-08, "loss": 0.0937, "step": 117330 }, { "epoch": 2.7359505697881144, "grad_norm": 3.240661144256592, "learning_rate": 8.840079587141702e-08, "loss": 0.1144, "step": 117340 }, { "epoch": 2.7361837311649326, "grad_norm": 1.2745466232299805, "learning_rate": 8.832307405334826e-08, "loss": 0.0978, "step": 117350 }, { "epoch": 2.7364168925417505, "grad_norm": 2.916372060775757, "learning_rate": 8.824535223527948e-08, "loss": 0.0999, "step": 117360 }, { "epoch": 2.7366500539185683, "grad_norm": 1.8719037771224976, "learning_rate": 8.816763041721072e-08, "loss": 0.0874, "step": 117370 }, { "epoch": 2.7368832152953866, "grad_norm": 2.0974016189575195, "learning_rate": 8.808990859914195e-08, "loss": 0.1049, "step": 117380 }, { "epoch": 2.737116376672204, "grad_norm": 1.497638463973999, "learning_rate": 8.801218678107318e-08, "loss": 0.1001, "step": 117390 }, { "epoch": 2.7373495380490223, "grad_norm": 1.527301549911499, "learning_rate": 8.793446496300441e-08, "loss": 0.1076, "step": 117400 }, { "epoch": 2.73758269942584, "grad_norm": 1.7448515892028809, "learning_rate": 8.785674314493565e-08, "loss": 0.1027, "step": 117410 }, { "epoch": 2.737815860802658, "grad_norm": 2.3344945907592773, "learning_rate": 8.777902132686687e-08, "loss": 0.1096, "step": 117420 }, { "epoch": 2.738049022179476, "grad_norm": 1.4971126317977905, "learning_rate": 8.77012995087981e-08, "loss": 0.1141, "step": 117430 }, { "epoch": 2.738282183556294, "grad_norm": 1.2951750755310059, "learning_rate": 8.762357769072934e-08, "loss": 0.1103, "step": 117440 }, { "epoch": 2.738515344933112, "grad_norm": 2.204387903213501, "learning_rate": 
8.754585587266058e-08, "loss": 0.1116, "step": 117450 }, { "epoch": 2.7387485063099297, "grad_norm": 1.3754055500030518, "learning_rate": 8.74681340545918e-08, "loss": 0.1017, "step": 117460 }, { "epoch": 2.7389816676867476, "grad_norm": 1.3651481866836548, "learning_rate": 8.739041223652303e-08, "loss": 0.1034, "step": 117470 }, { "epoch": 2.739214829063566, "grad_norm": 2.765547513961792, "learning_rate": 8.731269041845427e-08, "loss": 0.1138, "step": 117480 }, { "epoch": 2.7394479904403837, "grad_norm": 1.8672646284103394, "learning_rate": 8.723496860038549e-08, "loss": 0.1028, "step": 117490 }, { "epoch": 2.7396811518172015, "grad_norm": 1.3129229545593262, "learning_rate": 8.715724678231673e-08, "loss": 0.1081, "step": 117500 }, { "epoch": 2.7399143131940193, "grad_norm": 1.7306344509124756, "learning_rate": 8.707952496424796e-08, "loss": 0.1041, "step": 117510 }, { "epoch": 2.740147474570837, "grad_norm": 1.2127655744552612, "learning_rate": 8.700180314617919e-08, "loss": 0.0963, "step": 117520 }, { "epoch": 2.7403806359476555, "grad_norm": 2.098883628845215, "learning_rate": 8.692408132811042e-08, "loss": 0.0964, "step": 117530 }, { "epoch": 2.7406137973244733, "grad_norm": 1.3831456899642944, "learning_rate": 8.684635951004166e-08, "loss": 0.1049, "step": 117540 }, { "epoch": 2.740846958701291, "grad_norm": 1.4610024690628052, "learning_rate": 8.676863769197289e-08, "loss": 0.0965, "step": 117550 }, { "epoch": 2.741080120078109, "grad_norm": 1.17869234085083, "learning_rate": 8.669091587390412e-08, "loss": 0.1057, "step": 117560 }, { "epoch": 2.741313281454927, "grad_norm": 2.309690237045288, "learning_rate": 8.661319405583535e-08, "loss": 0.0942, "step": 117570 }, { "epoch": 2.741546442831745, "grad_norm": 1.3884700536727905, "learning_rate": 8.653547223776659e-08, "loss": 0.1155, "step": 117580 }, { "epoch": 2.741779604208563, "grad_norm": 1.8016555309295654, "learning_rate": 8.645775041969781e-08, "loss": 0.0985, "step": 117590 }, { "epoch": 
2.7420127655853808, "grad_norm": 1.2204136848449707, "learning_rate": 8.638002860162905e-08, "loss": 0.1141, "step": 117600 }, { "epoch": 2.7422459269621986, "grad_norm": 2.31526255607605, "learning_rate": 8.630230678356028e-08, "loss": 0.1119, "step": 117610 }, { "epoch": 2.7424790883390164, "grad_norm": 1.1854066848754883, "learning_rate": 8.62245849654915e-08, "loss": 0.1188, "step": 117620 }, { "epoch": 2.7427122497158347, "grad_norm": 1.548926830291748, "learning_rate": 8.614686314742274e-08, "loss": 0.0913, "step": 117630 }, { "epoch": 2.7429454110926526, "grad_norm": 1.5921413898468018, "learning_rate": 8.606914132935397e-08, "loss": 0.1154, "step": 117640 }, { "epoch": 2.7431785724694704, "grad_norm": 1.1218712329864502, "learning_rate": 8.599141951128521e-08, "loss": 0.1002, "step": 117650 }, { "epoch": 2.7434117338462882, "grad_norm": 1.6047875881195068, "learning_rate": 8.591369769321643e-08, "loss": 0.1044, "step": 117660 }, { "epoch": 2.743644895223106, "grad_norm": 1.3074122667312622, "learning_rate": 8.583597587514767e-08, "loss": 0.0998, "step": 117670 }, { "epoch": 2.7438780565999243, "grad_norm": 1.0265343189239502, "learning_rate": 8.57582540570789e-08, "loss": 0.0963, "step": 117680 }, { "epoch": 2.744111217976742, "grad_norm": 1.309549331665039, "learning_rate": 8.568053223901013e-08, "loss": 0.1082, "step": 117690 }, { "epoch": 2.74434437935356, "grad_norm": 1.233054280281067, "learning_rate": 8.560281042094136e-08, "loss": 0.1022, "step": 117700 }, { "epoch": 2.744577540730378, "grad_norm": 3.010551691055298, "learning_rate": 8.55250886028726e-08, "loss": 0.0944, "step": 117710 }, { "epoch": 2.7448107021071957, "grad_norm": 1.2513649463653564, "learning_rate": 8.544736678480382e-08, "loss": 0.113, "step": 117720 }, { "epoch": 2.745043863484014, "grad_norm": 1.849233627319336, "learning_rate": 8.536964496673506e-08, "loss": 0.1028, "step": 117730 }, { "epoch": 2.745277024860832, "grad_norm": 2.2037436962127686, "learning_rate": 
8.529192314866629e-08, "loss": 0.1051, "step": 117740 }, { "epoch": 2.7455101862376496, "grad_norm": 1.642960786819458, "learning_rate": 8.521420133059753e-08, "loss": 0.1033, "step": 117750 }, { "epoch": 2.745743347614468, "grad_norm": 1.155003309249878, "learning_rate": 8.513647951252875e-08, "loss": 0.0938, "step": 117760 }, { "epoch": 2.7459765089912858, "grad_norm": 1.5233173370361328, "learning_rate": 8.505875769445999e-08, "loss": 0.1151, "step": 117770 }, { "epoch": 2.7462096703681036, "grad_norm": 3.1369924545288086, "learning_rate": 8.498103587639122e-08, "loss": 0.1019, "step": 117780 }, { "epoch": 2.7464428317449214, "grad_norm": 1.4197641611099243, "learning_rate": 8.490331405832244e-08, "loss": 0.0974, "step": 117790 }, { "epoch": 2.7466759931217393, "grad_norm": 1.3413090705871582, "learning_rate": 8.482559224025368e-08, "loss": 0.1062, "step": 117800 }, { "epoch": 2.7469091544985575, "grad_norm": 1.742334008216858, "learning_rate": 8.474787042218492e-08, "loss": 0.0991, "step": 117810 }, { "epoch": 2.7471423158753754, "grad_norm": 3.6177406311035156, "learning_rate": 8.467014860411614e-08, "loss": 0.1094, "step": 117820 }, { "epoch": 2.747375477252193, "grad_norm": 1.7452949285507202, "learning_rate": 8.459242678604737e-08, "loss": 0.1067, "step": 117830 }, { "epoch": 2.747608638629011, "grad_norm": 1.136313557624817, "learning_rate": 8.451470496797861e-08, "loss": 0.0926, "step": 117840 }, { "epoch": 2.747841800005829, "grad_norm": 1.82169771194458, "learning_rate": 8.443698314990983e-08, "loss": 0.0899, "step": 117850 }, { "epoch": 2.748074961382647, "grad_norm": 1.2281558513641357, "learning_rate": 8.435926133184107e-08, "loss": 0.1111, "step": 117860 }, { "epoch": 2.748308122759465, "grad_norm": 1.3311502933502197, "learning_rate": 8.42815395137723e-08, "loss": 0.1002, "step": 117870 }, { "epoch": 2.748541284136283, "grad_norm": 5.038990020751953, "learning_rate": 8.420381769570354e-08, "loss": 0.1046, "step": 117880 }, { "epoch": 
2.7487744455131007, "grad_norm": 2.5975918769836426, "learning_rate": 8.412609587763476e-08, "loss": 0.0985, "step": 117890 }, { "epoch": 2.7490076068899185, "grad_norm": 1.342675805091858, "learning_rate": 8.4048374059566e-08, "loss": 0.1016, "step": 117900 }, { "epoch": 2.749240768266737, "grad_norm": 1.110643744468689, "learning_rate": 8.397065224149723e-08, "loss": 0.0917, "step": 117910 }, { "epoch": 2.7494739296435546, "grad_norm": 2.541379690170288, "learning_rate": 8.389293042342845e-08, "loss": 0.1046, "step": 117920 }, { "epoch": 2.7497070910203725, "grad_norm": 1.2817473411560059, "learning_rate": 8.381520860535969e-08, "loss": 0.1063, "step": 117930 }, { "epoch": 2.7499402523971903, "grad_norm": 0.9488675594329834, "learning_rate": 8.373748678729093e-08, "loss": 0.0994, "step": 117940 }, { "epoch": 2.750173413774008, "grad_norm": 1.3944803476333618, "learning_rate": 8.365976496922215e-08, "loss": 0.1001, "step": 117950 }, { "epoch": 2.7504065751508264, "grad_norm": 1.2269271612167358, "learning_rate": 8.358204315115338e-08, "loss": 0.0986, "step": 117960 }, { "epoch": 2.7506397365276443, "grad_norm": 1.2572602033615112, "learning_rate": 8.350432133308462e-08, "loss": 0.1002, "step": 117970 }, { "epoch": 2.750872897904462, "grad_norm": 1.2087548971176147, "learning_rate": 8.342659951501586e-08, "loss": 0.0998, "step": 117980 }, { "epoch": 2.75110605928128, "grad_norm": 1.6762734651565552, "learning_rate": 8.334887769694708e-08, "loss": 0.0969, "step": 117990 }, { "epoch": 2.7513392206580978, "grad_norm": 3.2268402576446533, "learning_rate": 8.327115587887831e-08, "loss": 0.0963, "step": 118000 }, { "epoch": 2.751572382034916, "grad_norm": 1.2244786024093628, "learning_rate": 8.319343406080955e-08, "loss": 0.115, "step": 118010 }, { "epoch": 2.751805543411734, "grad_norm": 2.7149715423583984, "learning_rate": 8.311571224274077e-08, "loss": 0.0998, "step": 118020 }, { "epoch": 2.7520387047885517, "grad_norm": 1.9712806940078735, "learning_rate": 
8.303799042467201e-08, "loss": 0.1104, "step": 118030 }, { "epoch": 2.7522718661653696, "grad_norm": 1.5495805740356445, "learning_rate": 8.296026860660324e-08, "loss": 0.1057, "step": 118040 }, { "epoch": 2.7525050275421874, "grad_norm": 1.105318546295166, "learning_rate": 8.288254678853447e-08, "loss": 0.1027, "step": 118050 }, { "epoch": 2.7527381889190057, "grad_norm": 1.3606221675872803, "learning_rate": 8.28048249704657e-08, "loss": 0.0987, "step": 118060 }, { "epoch": 2.7529713502958235, "grad_norm": 1.8305927515029907, "learning_rate": 8.272710315239694e-08, "loss": 0.1009, "step": 118070 }, { "epoch": 2.7532045116726414, "grad_norm": 1.3263142108917236, "learning_rate": 8.264938133432817e-08, "loss": 0.0999, "step": 118080 }, { "epoch": 2.7534376730494596, "grad_norm": 1.213752031326294, "learning_rate": 8.25716595162594e-08, "loss": 0.0902, "step": 118090 }, { "epoch": 2.753670834426277, "grad_norm": 1.9140701293945312, "learning_rate": 8.249393769819063e-08, "loss": 0.1043, "step": 118100 }, { "epoch": 2.7539039958030953, "grad_norm": 1.2827650308609009, "learning_rate": 8.241621588012187e-08, "loss": 0.0982, "step": 118110 }, { "epoch": 2.754137157179913, "grad_norm": 1.1588743925094604, "learning_rate": 8.233849406205309e-08, "loss": 0.101, "step": 118120 }, { "epoch": 2.754370318556731, "grad_norm": 2.4529528617858887, "learning_rate": 8.226077224398432e-08, "loss": 0.0901, "step": 118130 }, { "epoch": 2.7546034799335493, "grad_norm": 1.548143982887268, "learning_rate": 8.218305042591556e-08, "loss": 0.1065, "step": 118140 }, { "epoch": 2.754836641310367, "grad_norm": 2.3410212993621826, "learning_rate": 8.210532860784678e-08, "loss": 0.0904, "step": 118150 }, { "epoch": 2.755069802687185, "grad_norm": 1.7090463638305664, "learning_rate": 8.202760678977802e-08, "loss": 0.0933, "step": 118160 }, { "epoch": 2.7553029640640028, "grad_norm": 1.3834280967712402, "learning_rate": 8.194988497170925e-08, "loss": 0.1076, "step": 118170 }, { "epoch": 
2.7555361254408206, "grad_norm": 1.1031159162521362, "learning_rate": 8.187216315364049e-08, "loss": 0.0975, "step": 118180 }, { "epoch": 2.755769286817639, "grad_norm": 2.5504038333892822, "learning_rate": 8.179444133557171e-08, "loss": 0.1066, "step": 118190 }, { "epoch": 2.7560024481944567, "grad_norm": 2.0997984409332275, "learning_rate": 8.171671951750295e-08, "loss": 0.1016, "step": 118200 }, { "epoch": 2.7562356095712746, "grad_norm": 1.167283296585083, "learning_rate": 8.163899769943418e-08, "loss": 0.0949, "step": 118210 }, { "epoch": 2.7564687709480924, "grad_norm": 2.0529401302337646, "learning_rate": 8.15612758813654e-08, "loss": 0.0939, "step": 118220 }, { "epoch": 2.7567019323249102, "grad_norm": 1.3123666048049927, "learning_rate": 8.148355406329664e-08, "loss": 0.1113, "step": 118230 }, { "epoch": 2.7569350937017285, "grad_norm": 1.4466685056686401, "learning_rate": 8.140583224522788e-08, "loss": 0.109, "step": 118240 }, { "epoch": 2.7571682550785463, "grad_norm": 1.5126452445983887, "learning_rate": 8.13281104271591e-08, "loss": 0.11, "step": 118250 }, { "epoch": 2.757401416455364, "grad_norm": 1.1546746492385864, "learning_rate": 8.125038860909034e-08, "loss": 0.1035, "step": 118260 }, { "epoch": 2.757634577832182, "grad_norm": 1.4923962354660034, "learning_rate": 8.117266679102157e-08, "loss": 0.0994, "step": 118270 }, { "epoch": 2.757867739209, "grad_norm": 1.7634059190750122, "learning_rate": 8.10949449729528e-08, "loss": 0.1083, "step": 118280 }, { "epoch": 2.758100900585818, "grad_norm": 1.151358962059021, "learning_rate": 8.101722315488403e-08, "loss": 0.1089, "step": 118290 }, { "epoch": 2.758334061962636, "grad_norm": 1.3789052963256836, "learning_rate": 8.093950133681527e-08, "loss": 0.1075, "step": 118300 }, { "epoch": 2.758567223339454, "grad_norm": 1.808051347732544, "learning_rate": 8.08617795187465e-08, "loss": 0.0994, "step": 118310 }, { "epoch": 2.7588003847162716, "grad_norm": 1.304304838180542, "learning_rate": 
8.078405770067772e-08, "loss": 0.1003, "step": 118320 }, { "epoch": 2.7590335460930895, "grad_norm": 1.4592493772506714, "learning_rate": 8.070633588260896e-08, "loss": 0.1105, "step": 118330 }, { "epoch": 2.7592667074699078, "grad_norm": 2.642646074295044, "learning_rate": 8.06286140645402e-08, "loss": 0.1026, "step": 118340 }, { "epoch": 2.7594998688467256, "grad_norm": 3.252073287963867, "learning_rate": 8.055089224647142e-08, "loss": 0.1044, "step": 118350 }, { "epoch": 2.7597330302235434, "grad_norm": 1.9871325492858887, "learning_rate": 8.047317042840265e-08, "loss": 0.099, "step": 118360 }, { "epoch": 2.7599661916003613, "grad_norm": 2.3065268993377686, "learning_rate": 8.039544861033389e-08, "loss": 0.0971, "step": 118370 }, { "epoch": 2.760199352977179, "grad_norm": 1.2818204164505005, "learning_rate": 8.031772679226511e-08, "loss": 0.1012, "step": 118380 }, { "epoch": 2.7604325143539974, "grad_norm": 1.6796401739120483, "learning_rate": 8.024000497419635e-08, "loss": 0.1031, "step": 118390 }, { "epoch": 2.7606656757308152, "grad_norm": 1.5549501180648804, "learning_rate": 8.016228315612758e-08, "loss": 0.1, "step": 118400 }, { "epoch": 2.760898837107633, "grad_norm": 1.9043819904327393, "learning_rate": 8.008456133805882e-08, "loss": 0.1045, "step": 118410 }, { "epoch": 2.761131998484451, "grad_norm": 1.2969557046890259, "learning_rate": 8.000683951999004e-08, "loss": 0.1089, "step": 118420 }, { "epoch": 2.7613651598612687, "grad_norm": 1.2310492992401123, "learning_rate": 7.992911770192128e-08, "loss": 0.0949, "step": 118430 }, { "epoch": 2.761598321238087, "grad_norm": 1.4574741125106812, "learning_rate": 7.985139588385251e-08, "loss": 0.0937, "step": 118440 }, { "epoch": 2.761831482614905, "grad_norm": 2.8779513835906982, "learning_rate": 7.977367406578373e-08, "loss": 0.0984, "step": 118450 }, { "epoch": 2.7620646439917227, "grad_norm": 1.6944653987884521, "learning_rate": 7.969595224771497e-08, "loss": 0.0948, "step": 118460 }, { "epoch": 
2.762297805368541, "grad_norm": 2.245492458343506, "learning_rate": 7.96182304296462e-08, "loss": 0.1131, "step": 118470 }, { "epoch": 2.7625309667453584, "grad_norm": 1.2815892696380615, "learning_rate": 7.954050861157743e-08, "loss": 0.1103, "step": 118480 }, { "epoch": 2.7627641281221766, "grad_norm": 1.1757450103759766, "learning_rate": 7.946278679350866e-08, "loss": 0.1077, "step": 118490 }, { "epoch": 2.7629972894989945, "grad_norm": 2.1011993885040283, "learning_rate": 7.93850649754399e-08, "loss": 0.0926, "step": 118500 }, { "epoch": 2.7632304508758123, "grad_norm": 1.5642014741897583, "learning_rate": 7.930734315737113e-08, "loss": 0.1086, "step": 118510 }, { "epoch": 2.7634636122526306, "grad_norm": 1.4592065811157227, "learning_rate": 7.922962133930236e-08, "loss": 0.124, "step": 118520 }, { "epoch": 2.7636967736294484, "grad_norm": 1.4228341579437256, "learning_rate": 7.915189952123359e-08, "loss": 0.1052, "step": 118530 }, { "epoch": 2.7639299350062663, "grad_norm": 2.7141294479370117, "learning_rate": 7.907417770316483e-08, "loss": 0.0934, "step": 118540 }, { "epoch": 2.764163096383084, "grad_norm": 1.4270745515823364, "learning_rate": 7.899645588509605e-08, "loss": 0.1056, "step": 118550 }, { "epoch": 2.764396257759902, "grad_norm": 1.3884211778640747, "learning_rate": 7.891873406702729e-08, "loss": 0.0922, "step": 118560 }, { "epoch": 2.7646294191367202, "grad_norm": 1.3590527772903442, "learning_rate": 7.884101224895852e-08, "loss": 0.0952, "step": 118570 }, { "epoch": 2.764862580513538, "grad_norm": 2.885441541671753, "learning_rate": 7.876329043088974e-08, "loss": 0.1124, "step": 118580 }, { "epoch": 2.765095741890356, "grad_norm": 2.2134532928466797, "learning_rate": 7.868556861282098e-08, "loss": 0.1094, "step": 118590 }, { "epoch": 2.7653289032671737, "grad_norm": 1.1837819814682007, "learning_rate": 7.860784679475222e-08, "loss": 0.0917, "step": 118600 }, { "epoch": 2.7655620646439916, "grad_norm": 3.254854440689087, "learning_rate": 
7.853012497668345e-08, "loss": 0.0994, "step": 118610 }, { "epoch": 2.76579522602081, "grad_norm": 1.6823654174804688, "learning_rate": 7.845240315861467e-08, "loss": 0.1025, "step": 118620 }, { "epoch": 2.7660283873976277, "grad_norm": 1.4039998054504395, "learning_rate": 7.837468134054591e-08, "loss": 0.0977, "step": 118630 }, { "epoch": 2.7662615487744455, "grad_norm": 1.9455556869506836, "learning_rate": 7.829695952247715e-08, "loss": 0.0938, "step": 118640 }, { "epoch": 2.7664947101512634, "grad_norm": 1.4754678010940552, "learning_rate": 7.821923770440837e-08, "loss": 0.1108, "step": 118650 }, { "epoch": 2.766727871528081, "grad_norm": 3.2457075119018555, "learning_rate": 7.81415158863396e-08, "loss": 0.1086, "step": 118660 }, { "epoch": 2.7669610329048995, "grad_norm": 2.397282123565674, "learning_rate": 7.806379406827084e-08, "loss": 0.1104, "step": 118670 }, { "epoch": 2.7671941942817173, "grad_norm": 1.7182987928390503, "learning_rate": 7.798607225020206e-08, "loss": 0.1101, "step": 118680 }, { "epoch": 2.767427355658535, "grad_norm": 1.4192873239517212, "learning_rate": 7.79083504321333e-08, "loss": 0.0953, "step": 118690 }, { "epoch": 2.767660517035353, "grad_norm": 1.4607932567596436, "learning_rate": 7.783062861406453e-08, "loss": 0.1064, "step": 118700 }, { "epoch": 2.767893678412171, "grad_norm": 1.394571304321289, "learning_rate": 7.775290679599576e-08, "loss": 0.0912, "step": 118710 }, { "epoch": 2.768126839788989, "grad_norm": 1.5032908916473389, "learning_rate": 7.767518497792699e-08, "loss": 0.1127, "step": 118720 }, { "epoch": 2.768360001165807, "grad_norm": 1.9546105861663818, "learning_rate": 7.759746315985823e-08, "loss": 0.1088, "step": 118730 }, { "epoch": 2.7685931625426248, "grad_norm": 2.2546586990356445, "learning_rate": 7.751974134178946e-08, "loss": 0.1043, "step": 118740 }, { "epoch": 2.7688263239194426, "grad_norm": 2.6146724224090576, "learning_rate": 7.744201952372069e-08, "loss": 0.1061, "step": 118750 }, { "epoch": 
2.7690594852962604, "grad_norm": 2.7927043437957764, "learning_rate": 7.736429770565192e-08, "loss": 0.0964, "step": 118760 }, { "epoch": 2.7692926466730787, "grad_norm": 1.9180301427841187, "learning_rate": 7.728657588758316e-08, "loss": 0.0965, "step": 118770 }, { "epoch": 2.7695258080498966, "grad_norm": 1.3827050924301147, "learning_rate": 7.720885406951438e-08, "loss": 0.0946, "step": 118780 }, { "epoch": 2.7697589694267144, "grad_norm": 1.7505296468734741, "learning_rate": 7.713113225144561e-08, "loss": 0.1028, "step": 118790 }, { "epoch": 2.7699921308035322, "grad_norm": 1.4235820770263672, "learning_rate": 7.705341043337685e-08, "loss": 0.1096, "step": 118800 }, { "epoch": 2.77022529218035, "grad_norm": 1.500158429145813, "learning_rate": 7.697568861530807e-08, "loss": 0.0967, "step": 118810 }, { "epoch": 2.7704584535571684, "grad_norm": 0.8944993615150452, "learning_rate": 7.689796679723931e-08, "loss": 0.1044, "step": 118820 }, { "epoch": 2.770691614933986, "grad_norm": 1.2367955446243286, "learning_rate": 7.682024497917054e-08, "loss": 0.098, "step": 118830 }, { "epoch": 2.770924776310804, "grad_norm": 3.299767017364502, "learning_rate": 7.674252316110178e-08, "loss": 0.1084, "step": 118840 }, { "epoch": 2.7711579376876223, "grad_norm": 1.1481164693832397, "learning_rate": 7.6664801343033e-08, "loss": 0.1072, "step": 118850 }, { "epoch": 2.7713910990644397, "grad_norm": 1.1331089735031128, "learning_rate": 7.658707952496424e-08, "loss": 0.0969, "step": 118860 }, { "epoch": 2.771624260441258, "grad_norm": 1.2773357629776, "learning_rate": 7.650935770689547e-08, "loss": 0.0946, "step": 118870 }, { "epoch": 2.771857421818076, "grad_norm": 1.3286083936691284, "learning_rate": 7.643163588882671e-08, "loss": 0.1007, "step": 118880 }, { "epoch": 2.7720905831948937, "grad_norm": 2.4438743591308594, "learning_rate": 7.635391407075795e-08, "loss": 0.1042, "step": 118890 }, { "epoch": 2.772323744571712, "grad_norm": 1.3819938898086548, "learning_rate": 
7.627619225268918e-08, "loss": 0.1102, "step": 118900 }, { "epoch": 2.7725569059485298, "grad_norm": 3.251439094543457, "learning_rate": 7.619847043462042e-08, "loss": 0.1034, "step": 118910 }, { "epoch": 2.7727900673253476, "grad_norm": 1.368587851524353, "learning_rate": 7.612074861655164e-08, "loss": 0.0937, "step": 118920 }, { "epoch": 2.7730232287021654, "grad_norm": 2.1729373931884766, "learning_rate": 7.604302679848287e-08, "loss": 0.0987, "step": 118930 }, { "epoch": 2.7732563900789833, "grad_norm": 1.4062089920043945, "learning_rate": 7.596530498041411e-08, "loss": 0.0995, "step": 118940 }, { "epoch": 2.7734895514558016, "grad_norm": 1.2054541110992432, "learning_rate": 7.588758316234533e-08, "loss": 0.0957, "step": 118950 }, { "epoch": 2.7737227128326194, "grad_norm": 1.2743165493011475, "learning_rate": 7.580986134427657e-08, "loss": 0.1023, "step": 118960 }, { "epoch": 2.7739558742094372, "grad_norm": 1.6225669384002686, "learning_rate": 7.57321395262078e-08, "loss": 0.1158, "step": 118970 }, { "epoch": 2.774189035586255, "grad_norm": 1.400186538696289, "learning_rate": 7.565441770813903e-08, "loss": 0.0942, "step": 118980 }, { "epoch": 2.774422196963073, "grad_norm": 1.1347873210906982, "learning_rate": 7.557669589007026e-08, "loss": 0.0942, "step": 118990 }, { "epoch": 2.774655358339891, "grad_norm": 1.8257194757461548, "learning_rate": 7.54989740720015e-08, "loss": 0.0903, "step": 119000 }, { "epoch": 2.774888519716709, "grad_norm": 1.2897324562072754, "learning_rate": 7.542125225393272e-08, "loss": 0.1082, "step": 119010 }, { "epoch": 2.775121681093527, "grad_norm": 2.1496171951293945, "learning_rate": 7.534353043586396e-08, "loss": 0.1055, "step": 119020 }, { "epoch": 2.7753548424703447, "grad_norm": 1.460718035697937, "learning_rate": 7.526580861779519e-08, "loss": 0.112, "step": 119030 }, { "epoch": 2.7755880038471625, "grad_norm": 2.067159414291382, "learning_rate": 7.518808679972643e-08, "loss": 0.0966, "step": 119040 }, { "epoch": 
2.775821165223981, "grad_norm": 1.7418776750564575, "learning_rate": 7.511036498165765e-08, "loss": 0.0935, "step": 119050 }, { "epoch": 2.7760543266007986, "grad_norm": 1.3113927841186523, "learning_rate": 7.503264316358889e-08, "loss": 0.1005, "step": 119060 }, { "epoch": 2.7762874879776165, "grad_norm": 2.492027521133423, "learning_rate": 7.495492134552012e-08, "loss": 0.0977, "step": 119070 }, { "epoch": 2.7765206493544343, "grad_norm": 1.5281678438186646, "learning_rate": 7.487719952745134e-08, "loss": 0.1071, "step": 119080 }, { "epoch": 2.776753810731252, "grad_norm": 3.6788558959960938, "learning_rate": 7.479947770938258e-08, "loss": 0.1011, "step": 119090 }, { "epoch": 2.7769869721080704, "grad_norm": 2.8793792724609375, "learning_rate": 7.472175589131382e-08, "loss": 0.0917, "step": 119100 }, { "epoch": 2.7772201334848883, "grad_norm": 2.067239284515381, "learning_rate": 7.464403407324504e-08, "loss": 0.1031, "step": 119110 }, { "epoch": 2.777453294861706, "grad_norm": 1.5173490047454834, "learning_rate": 7.456631225517627e-08, "loss": 0.1085, "step": 119120 }, { "epoch": 2.777686456238524, "grad_norm": 1.5608878135681152, "learning_rate": 7.448859043710751e-08, "loss": 0.1078, "step": 119130 }, { "epoch": 2.777919617615342, "grad_norm": 1.8941855430603027, "learning_rate": 7.441086861903874e-08, "loss": 0.1, "step": 119140 }, { "epoch": 2.77815277899216, "grad_norm": 1.4459673166275024, "learning_rate": 7.433314680096997e-08, "loss": 0.1062, "step": 119150 }, { "epoch": 2.778385940368978, "grad_norm": 1.2266172170639038, "learning_rate": 7.42554249829012e-08, "loss": 0.1114, "step": 119160 }, { "epoch": 2.7786191017457957, "grad_norm": 2.0729551315307617, "learning_rate": 7.417770316483244e-08, "loss": 0.1067, "step": 119170 }, { "epoch": 2.7788522631226136, "grad_norm": 0.9873033761978149, "learning_rate": 7.410775352857054e-08, "loss": 0.0901, "step": 119180 }, { "epoch": 2.7790854244994314, "grad_norm": 1.813151240348816, "learning_rate": 
7.403003171050177e-08, "loss": 0.1024, "step": 119190 }, { "epoch": 2.7793185858762497, "grad_norm": 1.3878319263458252, "learning_rate": 7.395230989243301e-08, "loss": 0.0942, "step": 119200 }, { "epoch": 2.7795517472530675, "grad_norm": 1.7482815980911255, "learning_rate": 7.387458807436424e-08, "loss": 0.0986, "step": 119210 }, { "epoch": 2.7797849086298854, "grad_norm": 1.2909226417541504, "learning_rate": 7.379686625629547e-08, "loss": 0.1077, "step": 119220 }, { "epoch": 2.7800180700067036, "grad_norm": 2.163640022277832, "learning_rate": 7.37191444382267e-08, "loss": 0.095, "step": 119230 }, { "epoch": 2.7802512313835215, "grad_norm": 2.3649163246154785, "learning_rate": 7.364142262015794e-08, "loss": 0.0977, "step": 119240 }, { "epoch": 2.7804843927603393, "grad_norm": 1.2226561307907104, "learning_rate": 7.356370080208916e-08, "loss": 0.0982, "step": 119250 }, { "epoch": 2.780717554137157, "grad_norm": 1.6978121995925903, "learning_rate": 7.34859789840204e-08, "loss": 0.1005, "step": 119260 }, { "epoch": 2.780950715513975, "grad_norm": 2.162008047103882, "learning_rate": 7.340825716595163e-08, "loss": 0.0976, "step": 119270 }, { "epoch": 2.7811838768907933, "grad_norm": 1.8710662126541138, "learning_rate": 7.333053534788285e-08, "loss": 0.0992, "step": 119280 }, { "epoch": 2.781417038267611, "grad_norm": 1.5512953996658325, "learning_rate": 7.325281352981409e-08, "loss": 0.0934, "step": 119290 }, { "epoch": 2.781650199644429, "grad_norm": 1.2139869928359985, "learning_rate": 7.317509171174533e-08, "loss": 0.1098, "step": 119300 }, { "epoch": 2.781883361021247, "grad_norm": 1.5782867670059204, "learning_rate": 7.309736989367656e-08, "loss": 0.095, "step": 119310 }, { "epoch": 2.7821165223980646, "grad_norm": 1.4604073762893677, "learning_rate": 7.301964807560778e-08, "loss": 0.106, "step": 119320 }, { "epoch": 2.782349683774883, "grad_norm": 2.6594367027282715, "learning_rate": 7.294192625753902e-08, "loss": 0.0965, "step": 119330 }, { "epoch": 
2.7825828451517007, "grad_norm": 1.1543856859207153, "learning_rate": 7.286420443947026e-08, "loss": 0.1032, "step": 119340 }, { "epoch": 2.7828160065285186, "grad_norm": 2.3100335597991943, "learning_rate": 7.278648262140148e-08, "loss": 0.1052, "step": 119350 }, { "epoch": 2.7830491679053364, "grad_norm": 2.8978617191314697, "learning_rate": 7.270876080333271e-08, "loss": 0.0981, "step": 119360 }, { "epoch": 2.7832823292821542, "grad_norm": 1.3496487140655518, "learning_rate": 7.263103898526395e-08, "loss": 0.1091, "step": 119370 }, { "epoch": 2.7835154906589725, "grad_norm": 2.416964292526245, "learning_rate": 7.255331716719517e-08, "loss": 0.1117, "step": 119380 }, { "epoch": 2.7837486520357904, "grad_norm": 1.4059404134750366, "learning_rate": 7.247559534912641e-08, "loss": 0.1048, "step": 119390 }, { "epoch": 2.783981813412608, "grad_norm": 1.436955213546753, "learning_rate": 7.239787353105764e-08, "loss": 0.0992, "step": 119400 }, { "epoch": 2.784214974789426, "grad_norm": 1.6024208068847656, "learning_rate": 7.232015171298888e-08, "loss": 0.1001, "step": 119410 }, { "epoch": 2.784448136166244, "grad_norm": 1.9298896789550781, "learning_rate": 7.22424298949201e-08, "loss": 0.1004, "step": 119420 }, { "epoch": 2.784681297543062, "grad_norm": 1.612427830696106, "learning_rate": 7.216470807685134e-08, "loss": 0.1037, "step": 119430 }, { "epoch": 2.78491445891988, "grad_norm": 1.389069676399231, "learning_rate": 7.208698625878257e-08, "loss": 0.0937, "step": 119440 }, { "epoch": 2.785147620296698, "grad_norm": 3.2129576206207275, "learning_rate": 7.20092644407138e-08, "loss": 0.1129, "step": 119450 }, { "epoch": 2.7853807816735157, "grad_norm": 1.2047959566116333, "learning_rate": 7.193154262264503e-08, "loss": 0.1083, "step": 119460 }, { "epoch": 2.7856139430503335, "grad_norm": 1.0539004802703857, "learning_rate": 7.185382080457627e-08, "loss": 0.1004, "step": 119470 }, { "epoch": 2.7858471044271518, "grad_norm": 2.801785945892334, "learning_rate": 
7.177609898650749e-08, "loss": 0.1075, "step": 119480 }, { "epoch": 2.7860802658039696, "grad_norm": 1.330451488494873, "learning_rate": 7.169837716843872e-08, "loss": 0.0939, "step": 119490 }, { "epoch": 2.7863134271807874, "grad_norm": 1.7083348035812378, "learning_rate": 7.162065535036996e-08, "loss": 0.102, "step": 119500 }, { "epoch": 2.7865465885576053, "grad_norm": 1.4074078798294067, "learning_rate": 7.15429335323012e-08, "loss": 0.0943, "step": 119510 }, { "epoch": 2.786779749934423, "grad_norm": 1.6103947162628174, "learning_rate": 7.146521171423242e-08, "loss": 0.1076, "step": 119520 }, { "epoch": 2.7870129113112414, "grad_norm": 1.4596091508865356, "learning_rate": 7.138748989616365e-08, "loss": 0.1073, "step": 119530 }, { "epoch": 2.7872460726880592, "grad_norm": 4.744327545166016, "learning_rate": 7.130976807809489e-08, "loss": 0.1172, "step": 119540 }, { "epoch": 2.787479234064877, "grad_norm": 2.2538392543792725, "learning_rate": 7.123204626002611e-08, "loss": 0.0969, "step": 119550 }, { "epoch": 2.7877123954416954, "grad_norm": 2.1648058891296387, "learning_rate": 7.115432444195735e-08, "loss": 0.0975, "step": 119560 }, { "epoch": 2.7879455568185127, "grad_norm": 1.6242871284484863, "learning_rate": 7.107660262388858e-08, "loss": 0.1035, "step": 119570 }, { "epoch": 2.788178718195331, "grad_norm": 1.2440916299819946, "learning_rate": 7.09988808058198e-08, "loss": 0.0927, "step": 119580 }, { "epoch": 2.788411879572149, "grad_norm": 1.2828280925750732, "learning_rate": 7.092115898775104e-08, "loss": 0.107, "step": 119590 }, { "epoch": 2.7886450409489667, "grad_norm": 1.2339646816253662, "learning_rate": 7.084343716968228e-08, "loss": 0.0887, "step": 119600 }, { "epoch": 2.788878202325785, "grad_norm": 1.3834152221679688, "learning_rate": 7.07657153516135e-08, "loss": 0.1001, "step": 119610 }, { "epoch": 2.789111363702603, "grad_norm": 1.1734168529510498, "learning_rate": 7.068799353354474e-08, "loss": 0.1035, "step": 119620 }, { "epoch": 
2.7893445250794207, "grad_norm": 2.901916980743408, "learning_rate": 7.061027171547597e-08, "loss": 0.0984, "step": 119630 }, { "epoch": 2.7895776864562385, "grad_norm": 2.056657075881958, "learning_rate": 7.05325498974072e-08, "loss": 0.0911, "step": 119640 }, { "epoch": 2.7898108478330563, "grad_norm": 1.6184577941894531, "learning_rate": 7.045482807933843e-08, "loss": 0.0945, "step": 119650 }, { "epoch": 2.7900440092098746, "grad_norm": 1.968042254447937, "learning_rate": 7.037710626126966e-08, "loss": 0.1052, "step": 119660 }, { "epoch": 2.7902771705866924, "grad_norm": 1.1279833316802979, "learning_rate": 7.02993844432009e-08, "loss": 0.1081, "step": 119670 }, { "epoch": 2.7905103319635103, "grad_norm": 1.079634189605713, "learning_rate": 7.022166262513212e-08, "loss": 0.123, "step": 119680 }, { "epoch": 2.790743493340328, "grad_norm": 1.7903069257736206, "learning_rate": 7.014394080706336e-08, "loss": 0.0961, "step": 119690 }, { "epoch": 2.790976654717146, "grad_norm": 2.221810817718506, "learning_rate": 7.00662189889946e-08, "loss": 0.0884, "step": 119700 }, { "epoch": 2.7912098160939642, "grad_norm": 2.988027572631836, "learning_rate": 6.998849717092582e-08, "loss": 0.1228, "step": 119710 }, { "epoch": 2.791442977470782, "grad_norm": 1.8291698694229126, "learning_rate": 6.991077535285705e-08, "loss": 0.1178, "step": 119720 }, { "epoch": 2.7916761388476, "grad_norm": 1.3983746767044067, "learning_rate": 6.983305353478829e-08, "loss": 0.106, "step": 119730 }, { "epoch": 2.7919093002244177, "grad_norm": 1.397162675857544, "learning_rate": 6.975533171671952e-08, "loss": 0.1105, "step": 119740 }, { "epoch": 2.7921424616012356, "grad_norm": 1.7639294862747192, "learning_rate": 6.967760989865075e-08, "loss": 0.1212, "step": 119750 }, { "epoch": 2.792375622978054, "grad_norm": 2.3090980052948, "learning_rate": 6.959988808058198e-08, "loss": 0.1086, "step": 119760 }, { "epoch": 2.7926087843548717, "grad_norm": 1.1580770015716553, "learning_rate": 
6.952216626251322e-08, "loss": 0.1, "step": 119770 }, { "epoch": 2.7928419457316895, "grad_norm": 1.6275355815887451, "learning_rate": 6.944444444444444e-08, "loss": 0.0943, "step": 119780 }, { "epoch": 2.7930751071085074, "grad_norm": 1.4883275032043457, "learning_rate": 6.936672262637568e-08, "loss": 0.1114, "step": 119790 }, { "epoch": 2.793308268485325, "grad_norm": 1.8418484926223755, "learning_rate": 6.928900080830691e-08, "loss": 0.1121, "step": 119800 }, { "epoch": 2.7935414298621435, "grad_norm": 1.155442714691162, "learning_rate": 6.921127899023813e-08, "loss": 0.0932, "step": 119810 }, { "epoch": 2.7937745912389613, "grad_norm": 1.841800570487976, "learning_rate": 6.913355717216937e-08, "loss": 0.104, "step": 119820 }, { "epoch": 2.794007752615779, "grad_norm": 1.8772053718566895, "learning_rate": 6.90558353541006e-08, "loss": 0.107, "step": 119830 }, { "epoch": 2.794240913992597, "grad_norm": 1.3307652473449707, "learning_rate": 6.897811353603184e-08, "loss": 0.1002, "step": 119840 }, { "epoch": 2.794474075369415, "grad_norm": 1.6309069395065308, "learning_rate": 6.890039171796306e-08, "loss": 0.1054, "step": 119850 }, { "epoch": 2.794707236746233, "grad_norm": 1.716640591621399, "learning_rate": 6.88226698998943e-08, "loss": 0.1026, "step": 119860 }, { "epoch": 2.794940398123051, "grad_norm": 1.4593687057495117, "learning_rate": 6.874494808182553e-08, "loss": 0.0993, "step": 119870 }, { "epoch": 2.795173559499869, "grad_norm": 3.581242561340332, "learning_rate": 6.866722626375676e-08, "loss": 0.1105, "step": 119880 }, { "epoch": 2.7954067208766866, "grad_norm": 1.4204148054122925, "learning_rate": 6.858950444568799e-08, "loss": 0.1044, "step": 119890 }, { "epoch": 2.7956398822535045, "grad_norm": 1.6882109642028809, "learning_rate": 6.851178262761923e-08, "loss": 0.1077, "step": 119900 }, { "epoch": 2.7958730436303227, "grad_norm": 2.4085114002227783, "learning_rate": 6.843406080955045e-08, "loss": 0.1084, "step": 119910 }, { "epoch": 
2.7961062050071406, "grad_norm": 1.2592333555221558, "learning_rate": 6.835633899148169e-08, "loss": 0.1054, "step": 119920 }, { "epoch": 2.7963393663839584, "grad_norm": 3.3229665756225586, "learning_rate": 6.827861717341292e-08, "loss": 0.1095, "step": 119930 }, { "epoch": 2.7965725277607767, "grad_norm": 1.336836338043213, "learning_rate": 6.820089535534416e-08, "loss": 0.104, "step": 119940 }, { "epoch": 2.796805689137594, "grad_norm": 1.767127513885498, "learning_rate": 6.812317353727538e-08, "loss": 0.1092, "step": 119950 }, { "epoch": 2.7970388505144124, "grad_norm": 1.4029256105422974, "learning_rate": 6.804545171920662e-08, "loss": 0.1065, "step": 119960 }, { "epoch": 2.79727201189123, "grad_norm": 1.7375487089157104, "learning_rate": 6.796772990113785e-08, "loss": 0.0993, "step": 119970 }, { "epoch": 2.797505173268048, "grad_norm": 3.0119848251342773, "learning_rate": 6.789000808306907e-08, "loss": 0.1029, "step": 119980 }, { "epoch": 2.7977383346448663, "grad_norm": 2.3999183177948, "learning_rate": 6.781228626500031e-08, "loss": 0.1055, "step": 119990 }, { "epoch": 2.797971496021684, "grad_norm": 1.6648201942443848, "learning_rate": 6.773456444693155e-08, "loss": 0.1028, "step": 120000 }, { "epoch": 2.797971496021684, "eval_accuracy": 0.9484496026597913, "eval_f1": 0.9631695727017635, "eval_loss": 0.1344262659549713, "eval_runtime": 3950.3697, "eval_samples_per_second": 463.227, "eval_steps_per_second": 57.903, "step": 120000 }, { "epoch": 2.798204657398502, "grad_norm": 1.3644911050796509, "learning_rate": 6.765684262886277e-08, "loss": 0.1023, "step": 120010 }, { "epoch": 2.79843781877532, "grad_norm": 1.9853308200836182, "learning_rate": 6.7579120810794e-08, "loss": 0.1087, "step": 120020 }, { "epoch": 2.7986709801521377, "grad_norm": 1.7088876962661743, "learning_rate": 6.750139899272524e-08, "loss": 0.094, "step": 120030 }, { "epoch": 2.798904141528956, "grad_norm": 1.5269132852554321, "learning_rate": 6.742367717465646e-08, "loss": 0.1083, "step": 
120040 }, { "epoch": 2.799137302905774, "grad_norm": 2.027679920196533, "learning_rate": 6.73459553565877e-08, "loss": 0.0901, "step": 120050 }, { "epoch": 2.7993704642825916, "grad_norm": 1.4032601118087769, "learning_rate": 6.726823353851893e-08, "loss": 0.0907, "step": 120060 }, { "epoch": 2.7996036256594095, "grad_norm": 1.6748363971710205, "learning_rate": 6.719051172045017e-08, "loss": 0.0975, "step": 120070 }, { "epoch": 2.7998367870362273, "grad_norm": 4.575855731964111, "learning_rate": 6.711278990238139e-08, "loss": 0.1133, "step": 120080 }, { "epoch": 2.8000699484130456, "grad_norm": 2.906985282897949, "learning_rate": 6.703506808431263e-08, "loss": 0.0961, "step": 120090 }, { "epoch": 2.8003031097898634, "grad_norm": 1.2486968040466309, "learning_rate": 6.695734626624386e-08, "loss": 0.0861, "step": 120100 }, { "epoch": 2.8005362711666812, "grad_norm": 2.5133285522460938, "learning_rate": 6.687962444817508e-08, "loss": 0.1004, "step": 120110 }, { "epoch": 2.800769432543499, "grad_norm": 1.6637605428695679, "learning_rate": 6.680190263010632e-08, "loss": 0.0994, "step": 120120 }, { "epoch": 2.801002593920317, "grad_norm": 1.2172445058822632, "learning_rate": 6.672418081203756e-08, "loss": 0.1001, "step": 120130 }, { "epoch": 2.801235755297135, "grad_norm": 1.8883591890335083, "learning_rate": 6.664645899396878e-08, "loss": 0.1067, "step": 120140 }, { "epoch": 2.801468916673953, "grad_norm": 2.1759989261627197, "learning_rate": 6.656873717590001e-08, "loss": 0.1004, "step": 120150 }, { "epoch": 2.801702078050771, "grad_norm": 4.1941609382629395, "learning_rate": 6.649101535783125e-08, "loss": 0.1029, "step": 120160 }, { "epoch": 2.8019352394275887, "grad_norm": 1.1475380659103394, "learning_rate": 6.641329353976249e-08, "loss": 0.0948, "step": 120170 }, { "epoch": 2.8021684008044065, "grad_norm": 2.450580358505249, "learning_rate": 6.633557172169371e-08, "loss": 0.0969, "step": 120180 }, { "epoch": 2.802401562181225, "grad_norm": 1.3233799934387207, 
"learning_rate": 6.625784990362494e-08, "loss": 0.0957, "step": 120190 }, { "epoch": 2.8026347235580427, "grad_norm": 1.10642671585083, "learning_rate": 6.618012808555618e-08, "loss": 0.0967, "step": 120200 }, { "epoch": 2.8028678849348605, "grad_norm": 1.520662784576416, "learning_rate": 6.61024062674874e-08, "loss": 0.1048, "step": 120210 }, { "epoch": 2.8031010463116783, "grad_norm": 2.2340590953826904, "learning_rate": 6.602468444941864e-08, "loss": 0.1005, "step": 120220 }, { "epoch": 2.803334207688496, "grad_norm": 1.4080711603164673, "learning_rate": 6.594696263134987e-08, "loss": 0.0955, "step": 120230 }, { "epoch": 2.8035673690653145, "grad_norm": 1.4070723056793213, "learning_rate": 6.58692408132811e-08, "loss": 0.0989, "step": 120240 }, { "epoch": 2.8038005304421323, "grad_norm": 1.7718018293380737, "learning_rate": 6.579151899521233e-08, "loss": 0.0974, "step": 120250 }, { "epoch": 2.80403369181895, "grad_norm": 1.913853645324707, "learning_rate": 6.571379717714357e-08, "loss": 0.1079, "step": 120260 }, { "epoch": 2.804266853195768, "grad_norm": 1.4229997396469116, "learning_rate": 6.56360753590748e-08, "loss": 0.1122, "step": 120270 }, { "epoch": 2.804500014572586, "grad_norm": 1.2332277297973633, "learning_rate": 6.555835354100603e-08, "loss": 0.0922, "step": 120280 }, { "epoch": 2.804733175949404, "grad_norm": 1.5605820417404175, "learning_rate": 6.548063172293726e-08, "loss": 0.1075, "step": 120290 }, { "epoch": 2.804966337326222, "grad_norm": 1.3147729635238647, "learning_rate": 6.54029099048685e-08, "loss": 0.1036, "step": 120300 }, { "epoch": 2.8051994987030398, "grad_norm": 1.6399651765823364, "learning_rate": 6.532518808679972e-08, "loss": 0.1025, "step": 120310 }, { "epoch": 2.805432660079858, "grad_norm": 1.635023593902588, "learning_rate": 6.524746626873095e-08, "loss": 0.0971, "step": 120320 }, { "epoch": 2.805665821456676, "grad_norm": 2.7082252502441406, "learning_rate": 6.516974445066219e-08, "loss": 0.107, "step": 120330 }, { "epoch": 
2.8058989828334937, "grad_norm": 1.2406675815582275, "learning_rate": 6.509202263259341e-08, "loss": 0.0949, "step": 120340 }, { "epoch": 2.8061321442103115, "grad_norm": 1.5202029943466187, "learning_rate": 6.501430081452465e-08, "loss": 0.0986, "step": 120350 }, { "epoch": 2.8063653055871294, "grad_norm": 1.381727695465088, "learning_rate": 6.493657899645588e-08, "loss": 0.0985, "step": 120360 }, { "epoch": 2.8065984669639477, "grad_norm": 1.6658895015716553, "learning_rate": 6.485885717838712e-08, "loss": 0.0911, "step": 120370 }, { "epoch": 2.8068316283407655, "grad_norm": 2.748875141143799, "learning_rate": 6.478113536031834e-08, "loss": 0.1081, "step": 120380 }, { "epoch": 2.8070647897175833, "grad_norm": 1.230482578277588, "learning_rate": 6.470341354224958e-08, "loss": 0.1073, "step": 120390 }, { "epoch": 2.807297951094401, "grad_norm": 1.3970377445220947, "learning_rate": 6.462569172418081e-08, "loss": 0.1034, "step": 120400 }, { "epoch": 2.807531112471219, "grad_norm": 2.1451034545898438, "learning_rate": 6.454796990611204e-08, "loss": 0.1095, "step": 120410 }, { "epoch": 2.8077642738480373, "grad_norm": 3.0600008964538574, "learning_rate": 6.447024808804327e-08, "loss": 0.113, "step": 120420 }, { "epoch": 2.807997435224855, "grad_norm": 1.3511430025100708, "learning_rate": 6.439252626997451e-08, "loss": 0.1064, "step": 120430 }, { "epoch": 2.808230596601673, "grad_norm": 1.4244256019592285, "learning_rate": 6.431480445190573e-08, "loss": 0.1183, "step": 120440 }, { "epoch": 2.808463757978491, "grad_norm": 1.9843260049819946, "learning_rate": 6.423708263383697e-08, "loss": 0.1081, "step": 120450 }, { "epoch": 2.8086969193553086, "grad_norm": 1.001787543296814, "learning_rate": 6.41593608157682e-08, "loss": 0.0959, "step": 120460 }, { "epoch": 2.808930080732127, "grad_norm": 1.6542141437530518, "learning_rate": 6.408163899769942e-08, "loss": 0.1116, "step": 120470 }, { "epoch": 2.8091632421089447, "grad_norm": 1.6129173040390015, "learning_rate": 
6.400391717963066e-08, "loss": 0.0915, "step": 120480 }, { "epoch": 2.8093964034857626, "grad_norm": 1.1077598333358765, "learning_rate": 6.39261953615619e-08, "loss": 0.094, "step": 120490 }, { "epoch": 2.8096295648625804, "grad_norm": 1.269783854484558, "learning_rate": 6.384847354349313e-08, "loss": 0.1016, "step": 120500 }, { "epoch": 2.8098627262393983, "grad_norm": 1.2975050210952759, "learning_rate": 6.377075172542435e-08, "loss": 0.094, "step": 120510 }, { "epoch": 2.8100958876162165, "grad_norm": 2.2785801887512207, "learning_rate": 6.369302990735559e-08, "loss": 0.0928, "step": 120520 }, { "epoch": 2.8103290489930344, "grad_norm": 2.403613328933716, "learning_rate": 6.361530808928682e-08, "loss": 0.1018, "step": 120530 }, { "epoch": 2.810562210369852, "grad_norm": 1.2402516603469849, "learning_rate": 6.353758627121805e-08, "loss": 0.097, "step": 120540 }, { "epoch": 2.81079537174667, "grad_norm": 2.3276636600494385, "learning_rate": 6.345986445314928e-08, "loss": 0.0978, "step": 120550 }, { "epoch": 2.811028533123488, "grad_norm": 2.706242799758911, "learning_rate": 6.338214263508052e-08, "loss": 0.1087, "step": 120560 }, { "epoch": 2.811261694500306, "grad_norm": 1.9359822273254395, "learning_rate": 6.330442081701174e-08, "loss": 0.1135, "step": 120570 }, { "epoch": 2.811494855877124, "grad_norm": 1.3128682374954224, "learning_rate": 6.322669899894298e-08, "loss": 0.1114, "step": 120580 }, { "epoch": 2.811728017253942, "grad_norm": 3.106013536453247, "learning_rate": 6.314897718087421e-08, "loss": 0.1102, "step": 120590 }, { "epoch": 2.8119611786307597, "grad_norm": 1.624401330947876, "learning_rate": 6.307125536280545e-08, "loss": 0.0917, "step": 120600 }, { "epoch": 2.8121943400075775, "grad_norm": 1.6480685472488403, "learning_rate": 6.299353354473667e-08, "loss": 0.1064, "step": 120610 }, { "epoch": 2.812427501384396, "grad_norm": 1.7955482006072998, "learning_rate": 6.29158117266679e-08, "loss": 0.1093, "step": 120620 }, { "epoch": 
2.8126606627612136, "grad_norm": 2.000736951828003, "learning_rate": 6.283808990859914e-08, "loss": 0.108, "step": 120630 }, { "epoch": 2.8128938241380315, "grad_norm": 1.312490701675415, "learning_rate": 6.276036809053036e-08, "loss": 0.1037, "step": 120640 }, { "epoch": 2.8131269855148493, "grad_norm": 1.6486937999725342, "learning_rate": 6.26826462724616e-08, "loss": 0.1046, "step": 120650 }, { "epoch": 2.813360146891667, "grad_norm": 1.7049000263214111, "learning_rate": 6.260492445439284e-08, "loss": 0.0882, "step": 120660 }, { "epoch": 2.8135933082684854, "grad_norm": 1.5065168142318726, "learning_rate": 6.252720263632406e-08, "loss": 0.1004, "step": 120670 }, { "epoch": 2.8138264696453033, "grad_norm": 3.2950026988983154, "learning_rate": 6.24494808182553e-08, "loss": 0.1005, "step": 120680 }, { "epoch": 2.814059631022121, "grad_norm": 3.16161847114563, "learning_rate": 6.237175900018653e-08, "loss": 0.1022, "step": 120690 }, { "epoch": 2.8142927923989394, "grad_norm": 1.8235774040222168, "learning_rate": 6.229403718211777e-08, "loss": 0.1026, "step": 120700 }, { "epoch": 2.814525953775757, "grad_norm": 1.1963722705841064, "learning_rate": 6.221631536404899e-08, "loss": 0.0971, "step": 120710 }, { "epoch": 2.814759115152575, "grad_norm": 1.52930748462677, "learning_rate": 6.213859354598022e-08, "loss": 0.1015, "step": 120720 }, { "epoch": 2.814992276529393, "grad_norm": 1.354428768157959, "learning_rate": 6.206087172791146e-08, "loss": 0.1009, "step": 120730 }, { "epoch": 2.8152254379062107, "grad_norm": 1.0447776317596436, "learning_rate": 6.198314990984268e-08, "loss": 0.0926, "step": 120740 }, { "epoch": 2.815458599283029, "grad_norm": 1.3023006916046143, "learning_rate": 6.190542809177392e-08, "loss": 0.1051, "step": 120750 }, { "epoch": 2.815691760659847, "grad_norm": 1.9816043376922607, "learning_rate": 6.182770627370515e-08, "loss": 0.1087, "step": 120760 }, { "epoch": 2.8159249220366647, "grad_norm": 1.7061022520065308, "learning_rate": 
6.174998445563638e-08, "loss": 0.1001, "step": 120770 }, { "epoch": 2.8161580834134825, "grad_norm": 1.5642576217651367, "learning_rate": 6.167226263756761e-08, "loss": 0.0932, "step": 120780 }, { "epoch": 2.8163912447903003, "grad_norm": 1.7917017936706543, "learning_rate": 6.159454081949885e-08, "loss": 0.1045, "step": 120790 }, { "epoch": 2.8166244061671186, "grad_norm": 1.5087707042694092, "learning_rate": 6.151681900143008e-08, "loss": 0.0968, "step": 120800 }, { "epoch": 2.8168575675439365, "grad_norm": 1.8466476202011108, "learning_rate": 6.14390971833613e-08, "loss": 0.1118, "step": 120810 }, { "epoch": 2.8170907289207543, "grad_norm": 1.592651605606079, "learning_rate": 6.136137536529254e-08, "loss": 0.0939, "step": 120820 }, { "epoch": 2.817323890297572, "grad_norm": 1.5559494495391846, "learning_rate": 6.128365354722378e-08, "loss": 0.0992, "step": 120830 }, { "epoch": 2.81755705167439, "grad_norm": 1.2089543342590332, "learning_rate": 6.1205931729155e-08, "loss": 0.0937, "step": 120840 }, { "epoch": 2.8177902130512082, "grad_norm": 1.6053235530853271, "learning_rate": 6.112820991108623e-08, "loss": 0.1024, "step": 120850 }, { "epoch": 2.818023374428026, "grad_norm": 1.2021374702453613, "learning_rate": 6.105048809301747e-08, "loss": 0.1071, "step": 120860 }, { "epoch": 2.818256535804844, "grad_norm": 1.8307316303253174, "learning_rate": 6.097276627494869e-08, "loss": 0.1094, "step": 120870 }, { "epoch": 2.8184896971816618, "grad_norm": 2.951754570007324, "learning_rate": 6.089504445687993e-08, "loss": 0.0936, "step": 120880 }, { "epoch": 2.8187228585584796, "grad_norm": 1.0296212434768677, "learning_rate": 6.081732263881116e-08, "loss": 0.1148, "step": 120890 }, { "epoch": 2.818956019935298, "grad_norm": 1.5803616046905518, "learning_rate": 6.073960082074239e-08, "loss": 0.0972, "step": 120900 }, { "epoch": 2.8191891813121157, "grad_norm": 1.5874561071395874, "learning_rate": 6.066187900267362e-08, "loss": 0.1025, "step": 120910 }, { "epoch": 
2.8194223426889335, "grad_norm": 2.7985081672668457, "learning_rate": 6.058415718460486e-08, "loss": 0.0947, "step": 120920 }, { "epoch": 2.8196555040657514, "grad_norm": 2.0795512199401855, "learning_rate": 6.050643536653609e-08, "loss": 0.0962, "step": 120930 }, { "epoch": 2.819888665442569, "grad_norm": 1.5109453201293945, "learning_rate": 6.042871354846733e-08, "loss": 0.0948, "step": 120940 }, { "epoch": 2.8201218268193875, "grad_norm": 1.4935920238494873, "learning_rate": 6.035099173039856e-08, "loss": 0.1163, "step": 120950 }, { "epoch": 2.8203549881962053, "grad_norm": 1.651047706604004, "learning_rate": 6.027326991232979e-08, "loss": 0.1021, "step": 120960 }, { "epoch": 2.820588149573023, "grad_norm": 1.9868838787078857, "learning_rate": 6.019554809426102e-08, "loss": 0.0999, "step": 120970 }, { "epoch": 2.820821310949841, "grad_norm": 1.2223776578903198, "learning_rate": 6.011782627619226e-08, "loss": 0.1017, "step": 120980 }, { "epoch": 2.821054472326659, "grad_norm": 2.802593946456909, "learning_rate": 6.004010445812348e-08, "loss": 0.1052, "step": 120990 }, { "epoch": 2.821287633703477, "grad_norm": 1.9801530838012695, "learning_rate": 5.996238264005472e-08, "loss": 0.0979, "step": 121000 }, { "epoch": 2.821520795080295, "grad_norm": 1.9833523035049438, "learning_rate": 5.988466082198595e-08, "loss": 0.1002, "step": 121010 }, { "epoch": 2.821753956457113, "grad_norm": 1.1032836437225342, "learning_rate": 5.980693900391717e-08, "loss": 0.0947, "step": 121020 }, { "epoch": 2.821987117833931, "grad_norm": 1.3178887367248535, "learning_rate": 5.972921718584841e-08, "loss": 0.0945, "step": 121030 }, { "epoch": 2.8222202792107485, "grad_norm": 2.335204839706421, "learning_rate": 5.965149536777965e-08, "loss": 0.0938, "step": 121040 }, { "epoch": 2.8224534405875668, "grad_norm": 1.9167091846466064, "learning_rate": 5.9573773549710875e-08, "loss": 0.1005, "step": 121050 }, { "epoch": 2.8226866019643846, "grad_norm": 2.483482837677002, "learning_rate": 
5.9496051731642104e-08, "loss": 0.1052, "step": 121060 }, { "epoch": 2.8229197633412024, "grad_norm": 2.365511178970337, "learning_rate": 5.941832991357334e-08, "loss": 0.1009, "step": 121070 }, { "epoch": 2.8231529247180207, "grad_norm": 1.5449492931365967, "learning_rate": 5.934060809550457e-08, "loss": 0.1073, "step": 121080 }, { "epoch": 2.8233860860948385, "grad_norm": 1.7340786457061768, "learning_rate": 5.9262886277435804e-08, "loss": 0.098, "step": 121090 }, { "epoch": 2.8236192474716564, "grad_norm": 2.5053763389587402, "learning_rate": 5.9185164459367034e-08, "loss": 0.1079, "step": 121100 }, { "epoch": 2.823852408848474, "grad_norm": 2.4447386264801025, "learning_rate": 5.910744264129826e-08, "loss": 0.0967, "step": 121110 }, { "epoch": 2.824085570225292, "grad_norm": 1.2510263919830322, "learning_rate": 5.90297208232295e-08, "loss": 0.0933, "step": 121120 }, { "epoch": 2.8243187316021103, "grad_norm": 2.3392629623413086, "learning_rate": 5.895199900516073e-08, "loss": 0.0967, "step": 121130 }, { "epoch": 2.824551892978928, "grad_norm": 2.247696876525879, "learning_rate": 5.887427718709196e-08, "loss": 0.1015, "step": 121140 }, { "epoch": 2.824785054355746, "grad_norm": 1.4281957149505615, "learning_rate": 5.879655536902319e-08, "loss": 0.1098, "step": 121150 }, { "epoch": 2.825018215732564, "grad_norm": 1.8100106716156006, "learning_rate": 5.871883355095442e-08, "loss": 0.1016, "step": 121160 }, { "epoch": 2.8252513771093817, "grad_norm": 2.0128047466278076, "learning_rate": 5.864111173288566e-08, "loss": 0.0997, "step": 121170 }, { "epoch": 2.8254845384862, "grad_norm": 1.4037604331970215, "learning_rate": 5.8563389914816886e-08, "loss": 0.0989, "step": 121180 }, { "epoch": 2.825717699863018, "grad_norm": 1.4711356163024902, "learning_rate": 5.8493440278555e-08, "loss": 0.0931, "step": 121190 }, { "epoch": 2.8259508612398356, "grad_norm": 1.6927815675735474, "learning_rate": 5.841571846048623e-08, "loss": 0.0908, "step": 121200 }, { "epoch": 
2.8261840226166535, "grad_norm": 2.4761264324188232, "learning_rate": 5.833799664241746e-08, "loss": 0.1006, "step": 121210 }, { "epoch": 2.8264171839934713, "grad_norm": 0.8343870043754578, "learning_rate": 5.826027482434869e-08, "loss": 0.0935, "step": 121220 }, { "epoch": 2.8266503453702896, "grad_norm": 1.9127191305160522, "learning_rate": 5.818255300627992e-08, "loss": 0.102, "step": 121230 }, { "epoch": 2.8268835067471074, "grad_norm": 1.3661776781082153, "learning_rate": 5.8104831188211157e-08, "loss": 0.1039, "step": 121240 }, { "epoch": 2.8271166681239253, "grad_norm": 1.7270249128341675, "learning_rate": 5.8027109370142386e-08, "loss": 0.0978, "step": 121250 }, { "epoch": 2.827349829500743, "grad_norm": 1.2441672086715698, "learning_rate": 5.794938755207362e-08, "loss": 0.0957, "step": 121260 }, { "epoch": 2.827582990877561, "grad_norm": 1.2677154541015625, "learning_rate": 5.787166573400485e-08, "loss": 0.1069, "step": 121270 }, { "epoch": 2.827816152254379, "grad_norm": 1.733186960220337, "learning_rate": 5.779394391593608e-08, "loss": 0.1055, "step": 121280 }, { "epoch": 2.828049313631197, "grad_norm": 3.9211738109588623, "learning_rate": 5.7716222097867315e-08, "loss": 0.0978, "step": 121290 }, { "epoch": 2.828282475008015, "grad_norm": 1.1113896369934082, "learning_rate": 5.7638500279798544e-08, "loss": 0.1006, "step": 121300 }, { "epoch": 2.8285156363848327, "grad_norm": 2.8733856678009033, "learning_rate": 5.756077846172978e-08, "loss": 0.1025, "step": 121310 }, { "epoch": 2.8287487977616506, "grad_norm": 1.1244224309921265, "learning_rate": 5.748305664366101e-08, "loss": 0.102, "step": 121320 }, { "epoch": 2.828981959138469, "grad_norm": 1.2036186456680298, "learning_rate": 5.740533482559224e-08, "loss": 0.1059, "step": 121330 }, { "epoch": 2.8292151205152867, "grad_norm": 3.1593058109283447, "learning_rate": 5.7327613007523474e-08, "loss": 0.1009, "step": 121340 }, { "epoch": 2.8294482818921045, "grad_norm": 2.274916887283325, "learning_rate": 
5.72498911894547e-08, "loss": 0.0959, "step": 121350 }, { "epoch": 2.8296814432689223, "grad_norm": 1.5428061485290527, "learning_rate": 5.717216937138594e-08, "loss": 0.101, "step": 121360 }, { "epoch": 2.82991460464574, "grad_norm": 1.9506388902664185, "learning_rate": 5.709444755331717e-08, "loss": 0.1237, "step": 121370 }, { "epoch": 2.8301477660225585, "grad_norm": 2.670194387435913, "learning_rate": 5.7016725735248397e-08, "loss": 0.0904, "step": 121380 }, { "epoch": 2.8303809273993763, "grad_norm": 2.6867756843566895, "learning_rate": 5.693900391717963e-08, "loss": 0.1126, "step": 121390 }, { "epoch": 2.830614088776194, "grad_norm": 1.6317890882492065, "learning_rate": 5.686128209911086e-08, "loss": 0.108, "step": 121400 }, { "epoch": 2.8308472501530124, "grad_norm": 2.034372091293335, "learning_rate": 5.678356028104209e-08, "loss": 0.1139, "step": 121410 }, { "epoch": 2.83108041152983, "grad_norm": 1.8045985698699951, "learning_rate": 5.6705838462973326e-08, "loss": 0.1163, "step": 121420 }, { "epoch": 2.831313572906648, "grad_norm": 1.4476395845413208, "learning_rate": 5.6628116644904555e-08, "loss": 0.0977, "step": 121430 }, { "epoch": 2.831546734283466, "grad_norm": 2.1638987064361572, "learning_rate": 5.655039482683579e-08, "loss": 0.0964, "step": 121440 }, { "epoch": 2.8317798956602838, "grad_norm": 1.650834321975708, "learning_rate": 5.647267300876702e-08, "loss": 0.1023, "step": 121450 }, { "epoch": 2.832013057037102, "grad_norm": 1.5202884674072266, "learning_rate": 5.639495119069825e-08, "loss": 0.1194, "step": 121460 }, { "epoch": 2.83224621841392, "grad_norm": 1.8782293796539307, "learning_rate": 5.6317229372629485e-08, "loss": 0.089, "step": 121470 }, { "epoch": 2.8324793797907377, "grad_norm": 2.704615592956543, "learning_rate": 5.6239507554560714e-08, "loss": 0.1085, "step": 121480 }, { "epoch": 2.8327125411675556, "grad_norm": 1.3046131134033203, "learning_rate": 5.616178573649195e-08, "loss": 0.1054, "step": 121490 }, { "epoch": 
2.8329457025443734, "grad_norm": 1.7150349617004395, "learning_rate": 5.608406391842318e-08, "loss": 0.1016, "step": 121500 }, { "epoch": 2.8331788639211917, "grad_norm": 2.018766403198242, "learning_rate": 5.600634210035441e-08, "loss": 0.1021, "step": 121510 }, { "epoch": 2.8334120252980095, "grad_norm": 4.154825210571289, "learning_rate": 5.592862028228564e-08, "loss": 0.1084, "step": 121520 }, { "epoch": 2.8336451866748273, "grad_norm": 1.6044551134109497, "learning_rate": 5.585089846421687e-08, "loss": 0.0988, "step": 121530 }, { "epoch": 2.833878348051645, "grad_norm": 1.3648580312728882, "learning_rate": 5.577317664614811e-08, "loss": 0.1125, "step": 121540 }, { "epoch": 2.834111509428463, "grad_norm": 4.764969825744629, "learning_rate": 5.569545482807934e-08, "loss": 0.1028, "step": 121550 }, { "epoch": 2.8343446708052813, "grad_norm": 1.6009315252304077, "learning_rate": 5.5617733010010566e-08, "loss": 0.1005, "step": 121560 }, { "epoch": 2.834577832182099, "grad_norm": 1.4643219709396362, "learning_rate": 5.55400111919418e-08, "loss": 0.1175, "step": 121570 }, { "epoch": 2.834810993558917, "grad_norm": 1.5109273195266724, "learning_rate": 5.546228937387303e-08, "loss": 0.1084, "step": 121580 }, { "epoch": 2.835044154935735, "grad_norm": 2.1766555309295654, "learning_rate": 5.5384567555804266e-08, "loss": 0.1001, "step": 121590 }, { "epoch": 2.8352773163125526, "grad_norm": 1.279723048210144, "learning_rate": 5.5306845737735495e-08, "loss": 0.0987, "step": 121600 }, { "epoch": 2.835510477689371, "grad_norm": 2.0909695625305176, "learning_rate": 5.5229123919666725e-08, "loss": 0.1024, "step": 121610 }, { "epoch": 2.8357436390661888, "grad_norm": 1.324317455291748, "learning_rate": 5.515140210159796e-08, "loss": 0.1093, "step": 121620 }, { "epoch": 2.8359768004430066, "grad_norm": 1.7296119928359985, "learning_rate": 5.507368028352919e-08, "loss": 0.1037, "step": 121630 }, { "epoch": 2.8362099618198244, "grad_norm": 1.8203681707382202, "learning_rate": 
5.4995958465460425e-08, "loss": 0.0955, "step": 121640 }, { "epoch": 2.8364431231966423, "grad_norm": 1.4357197284698486, "learning_rate": 5.4918236647391654e-08, "loss": 0.1177, "step": 121650 }, { "epoch": 2.8366762845734605, "grad_norm": 1.2588789463043213, "learning_rate": 5.484051482932288e-08, "loss": 0.1082, "step": 121660 }, { "epoch": 2.8369094459502784, "grad_norm": 1.3906902074813843, "learning_rate": 5.476279301125412e-08, "loss": 0.0982, "step": 121670 }, { "epoch": 2.837142607327096, "grad_norm": 1.5192664861679077, "learning_rate": 5.468507119318535e-08, "loss": 0.0979, "step": 121680 }, { "epoch": 2.837375768703914, "grad_norm": 1.566136360168457, "learning_rate": 5.4607349375116583e-08, "loss": 0.1007, "step": 121690 }, { "epoch": 2.837608930080732, "grad_norm": 2.5499727725982666, "learning_rate": 5.452962755704781e-08, "loss": 0.1103, "step": 121700 }, { "epoch": 2.83784209145755, "grad_norm": 1.208464503288269, "learning_rate": 5.445190573897904e-08, "loss": 0.1098, "step": 121710 }, { "epoch": 2.838075252834368, "grad_norm": 1.5367393493652344, "learning_rate": 5.437418392091028e-08, "loss": 0.1061, "step": 121720 }, { "epoch": 2.838308414211186, "grad_norm": 1.3600976467132568, "learning_rate": 5.4296462102841506e-08, "loss": 0.0961, "step": 121730 }, { "epoch": 2.8385415755880037, "grad_norm": 0.9155444502830505, "learning_rate": 5.421874028477274e-08, "loss": 0.0933, "step": 121740 }, { "epoch": 2.8387747369648215, "grad_norm": 1.5626002550125122, "learning_rate": 5.414101846670397e-08, "loss": 0.0997, "step": 121750 }, { "epoch": 2.83900789834164, "grad_norm": 2.4143402576446533, "learning_rate": 5.40632966486352e-08, "loss": 0.0968, "step": 121760 }, { "epoch": 2.8392410597184576, "grad_norm": 1.5805096626281738, "learning_rate": 5.3985574830566436e-08, "loss": 0.1127, "step": 121770 }, { "epoch": 2.8394742210952755, "grad_norm": 1.6677603721618652, "learning_rate": 5.3907853012497665e-08, "loss": 0.1123, "step": 121780 }, { "epoch": 
2.8397073824720938, "grad_norm": 1.8699610233306885, "learning_rate": 5.38301311944289e-08, "loss": 0.1032, "step": 121790 }, { "epoch": 2.8399405438489116, "grad_norm": 1.822343349456787, "learning_rate": 5.375240937636013e-08, "loss": 0.0967, "step": 121800 }, { "epoch": 2.8401737052257294, "grad_norm": 1.2982184886932373, "learning_rate": 5.367468755829136e-08, "loss": 0.0845, "step": 121810 }, { "epoch": 2.8404068666025473, "grad_norm": 1.926271677017212, "learning_rate": 5.3596965740222594e-08, "loss": 0.1022, "step": 121820 }, { "epoch": 2.840640027979365, "grad_norm": 1.558003544807434, "learning_rate": 5.3519243922153823e-08, "loss": 0.1049, "step": 121830 }, { "epoch": 2.8408731893561834, "grad_norm": 1.2628942728042603, "learning_rate": 5.344152210408506e-08, "loss": 0.0972, "step": 121840 }, { "epoch": 2.841106350733001, "grad_norm": 1.542933702468872, "learning_rate": 5.336380028601629e-08, "loss": 0.1078, "step": 121850 }, { "epoch": 2.841339512109819, "grad_norm": 1.5756758451461792, "learning_rate": 5.328607846794752e-08, "loss": 0.1007, "step": 121860 }, { "epoch": 2.841572673486637, "grad_norm": 3.9903595447540283, "learning_rate": 5.320835664987875e-08, "loss": 0.1044, "step": 121870 }, { "epoch": 2.8418058348634547, "grad_norm": 2.341945171356201, "learning_rate": 5.313063483180998e-08, "loss": 0.0993, "step": 121880 }, { "epoch": 2.842038996240273, "grad_norm": 1.566877841949463, "learning_rate": 5.305291301374121e-08, "loss": 0.0891, "step": 121890 }, { "epoch": 2.842272157617091, "grad_norm": 1.621788501739502, "learning_rate": 5.297519119567245e-08, "loss": 0.111, "step": 121900 }, { "epoch": 2.8425053189939087, "grad_norm": 1.4508806467056274, "learning_rate": 5.2897469377603676e-08, "loss": 0.112, "step": 121910 }, { "epoch": 2.8427384803707265, "grad_norm": 2.473924160003662, "learning_rate": 5.281974755953491e-08, "loss": 0.116, "step": 121920 }, { "epoch": 2.8429716417475444, "grad_norm": 1.2446874380111694, "learning_rate": 
5.274202574146614e-08, "loss": 0.1044, "step": 121930 }, { "epoch": 2.8432048031243626, "grad_norm": 1.384871244430542, "learning_rate": 5.266430392339737e-08, "loss": 0.1012, "step": 121940 }, { "epoch": 2.8434379645011805, "grad_norm": 2.128326416015625, "learning_rate": 5.2586582105328605e-08, "loss": 0.107, "step": 121950 }, { "epoch": 2.8436711258779983, "grad_norm": 1.340641975402832, "learning_rate": 5.2508860287259834e-08, "loss": 0.1012, "step": 121960 }, { "epoch": 2.843904287254816, "grad_norm": 2.3316638469696045, "learning_rate": 5.243113846919107e-08, "loss": 0.0897, "step": 121970 }, { "epoch": 2.844137448631634, "grad_norm": 1.3090120553970337, "learning_rate": 5.23534166511223e-08, "loss": 0.0999, "step": 121980 }, { "epoch": 2.8443706100084523, "grad_norm": 2.840566873550415, "learning_rate": 5.227569483305353e-08, "loss": 0.1063, "step": 121990 }, { "epoch": 2.84460377138527, "grad_norm": 1.586465835571289, "learning_rate": 5.2197973014984764e-08, "loss": 0.1089, "step": 122000 }, { "epoch": 2.844836932762088, "grad_norm": 2.4812259674072266, "learning_rate": 5.212025119691599e-08, "loss": 0.1228, "step": 122010 }, { "epoch": 2.8450700941389058, "grad_norm": 2.4179739952087402, "learning_rate": 5.204252937884723e-08, "loss": 0.1112, "step": 122020 }, { "epoch": 2.8453032555157236, "grad_norm": 1.363673448562622, "learning_rate": 5.196480756077846e-08, "loss": 0.1065, "step": 122030 }, { "epoch": 2.845536416892542, "grad_norm": 4.211659908294678, "learning_rate": 5.188708574270969e-08, "loss": 0.1156, "step": 122040 }, { "epoch": 2.8457695782693597, "grad_norm": 2.3595879077911377, "learning_rate": 5.180936392464092e-08, "loss": 0.1035, "step": 122050 }, { "epoch": 2.8460027396461776, "grad_norm": 2.390705108642578, "learning_rate": 5.173164210657215e-08, "loss": 0.1061, "step": 122060 }, { "epoch": 2.8462359010229954, "grad_norm": 2.017263412475586, "learning_rate": 5.165392028850339e-08, "loss": 0.102, "step": 122070 }, { "epoch": 
2.8464690623998132, "grad_norm": 1.5251485109329224, "learning_rate": 5.1576198470434616e-08, "loss": 0.1037, "step": 122080 }, { "epoch": 2.8467022237766315, "grad_norm": 1.6527305841445923, "learning_rate": 5.1498476652365845e-08, "loss": 0.1011, "step": 122090 }, { "epoch": 2.8469353851534493, "grad_norm": 1.027100920677185, "learning_rate": 5.142075483429708e-08, "loss": 0.0931, "step": 122100 }, { "epoch": 2.847168546530267, "grad_norm": 2.0180180072784424, "learning_rate": 5.134303301622831e-08, "loss": 0.0989, "step": 122110 }, { "epoch": 2.8474017079070855, "grad_norm": 1.3759366273880005, "learning_rate": 5.1265311198159546e-08, "loss": 0.102, "step": 122120 }, { "epoch": 2.847634869283903, "grad_norm": 1.79896080493927, "learning_rate": 5.1187589380090775e-08, "loss": 0.1029, "step": 122130 }, { "epoch": 2.847868030660721, "grad_norm": 1.2536870241165161, "learning_rate": 5.1109867562022004e-08, "loss": 0.0904, "step": 122140 }, { "epoch": 2.848101192037539, "grad_norm": 1.2705507278442383, "learning_rate": 5.103214574395324e-08, "loss": 0.1018, "step": 122150 }, { "epoch": 2.848334353414357, "grad_norm": 1.300093650817871, "learning_rate": 5.095442392588447e-08, "loss": 0.1041, "step": 122160 }, { "epoch": 2.848567514791175, "grad_norm": 2.055502414703369, "learning_rate": 5.0876702107815704e-08, "loss": 0.1106, "step": 122170 }, { "epoch": 2.848800676167993, "grad_norm": 1.7590340375900269, "learning_rate": 5.0798980289746933e-08, "loss": 0.0934, "step": 122180 }, { "epoch": 2.8490338375448108, "grad_norm": 3.629075765609741, "learning_rate": 5.072125847167816e-08, "loss": 0.1159, "step": 122190 }, { "epoch": 2.8492669989216286, "grad_norm": 3.033078908920288, "learning_rate": 5.06435366536094e-08, "loss": 0.0876, "step": 122200 }, { "epoch": 2.8495001602984464, "grad_norm": 1.156219720840454, "learning_rate": 5.056581483554063e-08, "loss": 0.107, "step": 122210 }, { "epoch": 2.8497333216752647, "grad_norm": 4.555761814117432, "learning_rate": 
5.048809301747186e-08, "loss": 0.0986, "step": 122220 }, { "epoch": 2.8499664830520826, "grad_norm": 1.1960091590881348, "learning_rate": 5.041037119940309e-08, "loss": 0.1078, "step": 122230 }, { "epoch": 2.8501996444289004, "grad_norm": 1.4282292127609253, "learning_rate": 5.033264938133432e-08, "loss": 0.1026, "step": 122240 }, { "epoch": 2.8504328058057182, "grad_norm": 1.0905238389968872, "learning_rate": 5.0254927563265557e-08, "loss": 0.1041, "step": 122250 }, { "epoch": 2.850665967182536, "grad_norm": 1.2574561834335327, "learning_rate": 5.0177205745196786e-08, "loss": 0.1007, "step": 122260 }, { "epoch": 2.8508991285593543, "grad_norm": 2.215719223022461, "learning_rate": 5.009948392712802e-08, "loss": 0.1003, "step": 122270 }, { "epoch": 2.851132289936172, "grad_norm": 1.4830092191696167, "learning_rate": 5.002176210905925e-08, "loss": 0.0902, "step": 122280 }, { "epoch": 2.85136545131299, "grad_norm": 1.7990124225616455, "learning_rate": 4.994404029099048e-08, "loss": 0.1027, "step": 122290 }, { "epoch": 2.851598612689808, "grad_norm": 1.7284992933273315, "learning_rate": 4.9866318472921715e-08, "loss": 0.0949, "step": 122300 }, { "epoch": 2.8518317740666257, "grad_norm": 1.6289273500442505, "learning_rate": 4.9788596654852944e-08, "loss": 0.0932, "step": 122310 }, { "epoch": 2.852064935443444, "grad_norm": 1.740854024887085, "learning_rate": 4.971087483678417e-08, "loss": 0.0971, "step": 122320 }, { "epoch": 2.852298096820262, "grad_norm": 1.2009570598602295, "learning_rate": 4.963315301871541e-08, "loss": 0.1016, "step": 122330 }, { "epoch": 2.8525312581970796, "grad_norm": 1.1182739734649658, "learning_rate": 4.955543120064664e-08, "loss": 0.0885, "step": 122340 }, { "epoch": 2.8527644195738975, "grad_norm": 1.5573703050613403, "learning_rate": 4.9477709382577874e-08, "loss": 0.1026, "step": 122350 }, { "epoch": 2.8529975809507153, "grad_norm": 1.5218327045440674, "learning_rate": 4.939998756450911e-08, "loss": 0.1056, "step": 122360 }, { "epoch": 
2.8532307423275336, "grad_norm": 1.2797746658325195, "learning_rate": 4.9322265746440345e-08, "loss": 0.0926, "step": 122370 }, { "epoch": 2.8534639037043514, "grad_norm": 1.346508502960205, "learning_rate": 4.9244543928371574e-08, "loss": 0.0866, "step": 122380 }, { "epoch": 2.8536970650811693, "grad_norm": 1.5163915157318115, "learning_rate": 4.91668221103028e-08, "loss": 0.1025, "step": 122390 }, { "epoch": 2.853930226457987, "grad_norm": 1.8681447505950928, "learning_rate": 4.908910029223404e-08, "loss": 0.1015, "step": 122400 }, { "epoch": 2.854163387834805, "grad_norm": 2.222926139831543, "learning_rate": 4.901137847416527e-08, "loss": 0.1043, "step": 122410 }, { "epoch": 2.8543965492116232, "grad_norm": 1.022869348526001, "learning_rate": 4.89336566560965e-08, "loss": 0.0993, "step": 122420 }, { "epoch": 2.854629710588441, "grad_norm": 1.4112517833709717, "learning_rate": 4.885593483802773e-08, "loss": 0.1045, "step": 122430 }, { "epoch": 2.854862871965259, "grad_norm": 1.0305310487747192, "learning_rate": 4.877821301995896e-08, "loss": 0.0988, "step": 122440 }, { "epoch": 2.8550960333420767, "grad_norm": 2.3994803428649902, "learning_rate": 4.87004912018902e-08, "loss": 0.1122, "step": 122450 }, { "epoch": 2.8553291947188946, "grad_norm": 2.5705442428588867, "learning_rate": 4.8622769383821427e-08, "loss": 0.1034, "step": 122460 }, { "epoch": 2.855562356095713, "grad_norm": 1.3099443912506104, "learning_rate": 4.8545047565752656e-08, "loss": 0.0967, "step": 122470 }, { "epoch": 2.8557955174725307, "grad_norm": 1.9925668239593506, "learning_rate": 4.846732574768389e-08, "loss": 0.1031, "step": 122480 }, { "epoch": 2.8560286788493485, "grad_norm": 1.0835245847702026, "learning_rate": 4.838960392961512e-08, "loss": 0.1107, "step": 122490 }, { "epoch": 2.856261840226167, "grad_norm": 1.296180248260498, "learning_rate": 4.8311882111546356e-08, "loss": 0.1075, "step": 122500 }, { "epoch": 2.856495001602984, "grad_norm": 2.352637529373169, "learning_rate": 
4.8234160293477585e-08, "loss": 0.1034, "step": 122510 }, { "epoch": 2.8567281629798025, "grad_norm": 1.3547239303588867, "learning_rate": 4.8156438475408814e-08, "loss": 0.1102, "step": 122520 }, { "epoch": 2.8569613243566203, "grad_norm": 2.910020351409912, "learning_rate": 4.807871665734005e-08, "loss": 0.096, "step": 122530 }, { "epoch": 2.857194485733438, "grad_norm": 1.8307222127914429, "learning_rate": 4.800099483927128e-08, "loss": 0.103, "step": 122540 }, { "epoch": 2.8574276471102564, "grad_norm": 1.5269850492477417, "learning_rate": 4.7923273021202515e-08, "loss": 0.0996, "step": 122550 }, { "epoch": 2.8576608084870743, "grad_norm": 1.1007087230682373, "learning_rate": 4.7845551203133744e-08, "loss": 0.0896, "step": 122560 }, { "epoch": 2.857893969863892, "grad_norm": 1.7385300397872925, "learning_rate": 4.776782938506497e-08, "loss": 0.0959, "step": 122570 }, { "epoch": 2.85812713124071, "grad_norm": 1.6458280086517334, "learning_rate": 4.769010756699621e-08, "loss": 0.1013, "step": 122580 }, { "epoch": 2.8583602926175278, "grad_norm": 1.8015602827072144, "learning_rate": 4.761238574892744e-08, "loss": 0.0965, "step": 122590 }, { "epoch": 2.858593453994346, "grad_norm": 1.4814598560333252, "learning_rate": 4.753466393085867e-08, "loss": 0.1157, "step": 122600 }, { "epoch": 2.858826615371164, "grad_norm": 1.5596239566802979, "learning_rate": 4.74569421127899e-08, "loss": 0.1082, "step": 122610 }, { "epoch": 2.8590597767479817, "grad_norm": 1.809171438217163, "learning_rate": 4.737922029472113e-08, "loss": 0.0945, "step": 122620 }, { "epoch": 2.8592929381247996, "grad_norm": 1.2475792169570923, "learning_rate": 4.730149847665237e-08, "loss": 0.1028, "step": 122630 }, { "epoch": 2.8595260995016174, "grad_norm": 1.4850982427597046, "learning_rate": 4.7223776658583596e-08, "loss": 0.1071, "step": 122640 }, { "epoch": 2.8597592608784357, "grad_norm": 1.5269782543182373, "learning_rate": 4.714605484051483e-08, "loss": 0.1056, "step": 122650 }, { "epoch": 
2.8599924222552535, "grad_norm": 2.9318506717681885, "learning_rate": 4.706833302244606e-08, "loss": 0.1065, "step": 122660 }, { "epoch": 2.8602255836320714, "grad_norm": 2.0342249870300293, "learning_rate": 4.699061120437729e-08, "loss": 0.0994, "step": 122670 }, { "epoch": 2.860458745008889, "grad_norm": 1.325440764427185, "learning_rate": 4.6912889386308525e-08, "loss": 0.0967, "step": 122680 }, { "epoch": 2.860691906385707, "grad_norm": 2.710875988006592, "learning_rate": 4.6835167568239755e-08, "loss": 0.1012, "step": 122690 }, { "epoch": 2.8609250677625253, "grad_norm": 1.1766630411148071, "learning_rate": 4.675744575017099e-08, "loss": 0.0892, "step": 122700 }, { "epoch": 2.861158229139343, "grad_norm": 1.2919100522994995, "learning_rate": 4.667972393210222e-08, "loss": 0.0854, "step": 122710 }, { "epoch": 2.861391390516161, "grad_norm": 1.8450216054916382, "learning_rate": 4.660200211403345e-08, "loss": 0.1084, "step": 122720 }, { "epoch": 2.861624551892979, "grad_norm": 2.137006998062134, "learning_rate": 4.6524280295964684e-08, "loss": 0.0979, "step": 122730 }, { "epoch": 2.8618577132697967, "grad_norm": 1.2107456922531128, "learning_rate": 4.644655847789591e-08, "loss": 0.1084, "step": 122740 }, { "epoch": 2.862090874646615, "grad_norm": 1.8194912672042847, "learning_rate": 4.636883665982715e-08, "loss": 0.1047, "step": 122750 }, { "epoch": 2.8623240360234328, "grad_norm": 2.96598219871521, "learning_rate": 4.629111484175838e-08, "loss": 0.1079, "step": 122760 }, { "epoch": 2.8625571974002506, "grad_norm": 1.3303462266921997, "learning_rate": 4.621339302368961e-08, "loss": 0.0922, "step": 122770 }, { "epoch": 2.8627903587770684, "grad_norm": 1.4528892040252686, "learning_rate": 4.613567120562084e-08, "loss": 0.1072, "step": 122780 }, { "epoch": 2.8630235201538863, "grad_norm": 1.1571729183197021, "learning_rate": 4.605794938755207e-08, "loss": 0.1011, "step": 122790 }, { "epoch": 2.8632566815307046, "grad_norm": 1.997894525527954, "learning_rate": 
4.598022756948331e-08, "loss": 0.0971, "step": 122800 }, { "epoch": 2.8634898429075224, "grad_norm": 1.3093289136886597, "learning_rate": 4.5902505751414536e-08, "loss": 0.1141, "step": 122810 }, { "epoch": 2.8637230042843402, "grad_norm": 1.4058362245559692, "learning_rate": 4.5824783933345765e-08, "loss": 0.1008, "step": 122820 }, { "epoch": 2.863956165661158, "grad_norm": 2.0048937797546387, "learning_rate": 4.5747062115277e-08, "loss": 0.1082, "step": 122830 }, { "epoch": 2.864189327037976, "grad_norm": 1.6992591619491577, "learning_rate": 4.566934029720823e-08, "loss": 0.1029, "step": 122840 }, { "epoch": 2.864422488414794, "grad_norm": 1.8289591073989868, "learning_rate": 4.5591618479139466e-08, "loss": 0.1048, "step": 122850 }, { "epoch": 2.864655649791612, "grad_norm": 1.3766520023345947, "learning_rate": 4.5513896661070695e-08, "loss": 0.1043, "step": 122860 }, { "epoch": 2.86488881116843, "grad_norm": 3.681094169616699, "learning_rate": 4.5436174843001924e-08, "loss": 0.1116, "step": 122870 }, { "epoch": 2.865121972545248, "grad_norm": 1.3338675498962402, "learning_rate": 4.535845302493316e-08, "loss": 0.1091, "step": 122880 }, { "epoch": 2.8653551339220655, "grad_norm": 2.5620431900024414, "learning_rate": 4.528073120686439e-08, "loss": 0.1047, "step": 122890 }, { "epoch": 2.865588295298884, "grad_norm": 2.544379472732544, "learning_rate": 4.520300938879562e-08, "loss": 0.1049, "step": 122900 }, { "epoch": 2.8658214566757017, "grad_norm": 1.308518886566162, "learning_rate": 4.5125287570726853e-08, "loss": 0.1019, "step": 122910 }, { "epoch": 2.8660546180525195, "grad_norm": 2.234527111053467, "learning_rate": 4.504756575265808e-08, "loss": 0.1044, "step": 122920 }, { "epoch": 2.8662877794293378, "grad_norm": 1.4273589849472046, "learning_rate": 4.496984393458932e-08, "loss": 0.1065, "step": 122930 }, { "epoch": 2.8665209408061556, "grad_norm": 1.1709051132202148, "learning_rate": 4.489212211652055e-08, "loss": 0.0859, "step": 122940 }, { "epoch": 
2.8667541021829734, "grad_norm": 1.3934576511383057, "learning_rate": 4.4814400298451776e-08, "loss": 0.0955, "step": 122950 }, { "epoch": 2.8669872635597913, "grad_norm": 1.7229979038238525, "learning_rate": 4.473667848038301e-08, "loss": 0.1045, "step": 122960 }, { "epoch": 2.867220424936609, "grad_norm": 1.4285728931427002, "learning_rate": 4.465895666231424e-08, "loss": 0.1018, "step": 122970 }, { "epoch": 2.8674535863134274, "grad_norm": 2.687946081161499, "learning_rate": 4.458123484424548e-08, "loss": 0.1054, "step": 122980 }, { "epoch": 2.8676867476902452, "grad_norm": 1.6059702634811401, "learning_rate": 4.4503513026176706e-08, "loss": 0.1003, "step": 122990 }, { "epoch": 2.867919909067063, "grad_norm": 1.625608205795288, "learning_rate": 4.4425791208107935e-08, "loss": 0.1012, "step": 123000 }, { "epoch": 2.868153070443881, "grad_norm": 1.835875391960144, "learning_rate": 4.434806939003917e-08, "loss": 0.1115, "step": 123010 }, { "epoch": 2.8683862318206987, "grad_norm": 1.3504703044891357, "learning_rate": 4.42703475719704e-08, "loss": 0.1089, "step": 123020 }, { "epoch": 2.868619393197517, "grad_norm": 1.5023877620697021, "learning_rate": 4.4192625753901635e-08, "loss": 0.1013, "step": 123030 }, { "epoch": 2.868852554574335, "grad_norm": 2.232198476791382, "learning_rate": 4.4114903935832864e-08, "loss": 0.1076, "step": 123040 }, { "epoch": 2.8690857159511527, "grad_norm": 1.3896187543869019, "learning_rate": 4.4037182117764093e-08, "loss": 0.1073, "step": 123050 }, { "epoch": 2.8693188773279705, "grad_norm": 1.2762672901153564, "learning_rate": 4.395946029969533e-08, "loss": 0.0949, "step": 123060 }, { "epoch": 2.8695520387047884, "grad_norm": 1.744475245475769, "learning_rate": 4.388173848162656e-08, "loss": 0.1169, "step": 123070 }, { "epoch": 2.8697852000816066, "grad_norm": 1.1341567039489746, "learning_rate": 4.3804016663557794e-08, "loss": 0.1039, "step": 123080 }, { "epoch": 2.8700183614584245, "grad_norm": 1.3694512844085693, "learning_rate": 
4.372629484548902e-08, "loss": 0.1, "step": 123090 }, { "epoch": 2.8702515228352423, "grad_norm": 1.9965698719024658, "learning_rate": 4.364857302742025e-08, "loss": 0.1111, "step": 123100 }, { "epoch": 2.87048468421206, "grad_norm": 1.3043699264526367, "learning_rate": 4.357085120935149e-08, "loss": 0.1146, "step": 123110 }, { "epoch": 2.870717845588878, "grad_norm": 1.9511948823928833, "learning_rate": 4.349312939128272e-08, "loss": 0.1029, "step": 123120 }, { "epoch": 2.8709510069656963, "grad_norm": 2.2633609771728516, "learning_rate": 4.341540757321395e-08, "loss": 0.1041, "step": 123130 }, { "epoch": 2.871184168342514, "grad_norm": 2.1029937267303467, "learning_rate": 4.333768575514518e-08, "loss": 0.1002, "step": 123140 }, { "epoch": 2.871417329719332, "grad_norm": 1.7576465606689453, "learning_rate": 4.325996393707641e-08, "loss": 0.1024, "step": 123150 }, { "epoch": 2.87165049109615, "grad_norm": 1.7923153638839722, "learning_rate": 4.3182242119007646e-08, "loss": 0.1122, "step": 123160 }, { "epoch": 2.8718836524729676, "grad_norm": 2.179879903793335, "learning_rate": 4.3104520300938875e-08, "loss": 0.1087, "step": 123170 }, { "epoch": 2.872116813849786, "grad_norm": 2.5207314491271973, "learning_rate": 4.302679848287011e-08, "loss": 0.1002, "step": 123180 }, { "epoch": 2.8723499752266037, "grad_norm": 2.1800661087036133, "learning_rate": 4.2956848846608216e-08, "loss": 0.1054, "step": 123190 }, { "epoch": 2.8725831366034216, "grad_norm": 1.538905382156372, "learning_rate": 4.287912702853945e-08, "loss": 0.1079, "step": 123200 }, { "epoch": 2.8728162979802394, "grad_norm": 1.5889804363250732, "learning_rate": 4.280140521047068e-08, "loss": 0.098, "step": 123210 }, { "epoch": 2.8730494593570572, "grad_norm": 1.615276575088501, "learning_rate": 4.272368339240191e-08, "loss": 0.0998, "step": 123220 }, { "epoch": 2.8732826207338755, "grad_norm": 2.7792930603027344, "learning_rate": 4.2645961574333146e-08, "loss": 0.0979, "step": 123230 }, { "epoch": 
2.8735157821106934, "grad_norm": 1.6615874767303467, "learning_rate": 4.2568239756264375e-08, "loss": 0.1025, "step": 123240 }, { "epoch": 2.873748943487511, "grad_norm": 1.458377480506897, "learning_rate": 4.249051793819561e-08, "loss": 0.1116, "step": 123250 }, { "epoch": 2.8739821048643295, "grad_norm": 1.3490010499954224, "learning_rate": 4.241279612012684e-08, "loss": 0.1046, "step": 123260 }, { "epoch": 2.8742152662411473, "grad_norm": 2.713369607925415, "learning_rate": 4.233507430205807e-08, "loss": 0.1085, "step": 123270 }, { "epoch": 2.874448427617965, "grad_norm": 2.0754079818725586, "learning_rate": 4.2257352483989304e-08, "loss": 0.1062, "step": 123280 }, { "epoch": 2.874681588994783, "grad_norm": 1.0813475847244263, "learning_rate": 4.2179630665920534e-08, "loss": 0.0958, "step": 123290 }, { "epoch": 2.874914750371601, "grad_norm": 1.5868018865585327, "learning_rate": 4.210190884785177e-08, "loss": 0.1131, "step": 123300 }, { "epoch": 2.875147911748419, "grad_norm": 1.3808335065841675, "learning_rate": 4.2024187029783e-08, "loss": 0.1101, "step": 123310 }, { "epoch": 2.875381073125237, "grad_norm": 1.8100332021713257, "learning_rate": 4.194646521171423e-08, "loss": 0.1067, "step": 123320 }, { "epoch": 2.8756142345020548, "grad_norm": 2.810878038406372, "learning_rate": 4.186874339364546e-08, "loss": 0.1071, "step": 123330 }, { "epoch": 2.8758473958788726, "grad_norm": 2.574214458465576, "learning_rate": 4.179102157557669e-08, "loss": 0.1124, "step": 123340 }, { "epoch": 2.8760805572556905, "grad_norm": 2.775202989578247, "learning_rate": 4.171329975750793e-08, "loss": 0.1081, "step": 123350 }, { "epoch": 2.8763137186325087, "grad_norm": 1.7066569328308105, "learning_rate": 4.163557793943916e-08, "loss": 0.1072, "step": 123360 }, { "epoch": 2.8765468800093266, "grad_norm": 2.5528838634490967, "learning_rate": 4.1557856121370386e-08, "loss": 0.1015, "step": 123370 }, { "epoch": 2.8767800413861444, "grad_norm": 2.9524881839752197, "learning_rate": 
4.148013430330162e-08, "loss": 0.1093, "step": 123380 }, { "epoch": 2.8770132027629622, "grad_norm": 1.0154914855957031, "learning_rate": 4.140241248523285e-08, "loss": 0.1022, "step": 123390 }, { "epoch": 2.87724636413978, "grad_norm": 2.163510799407959, "learning_rate": 4.1324690667164086e-08, "loss": 0.1017, "step": 123400 }, { "epoch": 2.8774795255165984, "grad_norm": 1.4746785163879395, "learning_rate": 4.1246968849095315e-08, "loss": 0.1, "step": 123410 }, { "epoch": 2.877712686893416, "grad_norm": 1.446326494216919, "learning_rate": 4.1169247031026544e-08, "loss": 0.0965, "step": 123420 }, { "epoch": 2.877945848270234, "grad_norm": 1.4317007064819336, "learning_rate": 4.109152521295778e-08, "loss": 0.1025, "step": 123430 }, { "epoch": 2.878179009647052, "grad_norm": 1.9115363359451294, "learning_rate": 4.101380339488901e-08, "loss": 0.1057, "step": 123440 }, { "epoch": 2.8784121710238697, "grad_norm": 1.4966754913330078, "learning_rate": 4.0936081576820245e-08, "loss": 0.1154, "step": 123450 }, { "epoch": 2.878645332400688, "grad_norm": 1.2489855289459229, "learning_rate": 4.0858359758751474e-08, "loss": 0.1064, "step": 123460 }, { "epoch": 2.878878493777506, "grad_norm": 1.5538135766983032, "learning_rate": 4.07806379406827e-08, "loss": 0.0934, "step": 123470 }, { "epoch": 2.8791116551543237, "grad_norm": 1.6504297256469727, "learning_rate": 4.070291612261394e-08, "loss": 0.0996, "step": 123480 }, { "epoch": 2.8793448165311415, "grad_norm": 1.0642170906066895, "learning_rate": 4.062519430454517e-08, "loss": 0.1024, "step": 123490 }, { "epoch": 2.8795779779079593, "grad_norm": 2.097414255142212, "learning_rate": 4.05474724864764e-08, "loss": 0.0999, "step": 123500 }, { "epoch": 2.8798111392847776, "grad_norm": 1.1931633949279785, "learning_rate": 4.046975066840763e-08, "loss": 0.1125, "step": 123510 }, { "epoch": 2.8800443006615954, "grad_norm": 1.2287461757659912, "learning_rate": 4.039202885033886e-08, "loss": 0.0981, "step": 123520 }, { "epoch": 
2.8802774620384133, "grad_norm": 1.1435556411743164, "learning_rate": 4.03143070322701e-08, "loss": 0.1022, "step": 123530 }, { "epoch": 2.880510623415231, "grad_norm": 2.4559195041656494, "learning_rate": 4.0236585214201326e-08, "loss": 0.1133, "step": 123540 }, { "epoch": 2.880743784792049, "grad_norm": 3.0461604595184326, "learning_rate": 4.0158863396132555e-08, "loss": 0.1029, "step": 123550 }, { "epoch": 2.8809769461688672, "grad_norm": 1.285489559173584, "learning_rate": 4.008114157806379e-08, "loss": 0.0934, "step": 123560 }, { "epoch": 2.881210107545685, "grad_norm": 2.8994059562683105, "learning_rate": 4.000341975999502e-08, "loss": 0.0973, "step": 123570 }, { "epoch": 2.881443268922503, "grad_norm": 3.1856062412261963, "learning_rate": 3.9925697941926256e-08, "loss": 0.1135, "step": 123580 }, { "epoch": 2.881676430299321, "grad_norm": 3.013385772705078, "learning_rate": 3.9847976123857485e-08, "loss": 0.0963, "step": 123590 }, { "epoch": 2.8819095916761386, "grad_norm": 1.4277230501174927, "learning_rate": 3.9770254305788714e-08, "loss": 0.0958, "step": 123600 }, { "epoch": 2.882142753052957, "grad_norm": 1.8548738956451416, "learning_rate": 3.969253248771995e-08, "loss": 0.0928, "step": 123610 }, { "epoch": 2.8823759144297747, "grad_norm": 2.0481865406036377, "learning_rate": 3.961481066965118e-08, "loss": 0.1031, "step": 123620 }, { "epoch": 2.8826090758065925, "grad_norm": 2.601489782333374, "learning_rate": 3.9537088851582414e-08, "loss": 0.1101, "step": 123630 }, { "epoch": 2.882842237183411, "grad_norm": 1.6870628595352173, "learning_rate": 3.9459367033513643e-08, "loss": 0.1038, "step": 123640 }, { "epoch": 2.8830753985602287, "grad_norm": 3.0110466480255127, "learning_rate": 3.938164521544487e-08, "loss": 0.1032, "step": 123650 }, { "epoch": 2.8833085599370465, "grad_norm": 1.5799803733825684, "learning_rate": 3.930392339737611e-08, "loss": 0.1095, "step": 123660 }, { "epoch": 2.8835417213138643, "grad_norm": 1.2018482685089111, "learning_rate": 
3.922620157930734e-08, "loss": 0.1025, "step": 123670 }, { "epoch": 2.883774882690682, "grad_norm": 1.5067927837371826, "learning_rate": 3.914847976123857e-08, "loss": 0.108, "step": 123680 }, { "epoch": 2.8840080440675004, "grad_norm": 1.2883762121200562, "learning_rate": 3.90707579431698e-08, "loss": 0.1119, "step": 123690 }, { "epoch": 2.8842412054443183, "grad_norm": 1.581378698348999, "learning_rate": 3.899303612510103e-08, "loss": 0.11, "step": 123700 }, { "epoch": 2.884474366821136, "grad_norm": 2.253009796142578, "learning_rate": 3.891531430703227e-08, "loss": 0.0942, "step": 123710 }, { "epoch": 2.884707528197954, "grad_norm": 2.6901373863220215, "learning_rate": 3.8837592488963496e-08, "loss": 0.1039, "step": 123720 }, { "epoch": 2.884940689574772, "grad_norm": 1.2719225883483887, "learning_rate": 3.875987067089473e-08, "loss": 0.0944, "step": 123730 }, { "epoch": 2.88517385095159, "grad_norm": 1.4366551637649536, "learning_rate": 3.868214885282596e-08, "loss": 0.107, "step": 123740 }, { "epoch": 2.885407012328408, "grad_norm": 1.2572451829910278, "learning_rate": 3.860442703475719e-08, "loss": 0.1037, "step": 123750 }, { "epoch": 2.8856401737052257, "grad_norm": 1.406207799911499, "learning_rate": 3.8526705216688425e-08, "loss": 0.0979, "step": 123760 }, { "epoch": 2.8858733350820436, "grad_norm": 1.244564175605774, "learning_rate": 3.8448983398619654e-08, "loss": 0.0886, "step": 123770 }, { "epoch": 2.8861064964588614, "grad_norm": 1.7931480407714844, "learning_rate": 3.837126158055089e-08, "loss": 0.0994, "step": 123780 }, { "epoch": 2.8863396578356797, "grad_norm": 1.309979796409607, "learning_rate": 3.829353976248212e-08, "loss": 0.1067, "step": 123790 }, { "epoch": 2.8865728192124975, "grad_norm": 1.6740412712097168, "learning_rate": 3.8215817944413355e-08, "loss": 0.1013, "step": 123800 }, { "epoch": 2.8868059805893154, "grad_norm": 1.6634818315505981, "learning_rate": 3.813809612634459e-08, "loss": 0.0943, "step": 123810 }, { "epoch": 
2.887039141966133, "grad_norm": 1.5468043088912964, "learning_rate": 3.806037430827582e-08, "loss": 0.1001, "step": 123820 }, { "epoch": 2.887272303342951, "grad_norm": 2.024595260620117, "learning_rate": 3.7982652490207055e-08, "loss": 0.0991, "step": 123830 }, { "epoch": 2.8875054647197693, "grad_norm": 2.405709981918335, "learning_rate": 3.7904930672138284e-08, "loss": 0.0967, "step": 123840 }, { "epoch": 2.887738626096587, "grad_norm": 1.8750988245010376, "learning_rate": 3.782720885406951e-08, "loss": 0.0985, "step": 123850 }, { "epoch": 2.887971787473405, "grad_norm": 1.3700193166732788, "learning_rate": 3.774948703600075e-08, "loss": 0.1095, "step": 123860 }, { "epoch": 2.888204948850223, "grad_norm": 1.5640718936920166, "learning_rate": 3.767176521793198e-08, "loss": 0.0951, "step": 123870 }, { "epoch": 2.8884381102270407, "grad_norm": 1.6443791389465332, "learning_rate": 3.7594043399863214e-08, "loss": 0.1157, "step": 123880 }, { "epoch": 2.888671271603859, "grad_norm": 1.285353660583496, "learning_rate": 3.751632158179444e-08, "loss": 0.0991, "step": 123890 }, { "epoch": 2.888904432980677, "grad_norm": 1.855349063873291, "learning_rate": 3.743859976372567e-08, "loss": 0.1075, "step": 123900 }, { "epoch": 2.8891375943574946, "grad_norm": 1.607736587524414, "learning_rate": 3.736087794565691e-08, "loss": 0.0966, "step": 123910 }, { "epoch": 2.8893707557343125, "grad_norm": 1.2382895946502686, "learning_rate": 3.7283156127588137e-08, "loss": 0.1029, "step": 123920 }, { "epoch": 2.8896039171111303, "grad_norm": 1.2492389678955078, "learning_rate": 3.720543430951937e-08, "loss": 0.1018, "step": 123930 }, { "epoch": 2.8898370784879486, "grad_norm": 1.9126040935516357, "learning_rate": 3.71277124914506e-08, "loss": 0.0978, "step": 123940 }, { "epoch": 2.8900702398647664, "grad_norm": 1.264665961265564, "learning_rate": 3.704999067338183e-08, "loss": 0.1002, "step": 123950 }, { "epoch": 2.8903034012415842, "grad_norm": 2.392247200012207, "learning_rate": 
3.6972268855313066e-08, "loss": 0.1103, "step": 123960 }, { "epoch": 2.8905365626184025, "grad_norm": 1.113844871520996, "learning_rate": 3.6894547037244295e-08, "loss": 0.1044, "step": 123970 }, { "epoch": 2.89076972399522, "grad_norm": 1.7177753448486328, "learning_rate": 3.681682521917553e-08, "loss": 0.0984, "step": 123980 }, { "epoch": 2.891002885372038, "grad_norm": 1.7130519151687622, "learning_rate": 3.673910340110676e-08, "loss": 0.0864, "step": 123990 }, { "epoch": 2.891236046748856, "grad_norm": 1.813890814781189, "learning_rate": 3.666138158303799e-08, "loss": 0.107, "step": 124000 }, { "epoch": 2.891469208125674, "grad_norm": 1.548123836517334, "learning_rate": 3.6583659764969225e-08, "loss": 0.1012, "step": 124010 }, { "epoch": 2.891702369502492, "grad_norm": 1.5520744323730469, "learning_rate": 3.6505937946900454e-08, "loss": 0.0987, "step": 124020 }, { "epoch": 2.89193553087931, "grad_norm": 3.606332302093506, "learning_rate": 3.642821612883169e-08, "loss": 0.1079, "step": 124030 }, { "epoch": 2.892168692256128, "grad_norm": 1.271317958831787, "learning_rate": 3.635049431076292e-08, "loss": 0.1131, "step": 124040 }, { "epoch": 2.8924018536329457, "grad_norm": 1.3889665603637695, "learning_rate": 3.627277249269415e-08, "loss": 0.1033, "step": 124050 }, { "epoch": 2.8926350150097635, "grad_norm": 1.251255989074707, "learning_rate": 3.619505067462538e-08, "loss": 0.0921, "step": 124060 }, { "epoch": 2.892868176386582, "grad_norm": 0.9936211109161377, "learning_rate": 3.611732885655661e-08, "loss": 0.1083, "step": 124070 }, { "epoch": 2.8931013377633996, "grad_norm": 1.5284206867218018, "learning_rate": 3.603960703848784e-08, "loss": 0.1147, "step": 124080 }, { "epoch": 2.8933344991402175, "grad_norm": 2.139058828353882, "learning_rate": 3.596188522041908e-08, "loss": 0.1027, "step": 124090 }, { "epoch": 2.8935676605170353, "grad_norm": 1.5880786180496216, "learning_rate": 3.5884163402350306e-08, "loss": 0.0988, "step": 124100 }, { "epoch": 
2.893800821893853, "grad_norm": 2.6096959114074707, "learning_rate": 3.580644158428154e-08, "loss": 0.0998, "step": 124110 }, { "epoch": 2.8940339832706714, "grad_norm": 3.2444405555725098, "learning_rate": 3.572871976621277e-08, "loss": 0.1119, "step": 124120 }, { "epoch": 2.8942671446474892, "grad_norm": 1.2740700244903564, "learning_rate": 3.5650997948144e-08, "loss": 0.1162, "step": 124130 }, { "epoch": 2.894500306024307, "grad_norm": 2.335531234741211, "learning_rate": 3.5573276130075236e-08, "loss": 0.1008, "step": 124140 }, { "epoch": 2.894733467401125, "grad_norm": 1.7410415410995483, "learning_rate": 3.5495554312006465e-08, "loss": 0.101, "step": 124150 }, { "epoch": 2.8949666287779428, "grad_norm": 2.2577908039093018, "learning_rate": 3.54178324939377e-08, "loss": 0.1008, "step": 124160 }, { "epoch": 2.895199790154761, "grad_norm": 2.351384401321411, "learning_rate": 3.534011067586893e-08, "loss": 0.0999, "step": 124170 }, { "epoch": 2.895432951531579, "grad_norm": 2.7772326469421387, "learning_rate": 3.526238885780016e-08, "loss": 0.1038, "step": 124180 }, { "epoch": 2.8956661129083967, "grad_norm": 1.1138455867767334, "learning_rate": 3.5184667039731394e-08, "loss": 0.1008, "step": 124190 }, { "epoch": 2.8958992742852145, "grad_norm": 1.591818928718567, "learning_rate": 3.510694522166262e-08, "loss": 0.1042, "step": 124200 }, { "epoch": 2.8961324356620324, "grad_norm": 2.5993947982788086, "learning_rate": 3.502922340359386e-08, "loss": 0.0948, "step": 124210 }, { "epoch": 2.8963655970388507, "grad_norm": 2.8074071407318115, "learning_rate": 3.495150158552509e-08, "loss": 0.0947, "step": 124220 }, { "epoch": 2.8965987584156685, "grad_norm": 1.1204643249511719, "learning_rate": 3.487377976745632e-08, "loss": 0.0983, "step": 124230 }, { "epoch": 2.8968319197924863, "grad_norm": 3.355820655822754, "learning_rate": 3.479605794938755e-08, "loss": 0.107, "step": 124240 }, { "epoch": 2.897065081169304, "grad_norm": 1.3020422458648682, "learning_rate": 
3.471833613131878e-08, "loss": 0.0947, "step": 124250 }, { "epoch": 2.897298242546122, "grad_norm": 1.1823807954788208, "learning_rate": 3.464061431325002e-08, "loss": 0.0969, "step": 124260 }, { "epoch": 2.8975314039229403, "grad_norm": 1.115515947341919, "learning_rate": 3.4562892495181246e-08, "loss": 0.1032, "step": 124270 }, { "epoch": 2.897764565299758, "grad_norm": 1.9772356748580933, "learning_rate": 3.4485170677112476e-08, "loss": 0.1034, "step": 124280 }, { "epoch": 2.897997726676576, "grad_norm": 2.634120225906372, "learning_rate": 3.440744885904371e-08, "loss": 0.1059, "step": 124290 }, { "epoch": 2.898230888053394, "grad_norm": 2.337252378463745, "learning_rate": 3.432972704097494e-08, "loss": 0.0997, "step": 124300 }, { "epoch": 2.8984640494302116, "grad_norm": 3.2439956665039062, "learning_rate": 3.4252005222906176e-08, "loss": 0.0988, "step": 124310 }, { "epoch": 2.89869721080703, "grad_norm": 1.1944352388381958, "learning_rate": 3.4174283404837405e-08, "loss": 0.0919, "step": 124320 }, { "epoch": 2.8989303721838477, "grad_norm": 1.3557660579681396, "learning_rate": 3.4096561586768634e-08, "loss": 0.0957, "step": 124330 }, { "epoch": 2.8991635335606656, "grad_norm": 2.9022369384765625, "learning_rate": 3.401883976869987e-08, "loss": 0.1105, "step": 124340 }, { "epoch": 2.899396694937484, "grad_norm": 2.872842311859131, "learning_rate": 3.39411179506311e-08, "loss": 0.1025, "step": 124350 }, { "epoch": 2.8996298563143017, "grad_norm": 1.4871504306793213, "learning_rate": 3.3863396132562334e-08, "loss": 0.1, "step": 124360 }, { "epoch": 2.8998630176911195, "grad_norm": 1.5259394645690918, "learning_rate": 3.3785674314493564e-08, "loss": 0.1007, "step": 124370 }, { "epoch": 2.9000961790679374, "grad_norm": 1.9721417427062988, "learning_rate": 3.370795249642479e-08, "loss": 0.1073, "step": 124380 }, { "epoch": 2.900329340444755, "grad_norm": 1.2700817584991455, "learning_rate": 3.363023067835603e-08, "loss": 0.1106, "step": 124390 }, { "epoch": 
2.9005625018215735, "grad_norm": 1.4216854572296143, "learning_rate": 3.355250886028726e-08, "loss": 0.1018, "step": 124400 }, { "epoch": 2.9007956631983913, "grad_norm": 1.1507372856140137, "learning_rate": 3.347478704221849e-08, "loss": 0.0956, "step": 124410 }, { "epoch": 2.901028824575209, "grad_norm": 2.2832882404327393, "learning_rate": 3.339706522414972e-08, "loss": 0.1143, "step": 124420 }, { "epoch": 2.901261985952027, "grad_norm": 1.2842904329299927, "learning_rate": 3.331934340608095e-08, "loss": 0.0953, "step": 124430 }, { "epoch": 2.901495147328845, "grad_norm": 3.2918267250061035, "learning_rate": 3.324162158801219e-08, "loss": 0.1145, "step": 124440 }, { "epoch": 2.901728308705663, "grad_norm": 1.2436745166778564, "learning_rate": 3.3163899769943416e-08, "loss": 0.0971, "step": 124450 }, { "epoch": 2.901961470082481, "grad_norm": 1.0368030071258545, "learning_rate": 3.308617795187465e-08, "loss": 0.101, "step": 124460 }, { "epoch": 2.902194631459299, "grad_norm": 1.2010433673858643, "learning_rate": 3.300845613380588e-08, "loss": 0.1015, "step": 124470 }, { "epoch": 2.9024277928361166, "grad_norm": 1.5108062028884888, "learning_rate": 3.293073431573711e-08, "loss": 0.1048, "step": 124480 }, { "epoch": 2.9026609542129345, "grad_norm": 1.5074856281280518, "learning_rate": 3.2853012497668345e-08, "loss": 0.103, "step": 124490 }, { "epoch": 2.9028941155897527, "grad_norm": 1.946771264076233, "learning_rate": 3.2775290679599574e-08, "loss": 0.1101, "step": 124500 }, { "epoch": 2.9031272769665706, "grad_norm": 1.6465288400650024, "learning_rate": 3.2697568861530804e-08, "loss": 0.1097, "step": 124510 }, { "epoch": 2.9033604383433884, "grad_norm": 1.6686983108520508, "learning_rate": 3.261984704346204e-08, "loss": 0.1081, "step": 124520 }, { "epoch": 2.9035935997202063, "grad_norm": 1.3466078042984009, "learning_rate": 3.254212522539327e-08, "loss": 0.0914, "step": 124530 }, { "epoch": 2.903826761097024, "grad_norm": 2.137216567993164, "learning_rate": 
3.2464403407324504e-08, "loss": 0.1021, "step": 124540 }, { "epoch": 2.9040599224738424, "grad_norm": 2.2772774696350098, "learning_rate": 3.238668158925573e-08, "loss": 0.103, "step": 124550 }, { "epoch": 2.90429308385066, "grad_norm": 1.5604225397109985, "learning_rate": 3.230895977118696e-08, "loss": 0.1077, "step": 124560 }, { "epoch": 2.904526245227478, "grad_norm": 2.311476469039917, "learning_rate": 3.22312379531182e-08, "loss": 0.1098, "step": 124570 }, { "epoch": 2.904759406604296, "grad_norm": 1.8235881328582764, "learning_rate": 3.215351613504943e-08, "loss": 0.1087, "step": 124580 }, { "epoch": 2.9049925679811137, "grad_norm": 1.5255746841430664, "learning_rate": 3.207579431698066e-08, "loss": 0.1058, "step": 124590 }, { "epoch": 2.905225729357932, "grad_norm": 2.7392704486846924, "learning_rate": 3.199807249891189e-08, "loss": 0.1081, "step": 124600 }, { "epoch": 2.90545889073475, "grad_norm": 1.1252241134643555, "learning_rate": 3.192035068084312e-08, "loss": 0.1062, "step": 124610 }, { "epoch": 2.9056920521115677, "grad_norm": 1.5976170301437378, "learning_rate": 3.1842628862774356e-08, "loss": 0.1037, "step": 124620 }, { "epoch": 2.9059252134883855, "grad_norm": 1.6218247413635254, "learning_rate": 3.1764907044705585e-08, "loss": 0.1047, "step": 124630 }, { "epoch": 2.9061583748652033, "grad_norm": 2.2713029384613037, "learning_rate": 3.168718522663682e-08, "loss": 0.1096, "step": 124640 }, { "epoch": 2.9063915362420216, "grad_norm": 1.2877998352050781, "learning_rate": 3.160946340856805e-08, "loss": 0.1012, "step": 124650 }, { "epoch": 2.9066246976188395, "grad_norm": 1.4424362182617188, "learning_rate": 3.153174159049928e-08, "loss": 0.11, "step": 124660 }, { "epoch": 2.9068578589956573, "grad_norm": 1.3555806875228882, "learning_rate": 3.1454019772430515e-08, "loss": 0.1061, "step": 124670 }, { "epoch": 2.9070910203724756, "grad_norm": 1.409741997718811, "learning_rate": 3.1376297954361744e-08, "loss": 0.0995, "step": 124680 }, { "epoch": 
2.907324181749293, "grad_norm": 1.0598230361938477, "learning_rate": 3.129857613629298e-08, "loss": 0.1127, "step": 124690 }, { "epoch": 2.9075573431261112, "grad_norm": 1.142677903175354, "learning_rate": 3.1220854318224215e-08, "loss": 0.0986, "step": 124700 }, { "epoch": 2.907790504502929, "grad_norm": 2.472196340560913, "learning_rate": 3.1143132500155444e-08, "loss": 0.0955, "step": 124710 }, { "epoch": 2.908023665879747, "grad_norm": 1.6072239875793457, "learning_rate": 3.1065410682086673e-08, "loss": 0.1125, "step": 124720 }, { "epoch": 2.908256827256565, "grad_norm": 2.757997989654541, "learning_rate": 3.098768886401791e-08, "loss": 0.1062, "step": 124730 }, { "epoch": 2.908489988633383, "grad_norm": 1.4306325912475586, "learning_rate": 3.090996704594914e-08, "loss": 0.0921, "step": 124740 }, { "epoch": 2.908723150010201, "grad_norm": 1.3904786109924316, "learning_rate": 3.0832245227880374e-08, "loss": 0.0948, "step": 124750 }, { "epoch": 2.9089563113870187, "grad_norm": 1.808625340461731, "learning_rate": 3.07545234098116e-08, "loss": 0.1014, "step": 124760 }, { "epoch": 2.9091894727638365, "grad_norm": 2.146371364593506, "learning_rate": 3.067680159174283e-08, "loss": 0.1091, "step": 124770 }, { "epoch": 2.909422634140655, "grad_norm": 1.3662586212158203, "learning_rate": 3.059907977367407e-08, "loss": 0.0963, "step": 124780 }, { "epoch": 2.9096557955174727, "grad_norm": 1.3874707221984863, "learning_rate": 3.05213579556053e-08, "loss": 0.0973, "step": 124790 }, { "epoch": 2.9098889568942905, "grad_norm": 1.5195035934448242, "learning_rate": 3.0443636137536526e-08, "loss": 0.1062, "step": 124800 }, { "epoch": 2.9101221182711083, "grad_norm": 1.7115048170089722, "learning_rate": 3.036591431946776e-08, "loss": 0.0963, "step": 124810 }, { "epoch": 2.910355279647926, "grad_norm": 1.4344371557235718, "learning_rate": 3.028819250139899e-08, "loss": 0.105, "step": 124820 }, { "epoch": 2.9105884410247445, "grad_norm": 2.3224053382873535, "learning_rate": 
3.0210470683330226e-08, "loss": 0.1, "step": 124830 }, { "epoch": 2.9108216024015623, "grad_norm": 2.3629910945892334, "learning_rate": 3.0132748865261455e-08, "loss": 0.0987, "step": 124840 }, { "epoch": 2.91105476377838, "grad_norm": 1.0563589334487915, "learning_rate": 3.0055027047192684e-08, "loss": 0.104, "step": 124850 }, { "epoch": 2.911287925155198, "grad_norm": 1.4537508487701416, "learning_rate": 2.997730522912392e-08, "loss": 0.0901, "step": 124860 }, { "epoch": 2.911521086532016, "grad_norm": 1.129631519317627, "learning_rate": 2.989958341105515e-08, "loss": 0.1023, "step": 124870 }, { "epoch": 2.911754247908834, "grad_norm": 1.0488710403442383, "learning_rate": 2.9821861592986385e-08, "loss": 0.1056, "step": 124880 }, { "epoch": 2.911987409285652, "grad_norm": 1.5392136573791504, "learning_rate": 2.9744139774917614e-08, "loss": 0.0988, "step": 124890 }, { "epoch": 2.9122205706624698, "grad_norm": 1.1731477975845337, "learning_rate": 2.9666417956848846e-08, "loss": 0.1026, "step": 124900 }, { "epoch": 2.9124537320392876, "grad_norm": 1.5306092500686646, "learning_rate": 2.958869613878008e-08, "loss": 0.101, "step": 124910 }, { "epoch": 2.9126868934161054, "grad_norm": 3.535642385482788, "learning_rate": 2.9510974320711308e-08, "loss": 0.1049, "step": 124920 }, { "epoch": 2.9129200547929237, "grad_norm": 1.4825674295425415, "learning_rate": 2.943325250264254e-08, "loss": 0.1035, "step": 124930 }, { "epoch": 2.9131532161697415, "grad_norm": 1.3091813325881958, "learning_rate": 2.9355530684573772e-08, "loss": 0.1024, "step": 124940 }, { "epoch": 2.9133863775465594, "grad_norm": 1.360929012298584, "learning_rate": 2.9277808866505005e-08, "loss": 0.1018, "step": 124950 }, { "epoch": 2.913619538923377, "grad_norm": 2.267706871032715, "learning_rate": 2.9200087048436237e-08, "loss": 0.112, "step": 124960 }, { "epoch": 2.913852700300195, "grad_norm": 3.268601894378662, "learning_rate": 2.9122365230367466e-08, "loss": 0.1069, "step": 124970 }, { "epoch": 
2.9140858616770133, "grad_norm": 1.2276486158370972, "learning_rate": 2.90446434122987e-08, "loss": 0.1014, "step": 124980 }, { "epoch": 2.914319023053831, "grad_norm": 2.0976827144622803, "learning_rate": 2.896692159422993e-08, "loss": 0.1012, "step": 124990 }, { "epoch": 2.914552184430649, "grad_norm": 1.4532344341278076, "learning_rate": 2.8889199776161163e-08, "loss": 0.0993, "step": 125000 }, { "epoch": 2.914785345807467, "grad_norm": 3.1519615650177, "learning_rate": 2.8811477958092396e-08, "loss": 0.104, "step": 125010 }, { "epoch": 2.9150185071842847, "grad_norm": 2.155406951904297, "learning_rate": 2.8733756140023625e-08, "loss": 0.1033, "step": 125020 }, { "epoch": 2.915251668561103, "grad_norm": 1.8022369146347046, "learning_rate": 2.8656034321954857e-08, "loss": 0.0953, "step": 125030 }, { "epoch": 2.915484829937921, "grad_norm": 1.915228009223938, "learning_rate": 2.857831250388609e-08, "loss": 0.0994, "step": 125040 }, { "epoch": 2.9157179913147386, "grad_norm": 2.175442695617676, "learning_rate": 2.8500590685817322e-08, "loss": 0.115, "step": 125050 }, { "epoch": 2.915951152691557, "grad_norm": 2.1008660793304443, "learning_rate": 2.8422868867748554e-08, "loss": 0.102, "step": 125060 }, { "epoch": 2.9161843140683743, "grad_norm": 1.5589423179626465, "learning_rate": 2.8345147049679783e-08, "loss": 0.114, "step": 125070 }, { "epoch": 2.9164174754451926, "grad_norm": 1.6456809043884277, "learning_rate": 2.8267425231611016e-08, "loss": 0.0951, "step": 125080 }, { "epoch": 2.9166506368220104, "grad_norm": 2.1833741664886475, "learning_rate": 2.8189703413542248e-08, "loss": 0.1107, "step": 125090 }, { "epoch": 2.9168837981988283, "grad_norm": 1.6320663690567017, "learning_rate": 2.811198159547348e-08, "loss": 0.1005, "step": 125100 }, { "epoch": 2.9171169595756465, "grad_norm": 1.6666067838668823, "learning_rate": 2.8034259777404713e-08, "loss": 0.1132, "step": 125110 }, { "epoch": 2.9173501209524644, "grad_norm": 1.4430965185165405, "learning_rate": 
2.7956537959335942e-08, "loss": 0.114, "step": 125120 }, { "epoch": 2.917583282329282, "grad_norm": 1.6829057931900024, "learning_rate": 2.7878816141267174e-08, "loss": 0.1085, "step": 125130 }, { "epoch": 2.9178164437061, "grad_norm": 2.2029688358306885, "learning_rate": 2.7801094323198407e-08, "loss": 0.1012, "step": 125140 }, { "epoch": 2.918049605082918, "grad_norm": 2.0290563106536865, "learning_rate": 2.772337250512964e-08, "loss": 0.1016, "step": 125150 }, { "epoch": 2.918282766459736, "grad_norm": 2.7487387657165527, "learning_rate": 2.7645650687060868e-08, "loss": 0.1115, "step": 125160 }, { "epoch": 2.918515927836554, "grad_norm": 1.11510169506073, "learning_rate": 2.75679288689921e-08, "loss": 0.1038, "step": 125170 }, { "epoch": 2.918749089213372, "grad_norm": 2.3461904525756836, "learning_rate": 2.7490207050923333e-08, "loss": 0.1075, "step": 125180 }, { "epoch": 2.9189822505901897, "grad_norm": 1.2204951047897339, "learning_rate": 2.7412485232854565e-08, "loss": 0.0986, "step": 125190 }, { "epoch": 2.9192154119670075, "grad_norm": 2.2502055168151855, "learning_rate": 2.7342535596592674e-08, "loss": 0.0947, "step": 125200 }, { "epoch": 2.919448573343826, "grad_norm": 1.5789278745651245, "learning_rate": 2.7264813778523906e-08, "loss": 0.1108, "step": 125210 }, { "epoch": 2.9196817347206436, "grad_norm": 1.3869699239730835, "learning_rate": 2.718709196045514e-08, "loss": 0.1025, "step": 125220 }, { "epoch": 2.9199148960974615, "grad_norm": 1.4494502544403076, "learning_rate": 2.710937014238637e-08, "loss": 0.107, "step": 125230 }, { "epoch": 2.9201480574742793, "grad_norm": 1.6351385116577148, "learning_rate": 2.70316483243176e-08, "loss": 0.1003, "step": 125240 }, { "epoch": 2.920381218851097, "grad_norm": 1.5857563018798828, "learning_rate": 2.6953926506248832e-08, "loss": 0.1052, "step": 125250 }, { "epoch": 2.9206143802279154, "grad_norm": 1.1462796926498413, "learning_rate": 2.6876204688180065e-08, "loss": 0.1239, "step": 125260 }, { "epoch": 
2.9208475416047333, "grad_norm": 1.5695432424545288, "learning_rate": 2.6798482870111297e-08, "loss": 0.0996, "step": 125270 }, { "epoch": 2.921080702981551, "grad_norm": 1.5691697597503662, "learning_rate": 2.672076105204253e-08, "loss": 0.1081, "step": 125280 }, { "epoch": 2.921313864358369, "grad_norm": 3.5464770793914795, "learning_rate": 2.664303923397376e-08, "loss": 0.1147, "step": 125290 }, { "epoch": 2.9215470257351868, "grad_norm": 1.8025546073913574, "learning_rate": 2.656531741590499e-08, "loss": 0.0974, "step": 125300 }, { "epoch": 2.921780187112005, "grad_norm": 2.1875994205474854, "learning_rate": 2.6487595597836223e-08, "loss": 0.1028, "step": 125310 }, { "epoch": 2.922013348488823, "grad_norm": 1.494335651397705, "learning_rate": 2.6409873779767456e-08, "loss": 0.0951, "step": 125320 }, { "epoch": 2.9222465098656407, "grad_norm": 1.7970762252807617, "learning_rate": 2.6332151961698685e-08, "loss": 0.1079, "step": 125330 }, { "epoch": 2.9224796712424586, "grad_norm": 1.4607737064361572, "learning_rate": 2.6254430143629917e-08, "loss": 0.1196, "step": 125340 }, { "epoch": 2.9227128326192764, "grad_norm": 1.2725316286087036, "learning_rate": 2.617670832556115e-08, "loss": 0.1023, "step": 125350 }, { "epoch": 2.9229459939960947, "grad_norm": 1.8287779092788696, "learning_rate": 2.6098986507492382e-08, "loss": 0.0963, "step": 125360 }, { "epoch": 2.9231791553729125, "grad_norm": 2.026930332183838, "learning_rate": 2.6021264689423614e-08, "loss": 0.0914, "step": 125370 }, { "epoch": 2.9234123167497303, "grad_norm": 1.4762204885482788, "learning_rate": 2.5943542871354843e-08, "loss": 0.1077, "step": 125380 }, { "epoch": 2.923645478126548, "grad_norm": 1.2071459293365479, "learning_rate": 2.5865821053286076e-08, "loss": 0.0884, "step": 125390 }, { "epoch": 2.923878639503366, "grad_norm": 1.2483302354812622, "learning_rate": 2.5788099235217308e-08, "loss": 0.0882, "step": 125400 }, { "epoch": 2.9241118008801843, "grad_norm": 2.9735043048858643, 
"learning_rate": 2.571037741714854e-08, "loss": 0.1028, "step": 125410 }, { "epoch": 2.924344962257002, "grad_norm": 1.5421016216278076, "learning_rate": 2.5632655599079773e-08, "loss": 0.0834, "step": 125420 }, { "epoch": 2.92457812363382, "grad_norm": 1.3365212678909302, "learning_rate": 2.5554933781011002e-08, "loss": 0.0985, "step": 125430 }, { "epoch": 2.9248112850106383, "grad_norm": 1.1306838989257812, "learning_rate": 2.5477211962942234e-08, "loss": 0.0916, "step": 125440 }, { "epoch": 2.9250444463874556, "grad_norm": 1.3356090784072876, "learning_rate": 2.5399490144873467e-08, "loss": 0.1077, "step": 125450 }, { "epoch": 2.925277607764274, "grad_norm": 1.112394094467163, "learning_rate": 2.53217683268047e-08, "loss": 0.1009, "step": 125460 }, { "epoch": 2.9255107691410918, "grad_norm": 2.4246573448181152, "learning_rate": 2.524404650873593e-08, "loss": 0.1051, "step": 125470 }, { "epoch": 2.9257439305179096, "grad_norm": 1.557816982269287, "learning_rate": 2.516632469066716e-08, "loss": 0.0949, "step": 125480 }, { "epoch": 2.925977091894728, "grad_norm": 1.6256484985351562, "learning_rate": 2.5088602872598393e-08, "loss": 0.0979, "step": 125490 }, { "epoch": 2.9262102532715457, "grad_norm": 1.3895243406295776, "learning_rate": 2.5010881054529625e-08, "loss": 0.0994, "step": 125500 }, { "epoch": 2.9264434146483636, "grad_norm": 1.399569034576416, "learning_rate": 2.4933159236460858e-08, "loss": 0.0953, "step": 125510 }, { "epoch": 2.9266765760251814, "grad_norm": 1.1128246784210205, "learning_rate": 2.4855437418392087e-08, "loss": 0.1076, "step": 125520 }, { "epoch": 2.9269097374019992, "grad_norm": 2.4053828716278076, "learning_rate": 2.477771560032332e-08, "loss": 0.1005, "step": 125530 }, { "epoch": 2.9271428987788175, "grad_norm": 1.3891091346740723, "learning_rate": 2.4699993782254555e-08, "loss": 0.1028, "step": 125540 }, { "epoch": 2.9273760601556353, "grad_norm": 1.846616268157959, "learning_rate": 2.4622271964185787e-08, "loss": 0.1053, "step": 
125550 }, { "epoch": 2.927609221532453, "grad_norm": 2.7337145805358887, "learning_rate": 2.454455014611702e-08, "loss": 0.1008, "step": 125560 }, { "epoch": 2.927842382909271, "grad_norm": 1.6382884979248047, "learning_rate": 2.446682832804825e-08, "loss": 0.0959, "step": 125570 }, { "epoch": 2.928075544286089, "grad_norm": 2.128314971923828, "learning_rate": 2.438910650997948e-08, "loss": 0.1013, "step": 125580 }, { "epoch": 2.928308705662907, "grad_norm": 1.6951303482055664, "learning_rate": 2.4311384691910713e-08, "loss": 0.0955, "step": 125590 }, { "epoch": 2.928541867039725, "grad_norm": 2.088672399520874, "learning_rate": 2.4233662873841946e-08, "loss": 0.107, "step": 125600 }, { "epoch": 2.928775028416543, "grad_norm": 1.3253971338272095, "learning_rate": 2.4155941055773178e-08, "loss": 0.1078, "step": 125610 }, { "epoch": 2.9290081897933606, "grad_norm": 1.3806984424591064, "learning_rate": 2.4078219237704407e-08, "loss": 0.1031, "step": 125620 }, { "epoch": 2.9292413511701785, "grad_norm": 1.2379330396652222, "learning_rate": 2.400049741963564e-08, "loss": 0.1011, "step": 125630 }, { "epoch": 2.9294745125469968, "grad_norm": 1.4570382833480835, "learning_rate": 2.3922775601566872e-08, "loss": 0.11, "step": 125640 }, { "epoch": 2.9297076739238146, "grad_norm": 1.6093212366104126, "learning_rate": 2.3845053783498104e-08, "loss": 0.0996, "step": 125650 }, { "epoch": 2.9299408353006324, "grad_norm": 1.1569257974624634, "learning_rate": 2.3767331965429337e-08, "loss": 0.103, "step": 125660 }, { "epoch": 2.9301739966774503, "grad_norm": 2.554396629333496, "learning_rate": 2.3689610147360566e-08, "loss": 0.0983, "step": 125670 }, { "epoch": 2.930407158054268, "grad_norm": 1.3757174015045166, "learning_rate": 2.3611888329291798e-08, "loss": 0.1083, "step": 125680 }, { "epoch": 2.9306403194310864, "grad_norm": 1.5350632667541504, "learning_rate": 2.353416651122303e-08, "loss": 0.1008, "step": 125690 }, { "epoch": 2.930873480807904, "grad_norm": 1.418397068977356, 
"learning_rate": 2.3456444693154263e-08, "loss": 0.1059, "step": 125700 }, { "epoch": 2.931106642184722, "grad_norm": 3.569305658340454, "learning_rate": 2.3378722875085495e-08, "loss": 0.1029, "step": 125710 }, { "epoch": 2.93133980356154, "grad_norm": 1.6298449039459229, "learning_rate": 2.3301001057016724e-08, "loss": 0.1045, "step": 125720 }, { "epoch": 2.9315729649383577, "grad_norm": 1.2536686658859253, "learning_rate": 2.3223279238947957e-08, "loss": 0.104, "step": 125730 }, { "epoch": 2.931806126315176, "grad_norm": 1.5640809535980225, "learning_rate": 2.314555742087919e-08, "loss": 0.1097, "step": 125740 }, { "epoch": 2.932039287691994, "grad_norm": 1.481852412223816, "learning_rate": 2.306783560281042e-08, "loss": 0.1002, "step": 125750 }, { "epoch": 2.9322724490688117, "grad_norm": 1.9329149723052979, "learning_rate": 2.2990113784741654e-08, "loss": 0.0902, "step": 125760 }, { "epoch": 2.9325056104456295, "grad_norm": 1.3933554887771606, "learning_rate": 2.2912391966672883e-08, "loss": 0.1013, "step": 125770 }, { "epoch": 2.9327387718224474, "grad_norm": 1.5926111936569214, "learning_rate": 2.2834670148604115e-08, "loss": 0.1121, "step": 125780 }, { "epoch": 2.9329719331992656, "grad_norm": 1.09228515625, "learning_rate": 2.2756948330535347e-08, "loss": 0.1033, "step": 125790 }, { "epoch": 2.9332050945760835, "grad_norm": 1.2219338417053223, "learning_rate": 2.267922651246658e-08, "loss": 0.1047, "step": 125800 }, { "epoch": 2.9334382559529013, "grad_norm": 3.0861170291900635, "learning_rate": 2.260150469439781e-08, "loss": 0.1105, "step": 125810 }, { "epoch": 2.9336714173297196, "grad_norm": 2.28946590423584, "learning_rate": 2.252378287632904e-08, "loss": 0.0976, "step": 125820 }, { "epoch": 2.9339045787065374, "grad_norm": 1.4839807748794556, "learning_rate": 2.2446061058260274e-08, "loss": 0.1032, "step": 125830 }, { "epoch": 2.9341377400833553, "grad_norm": 1.795888066291809, "learning_rate": 2.2368339240191506e-08, "loss": 0.0919, "step": 125840 }, 
{ "epoch": 2.934370901460173, "grad_norm": 1.1137334108352661, "learning_rate": 2.229061742212274e-08, "loss": 0.1029, "step": 125850 }, { "epoch": 2.934604062836991, "grad_norm": 1.109679102897644, "learning_rate": 2.2212895604053967e-08, "loss": 0.1061, "step": 125860 }, { "epoch": 2.934837224213809, "grad_norm": 1.159195065498352, "learning_rate": 2.21351737859852e-08, "loss": 0.1016, "step": 125870 }, { "epoch": 2.935070385590627, "grad_norm": 1.2974897623062134, "learning_rate": 2.2057451967916432e-08, "loss": 0.1036, "step": 125880 }, { "epoch": 2.935303546967445, "grad_norm": 1.6905765533447266, "learning_rate": 2.1979730149847665e-08, "loss": 0.1011, "step": 125890 }, { "epoch": 2.9355367083442627, "grad_norm": 1.9941225051879883, "learning_rate": 2.1902008331778897e-08, "loss": 0.1076, "step": 125900 }, { "epoch": 2.9357698697210806, "grad_norm": 1.736901044845581, "learning_rate": 2.1824286513710126e-08, "loss": 0.0996, "step": 125910 }, { "epoch": 2.936003031097899, "grad_norm": 1.1460611820220947, "learning_rate": 2.174656469564136e-08, "loss": 0.1049, "step": 125920 }, { "epoch": 2.9362361924747167, "grad_norm": 1.2334232330322266, "learning_rate": 2.166884287757259e-08, "loss": 0.095, "step": 125930 }, { "epoch": 2.9364693538515345, "grad_norm": 1.2872626781463623, "learning_rate": 2.1591121059503823e-08, "loss": 0.0894, "step": 125940 }, { "epoch": 2.9367025152283524, "grad_norm": 1.3116471767425537, "learning_rate": 2.1513399241435055e-08, "loss": 0.0918, "step": 125950 }, { "epoch": 2.93693567660517, "grad_norm": 1.4694589376449585, "learning_rate": 2.1435677423366285e-08, "loss": 0.1015, "step": 125960 }, { "epoch": 2.9371688379819885, "grad_norm": 1.406577229499817, "learning_rate": 2.1357955605297517e-08, "loss": 0.1044, "step": 125970 }, { "epoch": 2.9374019993588063, "grad_norm": 1.1049396991729736, "learning_rate": 2.128023378722875e-08, "loss": 0.0997, "step": 125980 }, { "epoch": 2.937635160735624, "grad_norm": 1.3783107995986938, 
"learning_rate": 2.1202511969159982e-08, "loss": 0.1004, "step": 125990 }, { "epoch": 2.937868322112442, "grad_norm": 1.4069443941116333, "learning_rate": 2.1124790151091214e-08, "loss": 0.1122, "step": 126000 }, { "epoch": 2.93810148348926, "grad_norm": 1.368615746498108, "learning_rate": 2.1047068333022443e-08, "loss": 0.0876, "step": 126010 }, { "epoch": 2.938334644866078, "grad_norm": 1.7982696294784546, "learning_rate": 2.0969346514953675e-08, "loss": 0.0968, "step": 126020 }, { "epoch": 2.938567806242896, "grad_norm": 3.4939820766448975, "learning_rate": 2.0891624696884908e-08, "loss": 0.1089, "step": 126030 }, { "epoch": 2.9388009676197138, "grad_norm": 1.4458427429199219, "learning_rate": 2.081390287881614e-08, "loss": 0.1021, "step": 126040 }, { "epoch": 2.9390341289965316, "grad_norm": 3.1041388511657715, "learning_rate": 2.073618106074737e-08, "loss": 0.1139, "step": 126050 }, { "epoch": 2.9392672903733494, "grad_norm": 1.6040692329406738, "learning_rate": 2.06584592426786e-08, "loss": 0.0972, "step": 126060 }, { "epoch": 2.9395004517501677, "grad_norm": 1.347285509109497, "learning_rate": 2.0580737424609834e-08, "loss": 0.1068, "step": 126070 }, { "epoch": 2.9397336131269856, "grad_norm": 1.9795697927474976, "learning_rate": 2.0503015606541066e-08, "loss": 0.1141, "step": 126080 }, { "epoch": 2.9399667745038034, "grad_norm": 1.2186055183410645, "learning_rate": 2.04252937884723e-08, "loss": 0.1175, "step": 126090 }, { "epoch": 2.9401999358806212, "grad_norm": 1.2417281866073608, "learning_rate": 2.0347571970403528e-08, "loss": 0.0859, "step": 126100 }, { "epoch": 2.940433097257439, "grad_norm": 1.3134750127792358, "learning_rate": 2.026985015233476e-08, "loss": 0.0915, "step": 126110 }, { "epoch": 2.9406662586342573, "grad_norm": 1.527479648590088, "learning_rate": 2.0192128334265993e-08, "loss": 0.1061, "step": 126120 }, { "epoch": 2.940899420011075, "grad_norm": 1.686608076095581, "learning_rate": 2.0114406516197225e-08, "loss": 0.1027, "step": 126130 
}, { "epoch": 2.941132581387893, "grad_norm": 2.108896493911743, "learning_rate": 2.003668469812846e-08, "loss": 0.0975, "step": 126140 }, { "epoch": 2.9413657427647113, "grad_norm": 1.404343843460083, "learning_rate": 1.995896288005969e-08, "loss": 0.093, "step": 126150 }, { "epoch": 2.9415989041415287, "grad_norm": 1.693622350692749, "learning_rate": 1.9881241061990922e-08, "loss": 0.1145, "step": 126160 }, { "epoch": 2.941832065518347, "grad_norm": 0.9230610132217407, "learning_rate": 1.9803519243922154e-08, "loss": 0.107, "step": 126170 }, { "epoch": 2.942065226895165, "grad_norm": 1.3075809478759766, "learning_rate": 1.9725797425853387e-08, "loss": 0.097, "step": 126180 }, { "epoch": 2.9422983882719826, "grad_norm": 1.3851886987686157, "learning_rate": 1.964807560778462e-08, "loss": 0.1049, "step": 126190 }, { "epoch": 2.942531549648801, "grad_norm": 1.1448543071746826, "learning_rate": 1.9570353789715848e-08, "loss": 0.0904, "step": 126200 }, { "epoch": 2.9427647110256188, "grad_norm": 3.781954288482666, "learning_rate": 1.949263197164708e-08, "loss": 0.1006, "step": 126210 }, { "epoch": 2.9429978724024366, "grad_norm": 1.2662303447723389, "learning_rate": 1.9414910153578313e-08, "loss": 0.0993, "step": 126220 }, { "epoch": 2.9432310337792544, "grad_norm": 1.2812690734863281, "learning_rate": 1.9337188335509545e-08, "loss": 0.1012, "step": 126230 }, { "epoch": 2.9434641951560723, "grad_norm": 1.7302155494689941, "learning_rate": 1.9259466517440778e-08, "loss": 0.0916, "step": 126240 }, { "epoch": 2.9436973565328906, "grad_norm": 1.6008930206298828, "learning_rate": 1.9181744699372007e-08, "loss": 0.0955, "step": 126250 }, { "epoch": 2.9439305179097084, "grad_norm": 1.5484261512756348, "learning_rate": 1.910402288130324e-08, "loss": 0.1124, "step": 126260 }, { "epoch": 2.9441636792865262, "grad_norm": 1.2564934492111206, "learning_rate": 1.902630106323447e-08, "loss": 0.0986, "step": 126270 }, { "epoch": 2.944396840663344, "grad_norm": 1.4486091136932373, 
"learning_rate": 1.8948579245165704e-08, "loss": 0.122, "step": 126280 }, { "epoch": 2.944630002040162, "grad_norm": 1.4806749820709229, "learning_rate": 1.8870857427096936e-08, "loss": 0.0958, "step": 126290 }, { "epoch": 2.94486316341698, "grad_norm": 1.3083267211914062, "learning_rate": 1.8793135609028165e-08, "loss": 0.1007, "step": 126300 }, { "epoch": 2.945096324793798, "grad_norm": 1.416715383529663, "learning_rate": 1.8715413790959398e-08, "loss": 0.1077, "step": 126310 }, { "epoch": 2.945329486170616, "grad_norm": 2.840024471282959, "learning_rate": 1.863769197289063e-08, "loss": 0.0996, "step": 126320 }, { "epoch": 2.9455626475474337, "grad_norm": 2.6467010974884033, "learning_rate": 1.8559970154821862e-08, "loss": 0.1133, "step": 126330 }, { "epoch": 2.9457958089242515, "grad_norm": 2.301002264022827, "learning_rate": 1.848224833675309e-08, "loss": 0.0967, "step": 126340 }, { "epoch": 2.94602897030107, "grad_norm": 1.4585098028182983, "learning_rate": 1.8404526518684324e-08, "loss": 0.1005, "step": 126350 }, { "epoch": 2.9462621316778876, "grad_norm": 1.2105536460876465, "learning_rate": 1.8326804700615556e-08, "loss": 0.1001, "step": 126360 }, { "epoch": 2.9464952930547055, "grad_norm": 1.2991355657577515, "learning_rate": 1.824908288254679e-08, "loss": 0.1033, "step": 126370 }, { "epoch": 2.9467284544315233, "grad_norm": 1.820269227027893, "learning_rate": 1.817136106447802e-08, "loss": 0.0876, "step": 126380 }, { "epoch": 2.946961615808341, "grad_norm": 1.1787686347961426, "learning_rate": 1.809363924640925e-08, "loss": 0.0993, "step": 126390 }, { "epoch": 2.9471947771851594, "grad_norm": 1.6876786947250366, "learning_rate": 1.8015917428340482e-08, "loss": 0.1004, "step": 126400 }, { "epoch": 2.9474279385619773, "grad_norm": 1.7085915803909302, "learning_rate": 1.7938195610271715e-08, "loss": 0.1105, "step": 126410 }, { "epoch": 2.947661099938795, "grad_norm": 1.6033645868301392, "learning_rate": 1.7860473792202947e-08, "loss": 0.0969, "step": 126420 
}, { "epoch": 2.947894261315613, "grad_norm": 1.3929564952850342, "learning_rate": 1.778275197413418e-08, "loss": 0.0962, "step": 126430 }, { "epoch": 2.9481274226924308, "grad_norm": 2.0699920654296875, "learning_rate": 1.770503015606541e-08, "loss": 0.096, "step": 126440 }, { "epoch": 2.948360584069249, "grad_norm": 2.0703811645507812, "learning_rate": 1.762730833799664e-08, "loss": 0.1052, "step": 126450 }, { "epoch": 2.948593745446067, "grad_norm": 1.3000538349151611, "learning_rate": 1.7549586519927873e-08, "loss": 0.1066, "step": 126460 }, { "epoch": 2.9488269068228847, "grad_norm": 2.108633279800415, "learning_rate": 1.7471864701859106e-08, "loss": 0.0969, "step": 126470 }, { "epoch": 2.9490600681997026, "grad_norm": 1.9889583587646484, "learning_rate": 1.7394142883790338e-08, "loss": 0.1018, "step": 126480 }, { "epoch": 2.9492932295765204, "grad_norm": 1.415581226348877, "learning_rate": 1.7316421065721567e-08, "loss": 0.1122, "step": 126490 }, { "epoch": 2.9495263909533387, "grad_norm": 1.2375868558883667, "learning_rate": 1.72386992476528e-08, "loss": 0.0965, "step": 126500 }, { "epoch": 2.9497595523301565, "grad_norm": 1.1853934526443481, "learning_rate": 1.7160977429584032e-08, "loss": 0.1025, "step": 126510 }, { "epoch": 2.9499927137069744, "grad_norm": 1.1537429094314575, "learning_rate": 1.7083255611515264e-08, "loss": 0.0964, "step": 126520 }, { "epoch": 2.9502258750837926, "grad_norm": 1.3132458925247192, "learning_rate": 1.7005533793446493e-08, "loss": 0.1056, "step": 126530 }, { "epoch": 2.95045903646061, "grad_norm": 2.115543842315674, "learning_rate": 1.6927811975377726e-08, "loss": 0.1012, "step": 126540 }, { "epoch": 2.9506921978374283, "grad_norm": 2.487910032272339, "learning_rate": 1.6850090157308958e-08, "loss": 0.1012, "step": 126550 }, { "epoch": 2.950925359214246, "grad_norm": 1.545836091041565, "learning_rate": 1.677236833924019e-08, "loss": 0.1106, "step": 126560 }, { "epoch": 2.951158520591064, "grad_norm": 1.2245484590530396, 
"learning_rate": 1.6694646521171423e-08, "loss": 0.1031, "step": 126570 }, { "epoch": 2.9513916819678823, "grad_norm": 1.3954188823699951, "learning_rate": 1.6616924703102652e-08, "loss": 0.0909, "step": 126580 }, { "epoch": 2.9516248433447, "grad_norm": 1.2513729333877563, "learning_rate": 1.6539202885033884e-08, "loss": 0.1112, "step": 126590 }, { "epoch": 2.951858004721518, "grad_norm": 1.952170491218567, "learning_rate": 1.6461481066965117e-08, "loss": 0.1002, "step": 126600 }, { "epoch": 2.9520911660983358, "grad_norm": 1.428401231765747, "learning_rate": 1.638375924889635e-08, "loss": 0.1017, "step": 126610 }, { "epoch": 2.9523243274751536, "grad_norm": 2.241164207458496, "learning_rate": 1.630603743082758e-08, "loss": 0.1003, "step": 126620 }, { "epoch": 2.952557488851972, "grad_norm": 1.270097017288208, "learning_rate": 1.622831561275881e-08, "loss": 0.0994, "step": 126630 }, { "epoch": 2.9527906502287897, "grad_norm": 1.2851418256759644, "learning_rate": 1.6150593794690043e-08, "loss": 0.1031, "step": 126640 }, { "epoch": 2.9530238116056076, "grad_norm": 3.5239224433898926, "learning_rate": 1.6072871976621275e-08, "loss": 0.0997, "step": 126650 }, { "epoch": 2.9532569729824254, "grad_norm": 1.535083293914795, "learning_rate": 1.5995150158552508e-08, "loss": 0.1089, "step": 126660 }, { "epoch": 2.9534901343592432, "grad_norm": 2.170104742050171, "learning_rate": 1.591742834048374e-08, "loss": 0.1044, "step": 126670 }, { "epoch": 2.9537232957360615, "grad_norm": 1.3722388744354248, "learning_rate": 1.583970652241497e-08, "loss": 0.0904, "step": 126680 }, { "epoch": 2.9539564571128794, "grad_norm": 1.3167245388031006, "learning_rate": 1.57619847043462e-08, "loss": 0.095, "step": 126690 }, { "epoch": 2.954189618489697, "grad_norm": 3.5678436756134033, "learning_rate": 1.5684262886277434e-08, "loss": 0.1033, "step": 126700 }, { "epoch": 2.954422779866515, "grad_norm": 1.1330058574676514, "learning_rate": 1.5606541068208666e-08, "loss": 0.0978, "step": 126710 }, 
{ "epoch": 2.954655941243333, "grad_norm": 3.0161399841308594, "learning_rate": 1.55288192501399e-08, "loss": 0.1127, "step": 126720 }, { "epoch": 2.954889102620151, "grad_norm": 3.149646282196045, "learning_rate": 1.545109743207113e-08, "loss": 0.1088, "step": 126730 }, { "epoch": 2.955122263996969, "grad_norm": 1.3186559677124023, "learning_rate": 1.5373375614002363e-08, "loss": 0.0935, "step": 126740 }, { "epoch": 2.955355425373787, "grad_norm": 2.948709011077881, "learning_rate": 1.5295653795933592e-08, "loss": 0.1027, "step": 126750 }, { "epoch": 2.9555885867506047, "grad_norm": 2.003922462463379, "learning_rate": 1.5217931977864825e-08, "loss": 0.1026, "step": 126760 }, { "epoch": 2.9558217481274225, "grad_norm": 1.3401358127593994, "learning_rate": 1.5140210159796057e-08, "loss": 0.0944, "step": 126770 }, { "epoch": 2.9560549095042408, "grad_norm": 1.3252501487731934, "learning_rate": 1.506248834172729e-08, "loss": 0.104, "step": 126780 }, { "epoch": 2.9562880708810586, "grad_norm": 1.6251689195632935, "learning_rate": 1.4984766523658522e-08, "loss": 0.1065, "step": 126790 }, { "epoch": 2.9565212322578764, "grad_norm": 1.1945000886917114, "learning_rate": 1.490704470558975e-08, "loss": 0.1009, "step": 126800 }, { "epoch": 2.9567543936346943, "grad_norm": 2.626037836074829, "learning_rate": 1.4829322887520983e-08, "loss": 0.1001, "step": 126810 }, { "epoch": 2.956987555011512, "grad_norm": 2.5474705696105957, "learning_rate": 1.4751601069452216e-08, "loss": 0.0972, "step": 126820 }, { "epoch": 2.9572207163883304, "grad_norm": 1.29435133934021, "learning_rate": 1.4673879251383448e-08, "loss": 0.0953, "step": 126830 }, { "epoch": 2.9574538777651482, "grad_norm": 3.085035800933838, "learning_rate": 1.4596157433314679e-08, "loss": 0.1047, "step": 126840 }, { "epoch": 2.957687039141966, "grad_norm": 1.3042627573013306, "learning_rate": 1.4518435615245911e-08, "loss": 0.1042, "step": 126850 }, { "epoch": 2.957920200518784, "grad_norm": 1.9673112630844116, 
"learning_rate": 1.4440713797177142e-08, "loss": 0.1076, "step": 126860 }, { "epoch": 2.9581533618956017, "grad_norm": 1.6602027416229248, "learning_rate": 1.4362991979108374e-08, "loss": 0.1024, "step": 126870 }, { "epoch": 2.95838652327242, "grad_norm": 1.2613221406936646, "learning_rate": 1.4285270161039605e-08, "loss": 0.0985, "step": 126880 }, { "epoch": 2.958619684649238, "grad_norm": 2.250993251800537, "learning_rate": 1.4207548342970839e-08, "loss": 0.1157, "step": 126890 }, { "epoch": 2.9588528460260557, "grad_norm": 1.6020227670669556, "learning_rate": 1.4129826524902071e-08, "loss": 0.1053, "step": 126900 }, { "epoch": 2.959086007402874, "grad_norm": 1.0154632329940796, "learning_rate": 1.4052104706833302e-08, "loss": 0.1005, "step": 126910 }, { "epoch": 2.9593191687796914, "grad_norm": 2.4036386013031006, "learning_rate": 1.3974382888764534e-08, "loss": 0.1077, "step": 126920 }, { "epoch": 2.9595523301565096, "grad_norm": 1.3356995582580566, "learning_rate": 1.3896661070695765e-08, "loss": 0.1022, "step": 126930 }, { "epoch": 2.9597854915333275, "grad_norm": 1.6215717792510986, "learning_rate": 1.3818939252626997e-08, "loss": 0.1133, "step": 126940 }, { "epoch": 2.9600186529101453, "grad_norm": 1.5772600173950195, "learning_rate": 1.374121743455823e-08, "loss": 0.1095, "step": 126950 }, { "epoch": 2.9602518142869636, "grad_norm": 1.3497334718704224, "learning_rate": 1.366349561648946e-08, "loss": 0.0963, "step": 126960 }, { "epoch": 2.9604849756637814, "grad_norm": 1.269503116607666, "learning_rate": 1.3585773798420693e-08, "loss": 0.1027, "step": 126970 }, { "epoch": 2.9607181370405993, "grad_norm": 1.637342929840088, "learning_rate": 1.3508051980351924e-08, "loss": 0.1074, "step": 126980 }, { "epoch": 2.960951298417417, "grad_norm": 1.5216737985610962, "learning_rate": 1.3430330162283156e-08, "loss": 0.0964, "step": 126990 }, { "epoch": 2.961184459794235, "grad_norm": 2.010673761367798, "learning_rate": 1.3352608344214387e-08, "loss": 0.1006, "step": 
127000 }, { "epoch": 2.9614176211710532, "grad_norm": 2.396738052368164, "learning_rate": 1.3274886526145619e-08, "loss": 0.1055, "step": 127010 }, { "epoch": 2.961650782547871, "grad_norm": 1.618201732635498, "learning_rate": 1.3197164708076851e-08, "loss": 0.1077, "step": 127020 }, { "epoch": 2.961883943924689, "grad_norm": 1.9333305358886719, "learning_rate": 1.3119442890008082e-08, "loss": 0.1139, "step": 127030 }, { "epoch": 2.9621171053015067, "grad_norm": 1.7883304357528687, "learning_rate": 1.3041721071939315e-08, "loss": 0.093, "step": 127040 }, { "epoch": 2.9623502666783246, "grad_norm": 1.1452233791351318, "learning_rate": 1.2963999253870545e-08, "loss": 0.1046, "step": 127050 }, { "epoch": 2.962583428055143, "grad_norm": 3.026245355606079, "learning_rate": 1.2886277435801778e-08, "loss": 0.1032, "step": 127060 }, { "epoch": 2.9628165894319607, "grad_norm": 1.3578912019729614, "learning_rate": 1.280855561773301e-08, "loss": 0.1012, "step": 127070 }, { "epoch": 2.9630497508087785, "grad_norm": 1.7927944660186768, "learning_rate": 1.273083379966424e-08, "loss": 0.1124, "step": 127080 }, { "epoch": 2.9632829121855964, "grad_norm": 1.312867522239685, "learning_rate": 1.2653111981595473e-08, "loss": 0.1007, "step": 127090 }, { "epoch": 2.963516073562414, "grad_norm": 1.280665397644043, "learning_rate": 1.2575390163526704e-08, "loss": 0.0946, "step": 127100 }, { "epoch": 2.9637492349392325, "grad_norm": 1.3489115238189697, "learning_rate": 1.2497668345457936e-08, "loss": 0.1083, "step": 127110 }, { "epoch": 2.9639823963160503, "grad_norm": 1.703897476196289, "learning_rate": 1.2419946527389167e-08, "loss": 0.1018, "step": 127120 }, { "epoch": 2.964215557692868, "grad_norm": 1.243180513381958, "learning_rate": 1.23422247093204e-08, "loss": 0.1083, "step": 127130 }, { "epoch": 2.964448719069686, "grad_norm": 2.358530044555664, "learning_rate": 1.2264502891251632e-08, "loss": 0.1081, "step": 127140 }, { "epoch": 2.964681880446504, "grad_norm": 1.5009253025054932, 
"learning_rate": 1.2186781073182862e-08, "loss": 0.1013, "step": 127150 }, { "epoch": 2.964915041823322, "grad_norm": 3.729698419570923, "learning_rate": 1.2109059255114095e-08, "loss": 0.0992, "step": 127160 }, { "epoch": 2.96514820320014, "grad_norm": 2.0533740520477295, "learning_rate": 1.2031337437045325e-08, "loss": 0.1018, "step": 127170 }, { "epoch": 2.965381364576958, "grad_norm": 1.7383774518966675, "learning_rate": 1.1953615618976558e-08, "loss": 0.096, "step": 127180 }, { "epoch": 2.9656145259537756, "grad_norm": 1.2102943658828735, "learning_rate": 1.1875893800907792e-08, "loss": 0.0952, "step": 127190 }, { "epoch": 2.9658476873305935, "grad_norm": 1.1213808059692383, "learning_rate": 1.1805944164645899e-08, "loss": 0.1019, "step": 127200 }, { "epoch": 2.9660808487074117, "grad_norm": 1.8119250535964966, "learning_rate": 1.1728222346577131e-08, "loss": 0.1019, "step": 127210 }, { "epoch": 2.9663140100842296, "grad_norm": 0.9767786264419556, "learning_rate": 1.1650500528508362e-08, "loss": 0.0925, "step": 127220 }, { "epoch": 2.9665471714610474, "grad_norm": 1.0049405097961426, "learning_rate": 1.1572778710439594e-08, "loss": 0.0905, "step": 127230 }, { "epoch": 2.9667803328378652, "grad_norm": 1.5535436868667603, "learning_rate": 1.1495056892370827e-08, "loss": 0.0943, "step": 127240 }, { "epoch": 2.967013494214683, "grad_norm": 1.4337540864944458, "learning_rate": 1.1417335074302058e-08, "loss": 0.0953, "step": 127250 }, { "epoch": 2.9672466555915014, "grad_norm": 1.7887636423110962, "learning_rate": 1.133961325623329e-08, "loss": 0.1008, "step": 127260 }, { "epoch": 2.967479816968319, "grad_norm": 2.300694465637207, "learning_rate": 1.126189143816452e-08, "loss": 0.097, "step": 127270 }, { "epoch": 2.967712978345137, "grad_norm": 2.4105751514434814, "learning_rate": 1.1184169620095753e-08, "loss": 0.0918, "step": 127280 }, { "epoch": 2.9679461397219553, "grad_norm": 1.203013300895691, "learning_rate": 1.1106447802026984e-08, "loss": 0.1066, "step": 
127290 }, { "epoch": 2.968179301098773, "grad_norm": 1.4985543489456177, "learning_rate": 1.1028725983958216e-08, "loss": 0.1052, "step": 127300 }, { "epoch": 2.968412462475591, "grad_norm": 1.1533414125442505, "learning_rate": 1.0951004165889448e-08, "loss": 0.0997, "step": 127310 }, { "epoch": 2.968645623852409, "grad_norm": 1.374732494354248, "learning_rate": 1.087328234782068e-08, "loss": 0.0971, "step": 127320 }, { "epoch": 2.9688787852292267, "grad_norm": 3.2536277770996094, "learning_rate": 1.0795560529751912e-08, "loss": 0.0988, "step": 127330 }, { "epoch": 2.969111946606045, "grad_norm": 1.410869836807251, "learning_rate": 1.0717838711683142e-08, "loss": 0.1027, "step": 127340 }, { "epoch": 2.9693451079828628, "grad_norm": 1.298521637916565, "learning_rate": 1.0640116893614375e-08, "loss": 0.1036, "step": 127350 }, { "epoch": 2.9695782693596806, "grad_norm": 1.4352563619613647, "learning_rate": 1.0562395075545607e-08, "loss": 0.1108, "step": 127360 }, { "epoch": 2.9698114307364984, "grad_norm": 1.3939547538757324, "learning_rate": 1.0484673257476838e-08, "loss": 0.0992, "step": 127370 }, { "epoch": 2.9700445921133163, "grad_norm": 2.4400389194488525, "learning_rate": 1.040695143940807e-08, "loss": 0.0907, "step": 127380 }, { "epoch": 2.9702777534901346, "grad_norm": 1.70416259765625, "learning_rate": 1.03292296213393e-08, "loss": 0.0938, "step": 127390 }, { "epoch": 2.9705109148669524, "grad_norm": 1.7460813522338867, "learning_rate": 1.0251507803270533e-08, "loss": 0.0969, "step": 127400 }, { "epoch": 2.9707440762437702, "grad_norm": 1.305688500404358, "learning_rate": 1.0173785985201764e-08, "loss": 0.1101, "step": 127410 }, { "epoch": 2.970977237620588, "grad_norm": 2.313572883605957, "learning_rate": 1.0096064167132996e-08, "loss": 0.1052, "step": 127420 }, { "epoch": 2.971210398997406, "grad_norm": 1.2008472681045532, "learning_rate": 1.001834234906423e-08, "loss": 0.1048, "step": 127430 }, { "epoch": 2.971443560374224, "grad_norm": 
1.2701765298843384, "learning_rate": 9.940620530995461e-09, "loss": 0.1022, "step": 127440 }, { "epoch": 2.971676721751042, "grad_norm": 1.7250392436981201, "learning_rate": 9.862898712926693e-09, "loss": 0.0974, "step": 127450 }, { "epoch": 2.97190988312786, "grad_norm": 1.2588226795196533, "learning_rate": 9.785176894857924e-09, "loss": 0.0995, "step": 127460 }, { "epoch": 2.9721430445046777, "grad_norm": 2.082590341567993, "learning_rate": 9.707455076789156e-09, "loss": 0.1043, "step": 127470 }, { "epoch": 2.9723762058814955, "grad_norm": 1.25155770778656, "learning_rate": 9.629733258720389e-09, "loss": 0.1003, "step": 127480 }, { "epoch": 2.972609367258314, "grad_norm": 1.6224174499511719, "learning_rate": 9.55201144065162e-09, "loss": 0.0897, "step": 127490 }, { "epoch": 2.9728425286351317, "grad_norm": 1.9086604118347168, "learning_rate": 9.474289622582852e-09, "loss": 0.0956, "step": 127500 }, { "epoch": 2.9730756900119495, "grad_norm": 1.2932709455490112, "learning_rate": 9.396567804514083e-09, "loss": 0.0934, "step": 127510 }, { "epoch": 2.9733088513887673, "grad_norm": 1.1909047365188599, "learning_rate": 9.318845986445315e-09, "loss": 0.0927, "step": 127520 }, { "epoch": 2.973542012765585, "grad_norm": 1.2458986043930054, "learning_rate": 9.241124168376546e-09, "loss": 0.0971, "step": 127530 }, { "epoch": 2.9737751741424034, "grad_norm": 1.362221121788025, "learning_rate": 9.163402350307778e-09, "loss": 0.0952, "step": 127540 }, { "epoch": 2.9740083355192213, "grad_norm": 1.5261377096176147, "learning_rate": 9.08568053223901e-09, "loss": 0.1074, "step": 127550 }, { "epoch": 2.974241496896039, "grad_norm": 1.4424455165863037, "learning_rate": 9.007958714170241e-09, "loss": 0.1113, "step": 127560 }, { "epoch": 2.974474658272857, "grad_norm": 1.345104694366455, "learning_rate": 8.930236896101474e-09, "loss": 0.1022, "step": 127570 }, { "epoch": 2.974707819649675, "grad_norm": 1.957334280014038, "learning_rate": 8.852515078032704e-09, "loss": 0.1063, "step": 
127580 }, { "epoch": 2.974940981026493, "grad_norm": 2.0624494552612305, "learning_rate": 8.774793259963937e-09, "loss": 0.1005, "step": 127590 }, { "epoch": 2.975174142403311, "grad_norm": 2.1301398277282715, "learning_rate": 8.697071441895169e-09, "loss": 0.1063, "step": 127600 }, { "epoch": 2.9754073037801287, "grad_norm": 1.305397868156433, "learning_rate": 8.6193496238264e-09, "loss": 0.1022, "step": 127610 }, { "epoch": 2.975640465156947, "grad_norm": 1.2397874593734741, "learning_rate": 8.541627805757632e-09, "loss": 0.0929, "step": 127620 }, { "epoch": 2.9758736265337644, "grad_norm": 2.4042506217956543, "learning_rate": 8.463905987688863e-09, "loss": 0.1131, "step": 127630 }, { "epoch": 2.9761067879105827, "grad_norm": 1.1387805938720703, "learning_rate": 8.386184169620095e-09, "loss": 0.108, "step": 127640 }, { "epoch": 2.9763399492874005, "grad_norm": 1.2751516103744507, "learning_rate": 8.308462351551326e-09, "loss": 0.1058, "step": 127650 }, { "epoch": 2.9765731106642184, "grad_norm": 2.358964443206787, "learning_rate": 8.230740533482558e-09, "loss": 0.1009, "step": 127660 }, { "epoch": 2.9768062720410366, "grad_norm": 1.543352484703064, "learning_rate": 8.15301871541379e-09, "loss": 0.0944, "step": 127670 }, { "epoch": 2.9770394334178545, "grad_norm": 2.5360372066497803, "learning_rate": 8.075296897345021e-09, "loss": 0.1126, "step": 127680 }, { "epoch": 2.9772725947946723, "grad_norm": 1.5655946731567383, "learning_rate": 7.997575079276254e-09, "loss": 0.1086, "step": 127690 }, { "epoch": 2.97750575617149, "grad_norm": 2.4635772705078125, "learning_rate": 7.919853261207484e-09, "loss": 0.1072, "step": 127700 }, { "epoch": 2.977738917548308, "grad_norm": 2.5406925678253174, "learning_rate": 7.842131443138717e-09, "loss": 0.1153, "step": 127710 }, { "epoch": 2.9779720789251263, "grad_norm": 3.0088179111480713, "learning_rate": 7.76440962506995e-09, "loss": 0.0984, "step": 127720 }, { "epoch": 2.978205240301944, "grad_norm": 1.7337621450424194, 
"learning_rate": 7.686687807001182e-09, "loss": 0.0979, "step": 127730 }, { "epoch": 2.978438401678762, "grad_norm": 1.848719835281372, "learning_rate": 7.608965988932412e-09, "loss": 0.1032, "step": 127740 }, { "epoch": 2.97867156305558, "grad_norm": 1.9941295385360718, "learning_rate": 7.531244170863645e-09, "loss": 0.118, "step": 127750 }, { "epoch": 2.9789047244323976, "grad_norm": 1.371153712272644, "learning_rate": 7.453522352794875e-09, "loss": 0.1002, "step": 127760 }, { "epoch": 2.979137885809216, "grad_norm": 1.1511772871017456, "learning_rate": 7.375800534726108e-09, "loss": 0.1086, "step": 127770 }, { "epoch": 2.9793710471860337, "grad_norm": 1.6576119661331177, "learning_rate": 7.298078716657339e-09, "loss": 0.1074, "step": 127780 }, { "epoch": 2.9796042085628516, "grad_norm": 1.7335022687911987, "learning_rate": 7.220356898588571e-09, "loss": 0.0893, "step": 127790 }, { "epoch": 2.9798373699396694, "grad_norm": 2.806011915206909, "learning_rate": 7.1426350805198024e-09, "loss": 0.1009, "step": 127800 }, { "epoch": 2.9800705313164872, "grad_norm": 3.192396402359009, "learning_rate": 7.064913262451036e-09, "loss": 0.1181, "step": 127810 }, { "epoch": 2.9803036926933055, "grad_norm": 1.4653406143188477, "learning_rate": 6.987191444382267e-09, "loss": 0.0936, "step": 127820 }, { "epoch": 2.9805368540701234, "grad_norm": 1.9584405422210693, "learning_rate": 6.909469626313499e-09, "loss": 0.1121, "step": 127830 }, { "epoch": 2.980770015446941, "grad_norm": 0.8685773015022278, "learning_rate": 6.83174780824473e-09, "loss": 0.1018, "step": 127840 }, { "epoch": 2.981003176823759, "grad_norm": 4.373706340789795, "learning_rate": 6.754025990175962e-09, "loss": 0.102, "step": 127850 }, { "epoch": 2.981236338200577, "grad_norm": 1.3207036256790161, "learning_rate": 6.676304172107193e-09, "loss": 0.1062, "step": 127860 }, { "epoch": 2.981469499577395, "grad_norm": 1.6353752613067627, "learning_rate": 6.598582354038426e-09, "loss": 0.1006, "step": 127870 }, { 
"epoch": 2.981702660954213, "grad_norm": 1.7488523721694946, "learning_rate": 6.520860535969657e-09, "loss": 0.1002, "step": 127880 }, { "epoch": 2.981935822331031, "grad_norm": 1.502022624015808, "learning_rate": 6.443138717900889e-09, "loss": 0.1025, "step": 127890 }, { "epoch": 2.9821689837078487, "grad_norm": 1.1812876462936401, "learning_rate": 6.36541689983212e-09, "loss": 0.1134, "step": 127900 }, { "epoch": 2.9824021450846665, "grad_norm": 2.0547735691070557, "learning_rate": 6.287695081763352e-09, "loss": 0.0971, "step": 127910 }, { "epoch": 2.982635306461485, "grad_norm": 1.1991053819656372, "learning_rate": 6.2099732636945834e-09, "loss": 0.1021, "step": 127920 }, { "epoch": 2.9828684678383026, "grad_norm": 2.804121732711792, "learning_rate": 6.132251445625816e-09, "loss": 0.1057, "step": 127930 }, { "epoch": 2.9831016292151205, "grad_norm": 1.2845426797866821, "learning_rate": 6.054529627557047e-09, "loss": 0.1121, "step": 127940 }, { "epoch": 2.9833347905919383, "grad_norm": 1.4675348997116089, "learning_rate": 5.976807809488279e-09, "loss": 0.0985, "step": 127950 }, { "epoch": 2.983567951968756, "grad_norm": 1.5236190557479858, "learning_rate": 5.899085991419511e-09, "loss": 0.1053, "step": 127960 }, { "epoch": 2.9838011133455744, "grad_norm": 1.2692159414291382, "learning_rate": 5.821364173350743e-09, "loss": 0.1093, "step": 127970 }, { "epoch": 2.9840342747223922, "grad_norm": 1.6292997598648071, "learning_rate": 5.743642355281974e-09, "loss": 0.1057, "step": 127980 }, { "epoch": 2.98426743609921, "grad_norm": 1.9502766132354736, "learning_rate": 5.665920537213207e-09, "loss": 0.1001, "step": 127990 }, { "epoch": 2.9845005974760284, "grad_norm": 2.6718616485595703, "learning_rate": 5.588198719144438e-09, "loss": 0.1117, "step": 128000 }, { "epoch": 2.9847337588528458, "grad_norm": 2.1599280834198, "learning_rate": 5.51047690107567e-09, "loss": 0.103, "step": 128010 }, { "epoch": 2.984966920229664, "grad_norm": 1.4236812591552734, "learning_rate": 
5.432755083006901e-09, "loss": 0.1129, "step": 128020 }, { "epoch": 2.985200081606482, "grad_norm": 1.3768638372421265, "learning_rate": 5.355033264938133e-09, "loss": 0.0952, "step": 128030 }, { "epoch": 2.9854332429832997, "grad_norm": 1.6170239448547363, "learning_rate": 5.2773114468693645e-09, "loss": 0.0974, "step": 128040 }, { "epoch": 2.985666404360118, "grad_norm": 2.1608126163482666, "learning_rate": 5.199589628800597e-09, "loss": 0.0995, "step": 128050 }, { "epoch": 2.985899565736936, "grad_norm": 1.917909026145935, "learning_rate": 5.121867810731828e-09, "loss": 0.101, "step": 128060 }, { "epoch": 2.9861327271137537, "grad_norm": 1.379399061203003, "learning_rate": 5.04414599266306e-09, "loss": 0.0953, "step": 128070 }, { "epoch": 2.9863658884905715, "grad_norm": 1.614464282989502, "learning_rate": 4.9664241745942915e-09, "loss": 0.0987, "step": 128080 }, { "epoch": 2.9865990498673893, "grad_norm": 1.1509227752685547, "learning_rate": 4.888702356525523e-09, "loss": 0.0986, "step": 128090 }, { "epoch": 2.9868322112442076, "grad_norm": 1.5892342329025269, "learning_rate": 4.8109805384567546e-09, "loss": 0.1059, "step": 128100 }, { "epoch": 2.9870653726210254, "grad_norm": 1.4519925117492676, "learning_rate": 4.733258720387988e-09, "loss": 0.1049, "step": 128110 }, { "epoch": 2.9872985339978433, "grad_norm": 1.0905941724777222, "learning_rate": 4.655536902319219e-09, "loss": 0.0879, "step": 128120 }, { "epoch": 2.987531695374661, "grad_norm": 1.9800533056259155, "learning_rate": 4.577815084250451e-09, "loss": 0.1087, "step": 128130 }, { "epoch": 2.987764856751479, "grad_norm": 2.227877140045166, "learning_rate": 4.500093266181682e-09, "loss": 0.1107, "step": 128140 }, { "epoch": 2.9879980181282972, "grad_norm": 3.2967653274536133, "learning_rate": 4.422371448112914e-09, "loss": 0.1039, "step": 128150 }, { "epoch": 2.988231179505115, "grad_norm": 1.2132865190505981, "learning_rate": 4.344649630044146e-09, "loss": 0.1023, "step": 128160 }, { "epoch": 
2.988464340881933, "grad_norm": 1.983245611190796, "learning_rate": 4.266927811975378e-09, "loss": 0.106, "step": 128170 }, { "epoch": 2.9886975022587507, "grad_norm": 4.235135555267334, "learning_rate": 4.189205993906609e-09, "loss": 0.112, "step": 128180 }, { "epoch": 2.9889306636355686, "grad_norm": 1.4124211072921753, "learning_rate": 4.111484175837841e-09, "loss": 0.1049, "step": 128190 }, { "epoch": 2.989163825012387, "grad_norm": 1.2634315490722656, "learning_rate": 4.0337623577690725e-09, "loss": 0.1028, "step": 128200 }, { "epoch": 2.9893969863892047, "grad_norm": 2.3463187217712402, "learning_rate": 3.956040539700304e-09, "loss": 0.1078, "step": 128210 }, { "epoch": 2.9896301477660225, "grad_norm": 1.3584147691726685, "learning_rate": 3.878318721631536e-09, "loss": 0.1095, "step": 128220 }, { "epoch": 2.9898633091428404, "grad_norm": 2.9711453914642334, "learning_rate": 3.800596903562768e-09, "loss": 0.1029, "step": 128230 }, { "epoch": 2.990096470519658, "grad_norm": 1.376558542251587, "learning_rate": 3.722875085494e-09, "loss": 0.102, "step": 128240 }, { "epoch": 2.9903296318964765, "grad_norm": 2.989351987838745, "learning_rate": 3.6451532674252315e-09, "loss": 0.0975, "step": 128250 }, { "epoch": 2.9905627932732943, "grad_norm": 2.225006341934204, "learning_rate": 3.567431449356463e-09, "loss": 0.1032, "step": 128260 }, { "epoch": 2.990795954650112, "grad_norm": 1.3170117139816284, "learning_rate": 3.489709631287695e-09, "loss": 0.0996, "step": 128270 }, { "epoch": 2.99102911602693, "grad_norm": 2.3306844234466553, "learning_rate": 3.4119878132189265e-09, "loss": 0.0913, "step": 128280 }, { "epoch": 2.991262277403748, "grad_norm": 1.5819752216339111, "learning_rate": 3.334265995150158e-09, "loss": 0.103, "step": 128290 }, { "epoch": 2.991495438780566, "grad_norm": 1.4495642185211182, "learning_rate": 3.2565441770813904e-09, "loss": 0.1052, "step": 128300 }, { "epoch": 2.991728600157384, "grad_norm": 1.739596962928772, "learning_rate": 
3.178822359012622e-09, "loss": 0.1003, "step": 128310 }, { "epoch": 2.991961761534202, "grad_norm": 1.0762982368469238, "learning_rate": 3.1011005409438535e-09, "loss": 0.0996, "step": 128320 }, { "epoch": 2.9921949229110196, "grad_norm": 1.5504295825958252, "learning_rate": 3.0233787228750855e-09, "loss": 0.1159, "step": 128330 }, { "epoch": 2.9924280842878375, "grad_norm": 1.7972995042800903, "learning_rate": 2.945656904806317e-09, "loss": 0.0937, "step": 128340 }, { "epoch": 2.9926612456646557, "grad_norm": 1.1575497388839722, "learning_rate": 2.8679350867375486e-09, "loss": 0.1048, "step": 128350 }, { "epoch": 2.9928944070414736, "grad_norm": 3.62398624420166, "learning_rate": 2.7902132686687805e-09, "loss": 0.1181, "step": 128360 }, { "epoch": 2.9931275684182914, "grad_norm": 1.8130671977996826, "learning_rate": 2.7124914506000125e-09, "loss": 0.119, "step": 128370 }, { "epoch": 2.9933607297951097, "grad_norm": 1.3749810457229614, "learning_rate": 2.634769632531244e-09, "loss": 0.1026, "step": 128380 }, { "epoch": 2.9935938911719275, "grad_norm": 1.7448241710662842, "learning_rate": 2.557047814462476e-09, "loss": 0.0982, "step": 128390 }, { "epoch": 2.9938270525487454, "grad_norm": 1.2342958450317383, "learning_rate": 2.4793259963937075e-09, "loss": 0.0944, "step": 128400 }, { "epoch": 2.994060213925563, "grad_norm": 1.395050048828125, "learning_rate": 2.401604178324939e-09, "loss": 0.1068, "step": 128410 }, { "epoch": 2.994293375302381, "grad_norm": 1.796502709388733, "learning_rate": 2.323882360256171e-09, "loss": 0.0953, "step": 128420 }, { "epoch": 2.9945265366791993, "grad_norm": 1.6437351703643799, "learning_rate": 2.2461605421874026e-09, "loss": 0.1099, "step": 128430 }, { "epoch": 2.994759698056017, "grad_norm": 2.17084002494812, "learning_rate": 2.168438724118634e-09, "loss": 0.0976, "step": 128440 }, { "epoch": 2.994992859432835, "grad_norm": 2.443728446960449, "learning_rate": 2.0907169060498665e-09, "loss": 0.0948, "step": 128450 }, { "epoch": 
2.995226020809653, "grad_norm": 1.5909383296966553, "learning_rate": 2.012995087981098e-09, "loss": 0.1132, "step": 128460 }, { "epoch": 2.9954591821864707, "grad_norm": 1.7190831899642944, "learning_rate": 1.9352732699123296e-09, "loss": 0.0995, "step": 128470 }, { "epoch": 2.995692343563289, "grad_norm": 1.2876211404800415, "learning_rate": 1.8575514518435615e-09, "loss": 0.1126, "step": 128480 }, { "epoch": 2.995925504940107, "grad_norm": 1.200893521308899, "learning_rate": 1.779829633774793e-09, "loss": 0.1118, "step": 128490 }, { "epoch": 2.9961586663169246, "grad_norm": 2.1558732986450195, "learning_rate": 1.702107815706025e-09, "loss": 0.1043, "step": 128500 }, { "epoch": 2.9963918276937425, "grad_norm": 1.4482110738754272, "learning_rate": 1.6243859976372568e-09, "loss": 0.1008, "step": 128510 }, { "epoch": 2.9966249890705603, "grad_norm": 1.7854620218276978, "learning_rate": 1.5466641795684883e-09, "loss": 0.1084, "step": 128520 }, { "epoch": 2.9968581504473786, "grad_norm": 2.4632534980773926, "learning_rate": 1.46894236149972e-09, "loss": 0.0995, "step": 128530 }, { "epoch": 2.9970913118241964, "grad_norm": 3.024911880493164, "learning_rate": 1.391220543430952e-09, "loss": 0.0928, "step": 128540 }, { "epoch": 2.9973244732010142, "grad_norm": 2.446653127670288, "learning_rate": 1.3134987253621836e-09, "loss": 0.0968, "step": 128550 }, { "epoch": 2.997557634577832, "grad_norm": 2.118914842605591, "learning_rate": 1.2357769072934153e-09, "loss": 0.0891, "step": 128560 }, { "epoch": 2.99779079595465, "grad_norm": 1.3939154148101807, "learning_rate": 1.158055089224647e-09, "loss": 0.1044, "step": 128570 }, { "epoch": 2.998023957331468, "grad_norm": 2.1586532592773438, "learning_rate": 1.0803332711558788e-09, "loss": 0.1026, "step": 128580 }, { "epoch": 2.998257118708286, "grad_norm": 2.2598366737365723, "learning_rate": 1.0026114530871106e-09, "loss": 0.0998, "step": 128590 }, { "epoch": 2.998490280085104, "grad_norm": 1.6056699752807617, "learning_rate": 
9.248896350183422e-10, "loss": 0.0886, "step": 128600 }, { "epoch": 2.9987234414619217, "grad_norm": 1.33486008644104, "learning_rate": 8.471678169495741e-10, "loss": 0.095, "step": 128610 }, { "epoch": 2.9989566028387395, "grad_norm": 1.7967939376831055, "learning_rate": 7.694459988808058e-10, "loss": 0.0977, "step": 128620 }, { "epoch": 2.999189764215558, "grad_norm": 1.449737310409546, "learning_rate": 6.917241808120375e-10, "loss": 0.0926, "step": 128630 }, { "epoch": 2.9994229255923757, "grad_norm": 2.4125988483428955, "learning_rate": 6.140023627432693e-10, "loss": 0.102, "step": 128640 }, { "epoch": 2.9996560869691935, "grad_norm": 1.3649572134017944, "learning_rate": 5.36280544674501e-10, "loss": 0.107, "step": 128650 }, { "epoch": 2.9998892483460113, "grad_norm": 2.116607427597046, "learning_rate": 4.5855872660573275e-10, "loss": 0.1032, "step": 128660 } ], "logging_steps": 10, "max_steps": 128664, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.545059434864378e+18, "train_batch_size": 48, "trial_name": null, "trial_params": null }