{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0000155339403651, "eval_steps": 500, "global_step": 193128, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00103559334320599, "grad_norm": 1.026977300643921, "learning_rate": 0.00029999980548445665, "loss": 6.8456, "step": 100 }, { "epoch": 0.00207118668641198, "grad_norm": 1.0050791501998901, "learning_rate": 0.0002999992140592828, "loss": 5.1595, "step": 200 }, { "epoch": 0.0031067800296179695, "grad_norm": 0.9459976553916931, "learning_rate": 0.000299998225706197, "loss": 4.5657, "step": 300 }, { "epoch": 0.00414237337282396, "grad_norm": 0.8726732730865479, "learning_rate": 0.00029999684042781455, "loss": 4.2289, "step": 400 }, { "epoch": 0.00517796671602995, "grad_norm": 0.7343498468399048, "learning_rate": 0.00029999505822780116, "loss": 4.0452, "step": 500 }, { "epoch": 0.006213560059235939, "grad_norm": 0.7063133716583252, "learning_rate": 0.00029999287911087303, "loss": 3.8879, "step": 600 }, { "epoch": 0.007249153402441929, "grad_norm": 0.648737907409668, "learning_rate": 0.00029999030308279636, "loss": 3.7432, "step": 700 }, { "epoch": 0.00828474674564792, "grad_norm": 0.7131102085113525, "learning_rate": 0.0002999873301503879, "loss": 3.6559, "step": 800 }, { "epoch": 0.009320340088853908, "grad_norm": 0.648167073726654, "learning_rate": 0.0002999839603215147, "loss": 3.5505, "step": 900 }, { "epoch": 0.0103559334320599, "grad_norm": 0.7449036240577698, "learning_rate": 0.0002999801936050939, "loss": 3.4726, "step": 1000 }, { "epoch": 0.011391526775265889, "grad_norm": 0.5992525219917297, "learning_rate": 0.0002999760300110931, "loss": 3.4005, "step": 1100 }, { "epoch": 0.012427120118471878, "grad_norm": 0.6302841305732727, "learning_rate": 0.00029997146955052986, "loss": 3.3274, "step": 1200 }, { "epoch": 0.013462713461677869, "grad_norm": 0.6017561554908752, "learning_rate": 0.00029996651223547217, "loss": 3.2857, "step": 1300 }, { "epoch": 0.014498306804883858, "grad_norm": 0.5809340476989746, "learning_rate": 0.000299961158079038, "loss": 3.2497, "step": 1400 }, { "epoch": 0.015533900148089847, "grad_norm": 0.5803759098052979, "learning_rate": 0.0002999554070953956, "loss": 3.19, "step": 1500 }, { "epoch": 0.01656949349129584, "grad_norm": 0.5678243041038513, "learning_rate": 0.0002999492592997632, "loss": 3.1437, "step": 1600 }, { "epoch": 0.017605086834501828, "grad_norm": 0.586242139339447, "learning_rate": 0.00029994271470840903, "loss": 3.1104, "step": 1700 }, { "epoch": 0.018640680177707817, "grad_norm": 0.5574731826782227, "learning_rate": 0.0002999357733386515, "loss": 3.066, "step": 1800 }, { "epoch": 0.019676273520913806, "grad_norm": 0.5647651553153992, "learning_rate": 0.0002999284352088588, "loss": 3.0575, "step": 1900 }, { "epoch": 0.0207118668641198, "grad_norm": 0.5294833183288574, "learning_rate": 0.00029992070033844903, "loss": 3.0183, "step": 2000 }, { "epoch": 0.021747460207325788, "grad_norm": 0.5482518076896667, "learning_rate": 0.0002999125687478903, "loss": 2.9948, "step": 2100 }, { "epoch": 0.022783053550531777, "grad_norm": 0.594523549079895, "learning_rate": 0.0002999040404587004, "loss": 2.9556, "step": 2200 }, { "epoch": 0.023818646893737767, "grad_norm": 0.5697149634361267, "learning_rate": 0.0002998951154934469, "loss": 2.948, "step": 2300 }, { "epoch": 0.024854240236943756, "grad_norm": 0.4975557029247284, "learning_rate": 0.00029988579387574694, "loss": 2.9257, "step": 2400 }, { "epoch": 0.025889833580149745, "grad_norm": 0.49620944261550903, "learning_rate": 0.00029987607563026754, "loss": 2.9044, "step": 2500 }, { "epoch": 0.026925426923355738, "grad_norm": 0.5182105898857117, "learning_rate": 0.00029986596078272504, "loss": 2.8733, "step": 2600 }, { "epoch": 0.027961020266561727, "grad_norm": 0.5067175626754761, "learning_rate": 0.0002998554493598853, "loss": 2.8418, "step": 2700 }, { "epoch": 0.028996613609767716, "grad_norm": 0.4979783594608307, "learning_rate": 0.0002998445413895638, "loss": 2.8504, "step": 2800 }, { "epoch": 0.030032206952973706, "grad_norm": 0.5144481658935547, "learning_rate": 0.00029983323690062504, "loss": 2.84, "step": 2900 }, { "epoch": 0.031067800296179695, "grad_norm": 0.5084084868431091, "learning_rate": 0.00029982153592298305, "loss": 2.8205, "step": 3000 }, { "epoch": 0.032103393639385684, "grad_norm": 0.4909542202949524, "learning_rate": 0.0002998094384876009, "loss": 2.8185, "step": 3100 }, { "epoch": 0.03313898698259168, "grad_norm": 0.49054595828056335, "learning_rate": 0.00029979694462649087, "loss": 2.8038, "step": 3200 }, { "epoch": 0.03417458032579766, "grad_norm": 0.4718395471572876, "learning_rate": 0.0002997840543727142, "loss": 2.7718, "step": 3300 }, { "epoch": 0.035210173669003655, "grad_norm": 0.4961560070514679, "learning_rate": 0.00029977076776038113, "loss": 2.7824, "step": 3400 }, { "epoch": 0.03624576701220965, "grad_norm": 0.48904070258140564, "learning_rate": 0.0002997570848246506, "loss": 2.7565, "step": 3500 }, { "epoch": 0.037281360355415634, "grad_norm": 0.44885751605033875, "learning_rate": 0.00029974300560173044, "loss": 2.7573, "step": 3600 }, { "epoch": 0.03831695369862163, "grad_norm": 0.49806690216064453, "learning_rate": 0.00029972853012887713, "loss": 2.7463, "step": 3700 }, { "epoch": 0.03935254704182761, "grad_norm": 0.4702439606189728, "learning_rate": 0.0002997136584443956, "loss": 2.7406, "step": 3800 }, { "epoch": 0.040388140385033605, "grad_norm": 0.46370580792427063, "learning_rate": 0.0002996983905876393, "loss": 2.7244, "step": 3900 }, { "epoch": 0.0414237337282396, "grad_norm": 0.4597495198249817, "learning_rate": 0.0002996827265990101, "loss": 2.7098, "step": 4000 }, { "epoch": 0.042459327071445584, "grad_norm": 0.44511985778808594, "learning_rate": 0.00029966666651995796, "loss": 2.7061, "step": 4100 }, { "epoch": 0.043494920414651576, "grad_norm": 0.47016459703445435, "learning_rate": 0.00029965021039298113, "loss": 2.6925, "step": 4200 }, { "epoch": 0.04453051375785756, "grad_norm": 0.4496707022190094, "learning_rate": 0.0002996333582616257, "loss": 2.6649, "step": 4300 }, { "epoch": 0.045566107101063555, "grad_norm": 0.43062758445739746, "learning_rate": 0.000299616110170486, "loss": 2.6815, "step": 4400 }, { "epoch": 0.04660170044426955, "grad_norm": 0.45944711565971375, "learning_rate": 0.0002995984661652037, "loss": 2.6736, "step": 4500 }, { "epoch": 0.04763729378747553, "grad_norm": 0.42545244097709656, "learning_rate": 0.0002995804262924685, "loss": 2.6703, "step": 4600 }, { "epoch": 0.048672887130681526, "grad_norm": 0.44761550426483154, "learning_rate": 0.00029956199060001743, "loss": 2.6635, "step": 4700 }, { "epoch": 0.04970848047388751, "grad_norm": 0.44097450375556946, "learning_rate": 0.00029954315913663506, "loss": 2.651, "step": 4800 }, { "epoch": 0.050744073817093505, "grad_norm": 0.428134948015213, "learning_rate": 0.00029952393195215325, "loss": 2.6577, "step": 4900 }, { "epoch": 0.05177966716029949, "grad_norm": 0.43022963404655457, "learning_rate": 0.00029950430909745084, "loss": 2.6272, "step": 5000 }, { "epoch": 0.05281526050350548, "grad_norm": 0.43754976987838745, "learning_rate": 0.00029948429062445395, "loss": 2.6302, "step": 5100 }, { "epoch": 0.053850853846711476, "grad_norm": 0.46531617641448975, "learning_rate": 0.00029946387658613547, "loss": 2.6289, "step": 5200 }, { "epoch": 0.05488644718991746, "grad_norm": 0.4297070801258087, "learning_rate": 0.00029944306703651494, "loss": 2.6173, "step": 5300 }, { "epoch": 0.055922040533123454, "grad_norm": 0.44832634925842285, "learning_rate": 0.0002994218620306587, "loss": 2.6213, "step": 5400 }, { "epoch": 0.05695763387632944, "grad_norm": 0.4039616584777832, "learning_rate": 0.0002994002616246793, "loss": 2.612, "step": 5500 }, { "epoch": 0.05799322721953543, "grad_norm": 0.40514007210731506, "learning_rate": 0.0002993782658757358, "loss": 2.6042, "step": 5600 }, { "epoch": 0.059028820562741426, "grad_norm": 0.44061511754989624, "learning_rate": 0.00029935587484203335, "loss": 2.6053, "step": 5700 }, { "epoch": 0.06006441390594741, "grad_norm": 0.41186845302581787, "learning_rate": 0.00029933308858282314, "loss": 2.5962, "step": 5800 }, { "epoch": 0.061100007249153404, "grad_norm": 0.4145048260688782, "learning_rate": 0.0002993099071584021, "loss": 2.5779, "step": 5900 }, { "epoch": 0.06213560059235939, "grad_norm": 0.4050670862197876, "learning_rate": 0.0002992863306301129, "loss": 2.5856, "step": 6000 }, { "epoch": 0.06317119393556538, "grad_norm": 0.3998633623123169, "learning_rate": 0.0002992623590603439, "loss": 2.5598, "step": 6100 }, { "epoch": 0.06420678727877137, "grad_norm": 0.4318375587463379, "learning_rate": 0.0002992379925125285, "loss": 2.5922, "step": 6200 }, { "epoch": 0.06524238062197736, "grad_norm": 0.4028064012527466, "learning_rate": 0.0002992132310511455, "loss": 2.5702, "step": 6300 }, { "epoch": 0.06627797396518335, "grad_norm": 0.4095822870731354, "learning_rate": 0.00029918832825909307, "loss": 2.5431, "step": 6400 }, { "epoch": 0.06731356730838935, "grad_norm": 0.40318048000335693, "learning_rate": 0.00029916278111567296, "loss": 2.563, "step": 6500 }, { "epoch": 0.06834916065159533, "grad_norm": 0.3942864239215851, "learning_rate": 0.0002991368392577096, "loss": 2.5638, "step": 6600 }, { "epoch": 0.06938475399480132, "grad_norm": 0.39761781692504883, "learning_rate": 0.00029911050275385043, "loss": 2.5645, "step": 6700 }, { "epoch": 0.07042034733800731, "grad_norm": 0.38176122307777405, "learning_rate": 0.00029908377167378705, "loss": 2.5446, "step": 6800 }, { "epoch": 0.0714559406812133, "grad_norm": 0.3925549387931824, "learning_rate": 0.00029905664608825523, "loss": 2.5446, "step": 6900 }, { "epoch": 0.0724915340244193, "grad_norm": 0.39650392532348633, "learning_rate": 0.00029902912606903474, "loss": 2.5279, "step": 7000 }, { "epoch": 0.07352712736762528, "grad_norm": 0.38231194019317627, "learning_rate": 0.00029900121168894905, "loss": 2.5319, "step": 7100 }, { "epoch": 0.07456272071083127, "grad_norm": 0.43189162015914917, "learning_rate": 0.0002989729030218652, "loss": 2.5351, "step": 7200 }, { "epoch": 0.07559831405403726, "grad_norm": 0.3876056373119354, "learning_rate": 0.00029894420014269353, "loss": 2.534, "step": 7300 }, { "epoch": 0.07663390739724325, "grad_norm": 0.4233093559741974, "learning_rate": 0.0002989151031273876, "loss": 2.5106, "step": 7400 }, { "epoch": 0.07766950074044925, "grad_norm": 0.4047662019729614, "learning_rate": 0.000298885612052944, "loss": 2.5137, "step": 7500 }, { "epoch": 0.07870509408365522, "grad_norm": 0.39603501558303833, "learning_rate": 0.000298855726997402, "loss": 2.5155, "step": 7600 }, { "epoch": 0.07974068742686122, "grad_norm": 0.38630104064941406, "learning_rate": 0.00029882544803984336, "loss": 2.5007, "step": 7700 }, { "epoch": 0.08077628077006721, "grad_norm": 0.39435768127441406, "learning_rate": 0.0002987947752603923, "loss": 2.4984, "step": 7800 }, { "epoch": 0.0818118741132732, "grad_norm": 0.3888026475906372, "learning_rate": 0.00029876370874021513, "loss": 2.4993, "step": 7900 }, { "epoch": 0.0828474674564792, "grad_norm": 0.3844919800758362, "learning_rate": 0.00029873224856152007, "loss": 2.4882, "step": 8000 }, { "epoch": 0.08388306079968517, "grad_norm": 0.372907429933548, "learning_rate": 0.00029870039480755703, "loss": 2.4996, "step": 8100 }, { "epoch": 0.08491865414289117, "grad_norm": 0.3768796920776367, "learning_rate": 0.00029866814756261743, "loss": 2.5001, "step": 8200 }, { "epoch": 0.08595424748609716, "grad_norm": 0.3979648947715759, "learning_rate": 0.00029863550691203394, "loss": 2.4917, "step": 8300 }, { "epoch": 0.08698984082930315, "grad_norm": 0.40186259150505066, "learning_rate": 0.00029860247294218024, "loss": 2.486, "step": 8400 }, { "epoch": 0.08802543417250915, "grad_norm": 0.37083563208580017, "learning_rate": 0.00029856904574047076, "loss": 2.4791, "step": 8500 }, { "epoch": 0.08906102751571512, "grad_norm": 0.3823927342891693, "learning_rate": 0.00029853522539536063, "loss": 2.4748, "step": 8600 }, { "epoch": 0.09009662085892112, "grad_norm": 0.370646208524704, "learning_rate": 0.0002985010119963452, "loss": 2.4644, "step": 8700 }, { "epoch": 0.09113221420212711, "grad_norm": 0.398028701543808, "learning_rate": 0.00029846640563396, "loss": 2.467, "step": 8800 }, { "epoch": 0.0921678075453331, "grad_norm": 0.3974510133266449, "learning_rate": 0.00029843140639978046, "loss": 2.4412, "step": 8900 }, { "epoch": 0.0932034008885391, "grad_norm": 0.40102872252464294, "learning_rate": 0.00029839601438642134, "loss": 2.4575, "step": 9000 }, { "epoch": 0.09423899423174507, "grad_norm": 0.3847925066947937, "learning_rate": 0.0002983602296875371, "loss": 2.4482, "step": 9100 }, { "epoch": 0.09527458757495107, "grad_norm": 0.38044506311416626, "learning_rate": 0.0002983240523978213, "loss": 2.4715, "step": 9200 }, { "epoch": 0.09631018091815706, "grad_norm": 0.37497472763061523, "learning_rate": 0.00029828748261300625, "loss": 2.4464, "step": 9300 }, { "epoch": 0.09734577426136305, "grad_norm": 0.364352822303772, "learning_rate": 0.00029825052042986274, "loss": 2.4545, "step": 9400 }, { "epoch": 0.09838136760456903, "grad_norm": 0.3712766468524933, "learning_rate": 0.0002982131659462002, "loss": 2.4485, "step": 9500 }, { "epoch": 0.09941696094777502, "grad_norm": 0.3670974671840668, "learning_rate": 0.0002981754192608662, "loss": 2.4476, "step": 9600 }, { "epoch": 0.10045255429098102, "grad_norm": 0.352822870016098, "learning_rate": 0.00029813728047374587, "loss": 2.4447, "step": 9700 }, { "epoch": 0.10148814763418701, "grad_norm": 0.38108208775520325, "learning_rate": 0.00029809874968576204, "loss": 2.4325, "step": 9800 }, { "epoch": 0.102523740977393, "grad_norm": 0.3761230707168579, "learning_rate": 0.0002980598269988749, "loss": 2.4392, "step": 9900 }, { "epoch": 0.10355933432059898, "grad_norm": 0.3923780024051666, "learning_rate": 0.0002980205125160818, "loss": 2.4282, "step": 10000 }, { "epoch": 0.10459492766380497, "grad_norm": 0.36487850546836853, "learning_rate": 0.00029798080634141653, "loss": 2.4336, "step": 10100 }, { "epoch": 0.10563052100701097, "grad_norm": 0.376537561416626, "learning_rate": 0.00029794070857994966, "loss": 2.4363, "step": 10200 }, { "epoch": 0.10666611435021696, "grad_norm": 0.35036200284957886, "learning_rate": 0.0002979002193377879, "loss": 2.4371, "step": 10300 }, { "epoch": 0.10770170769342295, "grad_norm": 0.36887550354003906, "learning_rate": 0.00029785933872207386, "loss": 2.4226, "step": 10400 }, { "epoch": 0.10873730103662893, "grad_norm": 0.35275015234947205, "learning_rate": 0.00029781806684098586, "loss": 2.4256, "step": 10500 }, { "epoch": 0.10977289437983492, "grad_norm": 0.3568744957447052, "learning_rate": 0.00029777640380373744, "loss": 2.425, "step": 10600 }, { "epoch": 0.11080848772304092, "grad_norm": 0.3581329584121704, "learning_rate": 0.00029773434972057744, "loss": 2.4259, "step": 10700 }, { "epoch": 0.11184408106624691, "grad_norm": 0.36360159516334534, "learning_rate": 0.0002976919047027893, "loss": 2.4087, "step": 10800 }, { "epoch": 0.1128796744094529, "grad_norm": 0.3543790578842163, "learning_rate": 0.0002976490688626911, "loss": 2.4214, "step": 10900 }, { "epoch": 0.11391526775265888, "grad_norm": 0.3874112367630005, "learning_rate": 0.00029760584231363496, "loss": 2.4167, "step": 11000 }, { "epoch": 0.11495086109586487, "grad_norm": 0.3504088521003723, "learning_rate": 0.000297562225170007, "loss": 2.404, "step": 11100 }, { "epoch": 0.11598645443907087, "grad_norm": 0.36760446429252625, "learning_rate": 0.0002975182175472269, "loss": 2.4053, "step": 11200 }, { "epoch": 0.11702204778227686, "grad_norm": 0.353981077671051, "learning_rate": 0.00029747381956174765, "loss": 2.4087, "step": 11300 }, { "epoch": 0.11805764112548285, "grad_norm": 0.3907415568828583, "learning_rate": 0.0002974294811446876, "loss": 2.3834, "step": 11400 }, { "epoch": 0.11909323446868883, "grad_norm": 0.3460184633731842, "learning_rate": 0.00029738430668797746, "loss": 2.408, "step": 11500 }, { "epoch": 0.12012882781189482, "grad_norm": 0.3757179379463196, "learning_rate": 0.00029733874222292295, "loss": 2.3976, "step": 11600 }, { "epoch": 0.12116442115510082, "grad_norm": 0.34744834899902344, "learning_rate": 0.00029729278787009696, "loss": 2.4081, "step": 11700 }, { "epoch": 0.12220001449830681, "grad_norm": 0.3553219437599182, "learning_rate": 0.00029724644375110377, "loss": 2.3885, "step": 11800 }, { "epoch": 0.1232356078415128, "grad_norm": 0.3509339988231659, "learning_rate": 0.0002971997099885792, "loss": 2.4014, "step": 11900 }, { "epoch": 0.12427120118471878, "grad_norm": 0.34177297353744507, "learning_rate": 0.0002971525867061902, "loss": 2.3781, "step": 12000 }, { "epoch": 0.1253067945279248, "grad_norm": 0.358868271112442, "learning_rate": 0.00029710507402863433, "loss": 2.3779, "step": 12100 }, { "epoch": 0.12634238787113075, "grad_norm": 0.36207178235054016, "learning_rate": 0.00029705765302757796, "loss": 2.3832, "step": 12200 }, { "epoch": 0.12737798121433674, "grad_norm": 0.35576069355010986, "learning_rate": 0.00029700936582869854, "loss": 2.3797, "step": 12300 }, { "epoch": 0.12841357455754274, "grad_norm": 0.36441880464553833, "learning_rate": 0.0002969606896136434, "loss": 2.3914, "step": 12400 }, { "epoch": 0.12944916790074873, "grad_norm": 0.3493978679180145, "learning_rate": 0.00029691162451121945, "loss": 2.3684, "step": 12500 }, { "epoch": 0.13048476124395472, "grad_norm": 0.34875813126564026, "learning_rate": 0.00029686217065126277, "loss": 2.3752, "step": 12600 }, { "epoch": 0.13152035458716071, "grad_norm": 0.32410070300102234, "learning_rate": 0.0002968123281646383, "loss": 2.3741, "step": 12700 }, { "epoch": 0.1325559479303667, "grad_norm": 0.3325338065624237, "learning_rate": 0.000296762097183239, "loss": 2.3643, "step": 12800 }, { "epoch": 0.1335915412735727, "grad_norm": 0.3336518406867981, "learning_rate": 0.0002967114778399862, "loss": 2.3641, "step": 12900 }, { "epoch": 0.1346271346167787, "grad_norm": 0.34080633521080017, "learning_rate": 0.00029666047026882884, "loss": 2.3558, "step": 13000 }, { "epoch": 0.13566272795998469, "grad_norm": 0.36425724625587463, "learning_rate": 0.0002966090746047431, "loss": 2.3802, "step": 13100 }, { "epoch": 0.13669832130319065, "grad_norm": 0.3432435095310211, "learning_rate": 0.0002965572909837322, "loss": 2.3636, "step": 13200 }, { "epoch": 0.13773391464639664, "grad_norm": 0.34149283170700073, "learning_rate": 0.00029650511954282596, "loss": 2.3668, "step": 13300 }, { "epoch": 0.13876950798960264, "grad_norm": 0.3454321622848511, "learning_rate": 0.00029645256042008046, "loss": 2.3992, "step": 13400 }, { "epoch": 0.13980510133280863, "grad_norm": 0.3457489013671875, "learning_rate": 0.0002963996137545776, "loss": 2.3534, "step": 13500 }, { "epoch": 0.14084069467601462, "grad_norm": 0.34762370586395264, "learning_rate": 0.00029634627968642486, "loss": 2.3647, "step": 13600 }, { "epoch": 0.14187628801922061, "grad_norm": 0.34544262290000916, "learning_rate": 0.0002962925583567549, "loss": 2.3552, "step": 13700 }, { "epoch": 0.1429118813624266, "grad_norm": 0.33848074078559875, "learning_rate": 0.000296238449907725, "loss": 2.3524, "step": 13800 }, { "epoch": 0.1439474747056326, "grad_norm": 0.33040791749954224, "learning_rate": 0.00029618395448251705, "loss": 2.3473, "step": 13900 }, { "epoch": 0.1449830680488386, "grad_norm": 0.3539000153541565, "learning_rate": 0.0002961290722253367, "loss": 2.3658, "step": 14000 }, { "epoch": 0.14601866139204459, "grad_norm": 0.33283117413520813, "learning_rate": 0.00029607380328141355, "loss": 2.3363, "step": 14100 }, { "epoch": 0.14705425473525055, "grad_norm": 0.3440040946006775, "learning_rate": 0.00029601814779700017, "loss": 2.3383, "step": 14200 }, { "epoch": 0.14808984807845654, "grad_norm": 0.375338077545166, "learning_rate": 0.0002959621059193721, "loss": 2.3565, "step": 14300 }, { "epoch": 0.14912544142166254, "grad_norm": 0.34358930587768555, "learning_rate": 0.00029590567779682735, "loss": 2.3497, "step": 14400 }, { "epoch": 0.15016103476486853, "grad_norm": 0.3195157051086426, "learning_rate": 0.000295848863578686, "loss": 2.3443, "step": 14500 }, { "epoch": 0.15119662810807452, "grad_norm": 0.3252195417881012, "learning_rate": 0.00029579166341528994, "loss": 2.3429, "step": 14600 }, { "epoch": 0.1522322214512805, "grad_norm": 0.3274465799331665, "learning_rate": 0.00029573407745800217, "loss": 2.344, "step": 14700 }, { "epoch": 0.1532678147944865, "grad_norm": 0.3528379797935486, "learning_rate": 0.00029567610585920663, "loss": 2.3323, "step": 14800 }, { "epoch": 0.1543034081376925, "grad_norm": 0.3459634482860565, "learning_rate": 0.00029561774877230775, "loss": 2.3294, "step": 14900 }, { "epoch": 0.1553390014808985, "grad_norm": 0.3366759121417999, "learning_rate": 0.00029555900635173005, "loss": 2.316, "step": 15000 }, { "epoch": 0.15637459482410446, "grad_norm": 0.3397423326969147, "learning_rate": 0.0002954998787529178, "loss": 2.3258, "step": 15100 }, { "epoch": 0.15741018816731045, "grad_norm": 0.32144978642463684, "learning_rate": 0.0002954403661323343, "loss": 2.3255, "step": 15200 }, { "epoch": 0.15844578151051644, "grad_norm": 0.336167573928833, "learning_rate": 0.00029538046864746207, "loss": 2.33, "step": 15300 }, { "epoch": 0.15948137485372244, "grad_norm": 0.3246804177761078, "learning_rate": 0.00029532018645680156, "loss": 2.3065, "step": 15400 }, { "epoch": 0.16051696819692843, "grad_norm": 0.3427474796772003, "learning_rate": 0.0002952595197198717, "loss": 2.3324, "step": 15500 }, { "epoch": 0.16155256154013442, "grad_norm": 0.322971910238266, "learning_rate": 0.00029519846859720877, "loss": 2.3272, "step": 15600 }, { "epoch": 0.1625881548833404, "grad_norm": 0.34383878111839294, "learning_rate": 0.00029513703325036617, "loss": 2.3284, "step": 15700 }, { "epoch": 0.1636237482265464, "grad_norm": 0.3304254114627838, "learning_rate": 0.0002950752138419142, "loss": 2.3072, "step": 15800 }, { "epoch": 0.1646593415697524, "grad_norm": 0.32697999477386475, "learning_rate": 0.00029501301053543933, "loss": 2.3272, "step": 15900 }, { "epoch": 0.1656949349129584, "grad_norm": 0.3319956064224243, "learning_rate": 0.0002949504234955439, "loss": 2.3223, "step": 16000 }, { "epoch": 0.16673052825616436, "grad_norm": 0.3203921914100647, "learning_rate": 0.0002948874528878458, "loss": 2.3195, "step": 16100 }, { "epoch": 0.16776612159937035, "grad_norm": 0.31924641132354736, "learning_rate": 0.00029482409887897793, "loss": 2.3028, "step": 16200 }, { "epoch": 0.16880171494257634, "grad_norm": 0.3042753040790558, "learning_rate": 0.0002947603616365874, "loss": 2.2924, "step": 16300 }, { "epoch": 0.16983730828578233, "grad_norm": 0.3248065710067749, "learning_rate": 0.00029469624132933596, "loss": 2.3217, "step": 16400 }, { "epoch": 0.17087290162898833, "grad_norm": 0.32555335760116577, "learning_rate": 0.0002946317381268987, "loss": 2.3292, "step": 16500 }, { "epoch": 0.17190849497219432, "grad_norm": 0.30988815426826477, "learning_rate": 0.00029456750295315663, "loss": 2.3032, "step": 16600 }, { "epoch": 0.1729440883154003, "grad_norm": 0.3234647810459137, "learning_rate": 0.0002945022382981002, "loss": 2.2918, "step": 16700 }, { "epoch": 0.1739796816586063, "grad_norm": 0.3368314206600189, "learning_rate": 0.00029443659126122875, "loss": 2.3135, "step": 16800 }, { "epoch": 0.1750152750018123, "grad_norm": 0.3322274088859558, "learning_rate": 0.00029437056201625744, "loss": 2.302, "step": 16900 }, { "epoch": 0.1760508683450183, "grad_norm": 0.3242185115814209, "learning_rate": 0.0002943041507379128, "loss": 2.3076, "step": 17000 }, { "epoch": 0.17708646168822426, "grad_norm": 0.31631043553352356, "learning_rate": 0.00029423735760193253, "loss": 2.3045, "step": 17100 }, { "epoch": 0.17812205503143025, "grad_norm": 0.325104296207428, "learning_rate": 0.00029417018278506444, "loss": 2.2935, "step": 17200 }, { "epoch": 0.17915764837463624, "grad_norm": 0.32389163970947266, "learning_rate": 0.0002941026264650666, "loss": 2.2962, "step": 17300 }, { "epoch": 0.18019324171784223, "grad_norm": 0.34304019808769226, "learning_rate": 0.0002940346888207065, "loss": 2.3073, "step": 17400 }, { "epoch": 0.18122883506104823, "grad_norm": 0.33360743522644043, "learning_rate": 0.0002939663700317608, "loss": 2.3038, "step": 17500 }, { "epoch": 0.18226442840425422, "grad_norm": 0.3324051797389984, "learning_rate": 0.00029389767027901463, "loss": 2.2903, "step": 17600 }, { "epoch": 0.1833000217474602, "grad_norm": 0.324114054441452, "learning_rate": 0.0002938285897442613, "loss": 2.2929, "step": 17700 }, { "epoch": 0.1843356150906662, "grad_norm": 0.5985817313194275, "learning_rate": 0.0002937591286103016, "loss": 2.2912, "step": 17800 }, { "epoch": 0.1853712084338722, "grad_norm": 0.3194010555744171, "learning_rate": 0.00029368928706094373, "loss": 2.2878, "step": 17900 }, { "epoch": 0.1864068017770782, "grad_norm": 0.3176760673522949, "learning_rate": 0.00029361906528100235, "loss": 2.3022, "step": 18000 }, { "epoch": 0.18744239512028416, "grad_norm": 0.32210999727249146, "learning_rate": 0.0002935484634562983, "loss": 2.2862, "step": 18100 }, { "epoch": 0.18847798846349015, "grad_norm": 0.3103707730770111, "learning_rate": 0.0002934774817736582, "loss": 2.288, "step": 18200 }, { "epoch": 0.18951358180669614, "grad_norm": 0.348433256149292, "learning_rate": 0.00029340612042091365, "loss": 2.2818, "step": 18300 }, { "epoch": 0.19054917514990213, "grad_norm": 0.3255865275859833, "learning_rate": 0.0002933343795869011, "loss": 2.2811, "step": 18400 }, { "epoch": 0.19158476849310813, "grad_norm": 0.3146674633026123, "learning_rate": 0.00029326225946146124, "loss": 2.2828, "step": 18500 }, { "epoch": 0.19262036183631412, "grad_norm": 0.32427600026130676, "learning_rate": 0.00029318976023543826, "loss": 2.2843, "step": 18600 }, { "epoch": 0.1936559551795201, "grad_norm": 0.3294953405857086, "learning_rate": 0.00029311688210067956, "loss": 2.2894, "step": 18700 }, { "epoch": 0.1946915485227261, "grad_norm": 0.3182157874107361, "learning_rate": 0.0002930436252500354, "loss": 2.2801, "step": 18800 }, { "epoch": 0.1957271418659321, "grad_norm": 0.321718692779541, "learning_rate": 0.00029296998987735784, "loss": 2.2873, "step": 18900 }, { "epoch": 0.19676273520913806, "grad_norm": 0.31337428092956543, "learning_rate": 0.000292895976177501, "loss": 2.2723, "step": 19000 }, { "epoch": 0.19779832855234405, "grad_norm": 0.31540966033935547, "learning_rate": 0.00029282158434631983, "loss": 2.2804, "step": 19100 }, { "epoch": 0.19883392189555005, "grad_norm": 0.31452780961990356, "learning_rate": 0.00029274681458067004, "loss": 2.2715, "step": 19200 }, { "epoch": 0.19986951523875604, "grad_norm": 0.31823790073394775, "learning_rate": 0.00029267166707840737, "loss": 2.2832, "step": 19300 }, { "epoch": 0.20090510858196203, "grad_norm": 0.308596134185791, "learning_rate": 0.0002925961420383871, "loss": 2.2708, "step": 19400 }, { "epoch": 0.20194070192516803, "grad_norm": 0.3338356018066406, "learning_rate": 0.0002925202396604636, "loss": 2.2736, "step": 19500 }, { "epoch": 0.20297629526837402, "grad_norm": 0.3159531354904175, "learning_rate": 0.00029244396014548973, "loss": 2.26, "step": 19600 }, { "epoch": 0.20401188861158, "grad_norm": 0.320416122674942, "learning_rate": 0.0002923673036953164, "loss": 2.2678, "step": 19700 }, { "epoch": 0.205047481954786, "grad_norm": 0.31474068760871887, "learning_rate": 0.0002922902705127918, "loss": 2.2738, "step": 19800 }, { "epoch": 0.206083075297992, "grad_norm": 0.3203798234462738, "learning_rate": 0.0002922128608017612, "loss": 2.2854, "step": 19900 }, { "epoch": 0.20711866864119796, "grad_norm": 0.3128805458545685, "learning_rate": 0.0002921350747670662, "loss": 2.2573, "step": 20000 }, { "epoch": 0.20815426198440395, "grad_norm": 0.31023895740509033, "learning_rate": 0.0002920569126145441, "loss": 2.2539, "step": 20100 }, { "epoch": 0.20918985532760995, "grad_norm": 0.3238907754421234, "learning_rate": 0.0002919783745510277, "loss": 2.2579, "step": 20200 }, { "epoch": 0.21022544867081594, "grad_norm": 0.3260408937931061, "learning_rate": 0.00029189946078434436, "loss": 2.2602, "step": 20300 }, { "epoch": 0.21126104201402193, "grad_norm": 0.3139326572418213, "learning_rate": 0.0002918201715233157, "loss": 2.2577, "step": 20400 }, { "epoch": 0.21229663535722793, "grad_norm": 0.31961774826049805, "learning_rate": 0.00029174050697775694, "loss": 2.2635, "step": 20500 }, { "epoch": 0.21333222870043392, "grad_norm": 0.30576133728027344, "learning_rate": 0.0002916604673584764, "loss": 2.2504, "step": 20600 }, { "epoch": 0.2143678220436399, "grad_norm": 0.3124041259288788, "learning_rate": 0.00029158005287727493, "loss": 2.2469, "step": 20700 }, { "epoch": 0.2154034153868459, "grad_norm": 0.31025636196136475, "learning_rate": 0.0002914992637469453, "loss": 2.2553, "step": 20800 }, { "epoch": 0.2164390087300519, "grad_norm": 0.3061777353286743, "learning_rate": 0.00029141810018127175, "loss": 2.2546, "step": 20900 }, { "epoch": 0.21747460207325786, "grad_norm": 0.3112472593784332, "learning_rate": 0.00029133656239502926, "loss": 2.2503, "step": 21000 }, { "epoch": 0.21851019541646385, "grad_norm": 0.316125750541687, "learning_rate": 0.00029125465060398314, "loss": 2.2644, "step": 21100 }, { "epoch": 0.21954578875966985, "grad_norm": 0.34522414207458496, "learning_rate": 0.0002911723650248883, "loss": 2.249, "step": 21200 }, { "epoch": 0.22058138210287584, "grad_norm": 0.30432987213134766, "learning_rate": 0.000291089705875489, "loss": 2.254, "step": 21300 }, { "epoch": 0.22161697544608183, "grad_norm": 0.2924303114414215, "learning_rate": 0.00029100667337451767, "loss": 2.2575, "step": 21400 }, { "epoch": 0.22265256878928782, "grad_norm": 0.30747348070144653, "learning_rate": 0.0002909241036443021, "loss": 2.2455, "step": 21500 }, { "epoch": 0.22368816213249382, "grad_norm": 0.3152693212032318, "learning_rate": 0.0002908403288283518, "loss": 2.2391, "step": 21600 }, { "epoch": 0.2247237554756998, "grad_norm": 0.3082772493362427, "learning_rate": 0.0002907561813207311, "loss": 2.2525, "step": 21700 }, { "epoch": 0.2257593488189058, "grad_norm": 0.28237590193748474, "learning_rate": 0.0002906716613441111, "loss": 2.2576, "step": 21800 }, { "epoch": 0.2267949421621118, "grad_norm": 0.32759952545166016, "learning_rate": 0.00029058676912214863, "loss": 2.2507, "step": 21900 }, { "epoch": 0.22783053550531776, "grad_norm": 0.3267344534397125, "learning_rate": 0.0002905015048794854, "loss": 2.245, "step": 22000 }, { "epoch": 0.22886612884852375, "grad_norm": 0.29457345604896545, "learning_rate": 0.00029041672704176744, "loss": 2.2656, "step": 22100 }, { "epoch": 0.22990172219172975, "grad_norm": 0.3085882365703583, "learning_rate": 0.000290330723150125, "loss": 2.2431, "step": 22200 }, { "epoch": 0.23093731553493574, "grad_norm": 0.3108998239040375, "learning_rate": 0.0002902443479153307, "loss": 2.2398, "step": 22300 }, { "epoch": 0.23197290887814173, "grad_norm": 0.3052406311035156, "learning_rate": 0.0002901576015659507, "loss": 2.249, "step": 22400 }, { "epoch": 0.23300850222134772, "grad_norm": 0.2990036904811859, "learning_rate": 0.00029007048433153307, "loss": 2.2304, "step": 22500 }, { "epoch": 0.23404409556455372, "grad_norm": 0.3134109079837799, "learning_rate": 0.0002899829964426075, "loss": 2.2455, "step": 22600 }, { "epoch": 0.2350796889077597, "grad_norm": 0.28845998644828796, "learning_rate": 0.0002898951381306845, "loss": 2.2519, "step": 22700 }, { "epoch": 0.2361152822509657, "grad_norm": 0.31323590874671936, "learning_rate": 0.00028980690962825465, "loss": 2.2377, "step": 22800 }, { "epoch": 0.23715087559417167, "grad_norm": 0.30466216802597046, "learning_rate": 0.00028971831116878817, "loss": 2.2389, "step": 22900 }, { "epoch": 0.23818646893737766, "grad_norm": 0.3084065914154053, "learning_rate": 0.0002896293429867344, "loss": 2.235, "step": 23000 }, { "epoch": 0.23922206228058365, "grad_norm": 0.3137773275375366, "learning_rate": 0.000289540900522399, "loss": 2.2343, "step": 23100 }, { "epoch": 0.24025765562378965, "grad_norm": 0.2849820852279663, "learning_rate": 0.0002894511972937653, "loss": 2.2275, "step": 23200 }, { "epoch": 0.24129324896699564, "grad_norm": 0.30652517080307007, "learning_rate": 0.00028936112504938087, "loss": 2.2287, "step": 23300 }, { "epoch": 0.24232884231020163, "grad_norm": 0.3142112195491791, "learning_rate": 0.000289270684027595, "loss": 2.2522, "step": 23400 }, { "epoch": 0.24336443565340762, "grad_norm": 0.31096917390823364, "learning_rate": 0.0002891807843868066, "loss": 2.2273, "step": 23500 }, { "epoch": 0.24440002899661362, "grad_norm": 0.30777618288993835, "learning_rate": 0.0002890896102109533, "loss": 2.2293, "step": 23600 }, { "epoch": 0.2454356223398196, "grad_norm": 0.2939918041229248, "learning_rate": 0.000288998067976181, "loss": 2.2222, "step": 23700 }, { "epoch": 0.2464712156830256, "grad_norm": 0.297870934009552, "learning_rate": 0.0002889061579247289, "loss": 2.2183, "step": 23800 }, { "epoch": 0.24750680902623157, "grad_norm": 0.3040159344673157, "learning_rate": 0.00028881388029980935, "loss": 2.2181, "step": 23900 }, { "epoch": 0.24854240236943756, "grad_norm": 0.30422455072402954, "learning_rate": 0.0002887212353456074, "loss": 2.223, "step": 24000 }, { "epoch": 0.24957799571264355, "grad_norm": 0.28880754113197327, "learning_rate": 0.0002886282233072801, "loss": 2.2245, "step": 24100 }, { "epoch": 0.2506135890558496, "grad_norm": 0.29438304901123047, "learning_rate": 0.00028853484443095584, "loss": 2.2178, "step": 24200 }, { "epoch": 0.25164918239905554, "grad_norm": 0.3096265196800232, "learning_rate": 0.0002884410989637339, "loss": 2.2166, "step": 24300 }, { "epoch": 0.2526847757422615, "grad_norm": 0.29660022258758545, "learning_rate": 0.00028834698715368354, "loss": 2.2213, "step": 24400 }, { "epoch": 0.2537203690854675, "grad_norm": 0.30047789216041565, "learning_rate": 0.0002882525092498433, "loss": 2.2251, "step": 24500 }, { "epoch": 0.2547559624286735, "grad_norm": 0.29826945066452026, "learning_rate": 0.0002881576655022207, "loss": 2.2266, "step": 24600 }, { "epoch": 0.2557915557718795, "grad_norm": 0.29573553800582886, "learning_rate": 0.0002880624561617913, "loss": 2.1965, "step": 24700 }, { "epoch": 0.2568271491150855, "grad_norm": 0.3082514703273773, "learning_rate": 0.00028796688148049793, "loss": 2.2083, "step": 24800 }, { "epoch": 0.2578627424582915, "grad_norm": 0.29338857531547546, "learning_rate": 0.00028787094171125045, "loss": 2.2254, "step": 24900 }, { "epoch": 0.25889833580149746, "grad_norm": 0.3036903738975525, "learning_rate": 0.0002877746371079245, "loss": 2.2088, "step": 25000 }, { "epoch": 0.2599339291447035, "grad_norm": 0.3120311498641968, "learning_rate": 0.00028767796792536146, "loss": 2.216, "step": 25100 }, { "epoch": 0.26096952248790944, "grad_norm": 0.3183603882789612, "learning_rate": 0.00028758093441936725, "loss": 2.2091, "step": 25200 }, { "epoch": 0.26200511583111546, "grad_norm": 0.3062897026538849, "learning_rate": 0.00028748353684671194, "loss": 2.2294, "step": 25300 }, { "epoch": 0.26304070917432143, "grad_norm": 0.3039403557777405, "learning_rate": 0.000287385775465129, "loss": 2.2104, "step": 25400 }, { "epoch": 0.2640763025175274, "grad_norm": 0.2830694615840912, "learning_rate": 0.0002872876505333145, "loss": 2.2245, "step": 25500 }, { "epoch": 0.2651118958607334, "grad_norm": 0.3058655261993408, "learning_rate": 0.00028718916231092683, "loss": 2.2073, "step": 25600 }, { "epoch": 0.2661474892039394, "grad_norm": 0.2940881550312042, "learning_rate": 0.0002870903110585853, "loss": 2.2055, "step": 25700 }, { "epoch": 0.2671830825471454, "grad_norm": 0.2841914892196655, "learning_rate": 0.0002869910970378702, "loss": 2.1967, "step": 25800 }, { "epoch": 0.26821867589035137, "grad_norm": 0.3028845191001892, "learning_rate": 0.00028689152051132165, "loss": 2.2228, "step": 25900 }, { "epoch": 0.2692542692335574, "grad_norm": 0.3082047402858734, "learning_rate": 0.00028679158174243905, "loss": 2.2041, "step": 26000 }, { "epoch": 0.27028986257676335, "grad_norm": 0.30441662669181824, "learning_rate": 0.0002866912809956804, "loss": 2.2024, "step": 26100 }, { "epoch": 0.27132545591996937, "grad_norm": 0.2886449694633484, "learning_rate": 0.00028659061853646145, "loss": 2.2018, "step": 26200 }, { "epoch": 0.27236104926317534, "grad_norm": 0.2993202209472656, "learning_rate": 0.00028648959463115526, "loss": 2.2082, "step": 26300 }, { "epoch": 0.2733966426063813, "grad_norm": 0.28834328055381775, "learning_rate": 0.0002863882095470912, "loss": 2.207, "step": 26400 }, { "epoch": 0.2744322359495873, "grad_norm": 0.3029148280620575, "learning_rate": 0.0002862864635525546, "loss": 2.2128, "step": 26500 }, { "epoch": 0.2754678292927933, "grad_norm": 0.28520163893699646, "learning_rate": 0.0002861843569167856, "loss": 2.2126, "step": 26600 }, { "epoch": 0.2765034226359993, "grad_norm": 0.31348302960395813, "learning_rate": 0.0002860818899099788, "loss": 2.218, "step": 26700 }, { "epoch": 0.2775390159792053, "grad_norm": 0.2876550853252411, "learning_rate": 0.0002859790628032825, "loss": 2.2054, "step": 26800 }, { "epoch": 0.2785746093224113, "grad_norm": 0.27926498651504517, "learning_rate": 0.00028587587586879757, "loss": 2.1972, "step": 26900 }, { "epoch": 0.27961020266561726, "grad_norm": 0.27992501854896545, "learning_rate": 0.00028577232937957743, "loss": 2.1919, "step": 27000 }, { "epoch": 0.2806457960088233, "grad_norm": 0.30239158868789673, "learning_rate": 0.0002856684236096268, "loss": 2.2035, "step": 27100 }, { "epoch": 0.28168138935202924, "grad_norm": 0.28912410140037537, "learning_rate": 0.000285564158833901, "loss": 2.1942, "step": 27200 }, { "epoch": 0.2827169826952352, "grad_norm": 0.3459964692592621, "learning_rate": 0.00028545953532830556, "loss": 2.1964, "step": 27300 }, { "epoch": 0.28375257603844123, "grad_norm": 0.2805766761302948, "learning_rate": 0.00028535455336969515, "loss": 2.1905, "step": 27400 }, { "epoch": 0.2847881693816472, "grad_norm": 0.2879911959171295, "learning_rate": 0.00028524921323587314, "loss": 2.2079, "step": 27500 }, { "epoch": 0.2858237627248532, "grad_norm": 0.28764310479164124, "learning_rate": 0.00028514351520559045, "loss": 2.1965, "step": 27600 }, { "epoch": 0.2868593560680592, "grad_norm": 0.28259527683258057, "learning_rate": 0.0002850374595585452, "loss": 2.1943, "step": 27700 }, { "epoch": 0.2878949494112652, "grad_norm": 0.30469658970832825, "learning_rate": 0.00028493104657538194, "loss": 2.1917, "step": 27800 }, { "epoch": 0.28893054275447116, "grad_norm": 0.28971099853515625, "learning_rate": 0.0002848242765376907, "loss": 2.1815, "step": 27900 }, { "epoch": 0.2899661360976772, "grad_norm": 0.29892224073410034, "learning_rate": 0.0002847171497280062, "loss": 2.1959, "step": 28000 }, { "epoch": 0.29100172944088315, "grad_norm": 0.3001098930835724, "learning_rate": 0.00028460966642980756, "loss": 2.1847, "step": 28100 }, { "epoch": 0.29203732278408917, "grad_norm": 0.28982076048851013, "learning_rate": 0.00028450182692751696, "loss": 2.1965, "step": 28200 }, { "epoch": 0.29307291612729514, "grad_norm": 0.28448495268821716, "learning_rate": 0.00028439363150649936, "loss": 2.1836, "step": 28300 }, { "epoch": 0.2941085094705011, "grad_norm": 0.2766580879688263, "learning_rate": 0.0002842850804530615, "loss": 2.1933, "step": 28400 }, { "epoch": 0.2951441028137071, "grad_norm": 0.29042986035346985, "learning_rate": 0.00028417617405445105, "loss": 2.1786, "step": 28500 }, { "epoch": 0.2961796961569131, "grad_norm": 0.2862723469734192, "learning_rate": 0.00028406691259885627, "loss": 2.1957, "step": 28600 }, { "epoch": 0.2972152895001191, "grad_norm": 0.2934880554676056, "learning_rate": 0.0002839572963754047, "loss": 2.1884, "step": 28700 }, { "epoch": 0.29825088284332507, "grad_norm": 0.3762242794036865, "learning_rate": 0.0002838473256741628, "loss": 2.1892, "step": 28800 }, { "epoch": 0.2992864761865311, "grad_norm": 0.29659008979797363, "learning_rate": 0.0002837370007861351, "loss": 2.1874, "step": 28900 }, { "epoch": 0.30032206952973706, "grad_norm": 0.29282739758491516, "learning_rate": 0.00028362632200326323, "loss": 2.1757, "step": 29000 }, { "epoch": 0.3013576628729431, "grad_norm": 0.278513640165329, "learning_rate": 0.00028351528961842546, "loss": 2.1821, "step": 29100 }, { "epoch": 0.30239325621614904, "grad_norm": 0.289573609828949, "learning_rate": 0.00028340390392543564, "loss": 2.1802, "step": 29200 }, { "epoch": 0.303428849559355, "grad_norm": 0.29329681396484375, "learning_rate": 0.00028329216521904253, "loss": 2.2034, "step": 29300 }, { "epoch": 0.304464442902561, "grad_norm": 0.2838422954082489, "learning_rate": 0.0002831800737949292, "loss": 2.1836, "step": 29400 }, { "epoch": 0.305500036245767, "grad_norm": 0.27982914447784424, "learning_rate": 0.00028306762994971193, "loss": 2.1736, "step": 29500 }, { "epoch": 0.306535629588973, "grad_norm": 0.29552334547042847, "learning_rate": 0.00028295483398093957, "loss": 2.1905, "step": 29600 }, { "epoch": 0.307571222932179, "grad_norm": 0.28646761178970337, "learning_rate": 0.0002828416861870928, "loss": 2.1993, "step": 29700 }, { "epoch": 0.308606816275385, "grad_norm": 0.2799683213233948, "learning_rate": 0.0002827281868675834, "loss": 2.1762, "step": 29800 }, { "epoch": 0.30964240961859096, "grad_norm": 0.28532129526138306, "learning_rate": 0.00028261433632275316, "loss": 2.17, "step": 29900 }, { "epoch": 0.310678002961797, "grad_norm": 0.27241483330726624, "learning_rate": 0.0002825001348538735, "loss": 2.1904, "step": 30000 }, { "epoch": 0.31171359630500295, "grad_norm": 0.29317033290863037, "learning_rate": 0.00028238558276314423, "loss": 2.1808, "step": 30100 }, { "epoch": 0.3127491896482089, "grad_norm": 0.2925361394882202, "learning_rate": 0.00028227068035369314, "loss": 2.1718, "step": 30200 }, { "epoch": 0.31378478299141493, "grad_norm": 0.28848564624786377, "learning_rate": 0.00028215542792957494, "loss": 2.1888, "step": 30300 }, { "epoch": 0.3148203763346209, "grad_norm": 0.2919078469276428, "learning_rate": 0.0002820398257957706, "loss": 2.1751, "step": 30400 }, { "epoch": 0.3158559696778269, "grad_norm": 0.29842934012413025, "learning_rate": 0.0002819238742581866, "loss": 2.1908, "step": 30500 }, { "epoch": 0.3168915630210329, "grad_norm": 0.2826724052429199, "learning_rate": 0.0002818075736236537, "loss": 2.1653, "step": 30600 }, { "epoch": 0.3179271563642389, "grad_norm": 0.2898552715778351, "learning_rate": 0.00028169092419992673, "loss": 2.161, "step": 30700 }, { "epoch": 0.31896274970744487, "grad_norm": 0.2759314775466919, "learning_rate": 0.0002815739262956834, "loss": 2.1746, "step": 30800 }, { "epoch": 0.3199983430506509, "grad_norm": 0.29369938373565674, "learning_rate": 0.00028145658022052356, "loss": 2.1628, "step": 30900 }, { "epoch": 0.32103393639385686, "grad_norm": 0.281052827835083, "learning_rate": 0.0002813388862849683, "loss": 2.1668, "step": 31000 }, { "epoch": 0.3220695297370629, "grad_norm": 0.2892392873764038, "learning_rate": 0.00028122084480045945, "loss": 2.1733, "step": 31100 }, { "epoch": 0.32310512308026884, "grad_norm": 0.2832517623901367, "learning_rate": 0.00028110245607935823, "loss": 2.1904, "step": 31200 }, { "epoch": 0.3241407164234748, "grad_norm": 0.300758421421051, "learning_rate": 0.000280983720434945, "loss": 2.1602, "step": 31300 }, { "epoch": 0.3251763097666808, "grad_norm": 0.28140220046043396, "learning_rate": 0.00028086463818141795, "loss": 2.1634, "step": 31400 }, { "epoch": 0.3262119031098868, "grad_norm": 0.2958952784538269, "learning_rate": 0.0002807452096338925, "loss": 2.1611, "step": 31500 }, { "epoch": 0.3272474964530928, "grad_norm": 0.2867650091648102, "learning_rate": 0.00028062543510840054, "loss": 2.1853, "step": 31600 }, { "epoch": 0.3282830897962988, "grad_norm": 0.27689510583877563, "learning_rate": 0.0002805053149218893, "loss": 2.1782, "step": 31700 }, { "epoch": 0.3293186831395048, "grad_norm": 0.282528817653656, "learning_rate": 0.0002803860557559203, "loss": 2.1739, "step": 31800 }, { "epoch": 0.33035427648271076, "grad_norm": 0.2702793776988983, "learning_rate": 0.000280266458428425, "loss": 2.1588, "step": 31900 }, { "epoch": 0.3313898698259168, "grad_norm": 0.28457000851631165, "learning_rate": 0.00028014531006063795, "loss": 2.1621, "step": 32000 }, { "epoch": 0.33242546316912275, "grad_norm": 0.30046817660331726, "learning_rate": 0.00028002381730233856, "loss": 2.1784, "step": 32100 }, { "epoch": 0.3334610565123287, "grad_norm": 0.2909027934074402, "learning_rate": 0.000279901980475021, "loss": 2.1812, "step": 32200 }, { "epoch": 0.33449664985553473, "grad_norm": 0.3148478865623474, "learning_rate": 0.0002797797999010898, "loss": 2.1702, "step": 32300 }, { "epoch": 0.3355322431987407, "grad_norm": 0.29774710536003113, "learning_rate": 0.0002796572759038592, "loss": 2.1696, "step": 32400 }, { "epoch": 0.3365678365419467, "grad_norm": 0.2851652503013611, "learning_rate": 0.0002795344088075523, "loss": 2.1529, "step": 32500 }, { "epoch": 0.3376034298851527, "grad_norm": 0.2804024815559387, "learning_rate": 0.0002794111989372999, "loss": 2.1575, "step": 32600 }, { "epoch": 0.3386390232283587, "grad_norm": 0.27687808871269226, "learning_rate": 0.00027928764661914, "loss": 2.1701, "step": 32700 }, { "epoch": 0.33967461657156467, "grad_norm": 0.27368998527526855, "learning_rate": 0.0002791637521800167, "loss": 2.1637, "step": 32800 }, { "epoch": 0.3407102099147707, "grad_norm": 0.28404197096824646, "learning_rate": 0.00027903951594777953, "loss": 2.1699, "step": 32900 }, { "epoch": 0.34174580325797665, "grad_norm": 0.2896747887134552, "learning_rate": 0.00027891493825118227, "loss": 2.154, "step": 33000 }, { "epoch": 0.3427813966011827, "grad_norm": 0.27398934960365295, "learning_rate": 0.00027879001941988254, "loss": 2.1595, "step": 33100 }, { "epoch": 0.34381698994438864, "grad_norm": 0.2844621241092682, "learning_rate": 0.0002786647597844405, "loss": 2.1503, "step": 33200 }, { "epoch": 0.3448525832875946, "grad_norm": 0.2817366123199463, "learning_rate": 0.00027853915967631824, "loss": 2.1439, "step": 33300 }, { "epoch": 0.3458881766308006, "grad_norm": 0.28939321637153625, "learning_rate": 0.0002784132194278786, "loss": 2.1637, "step": 33400 }, { "epoch": 0.3469237699740066, "grad_norm": 0.2798002362251282, "learning_rate": 0.0002782869393723849, "loss": 2.1705, "step": 33500 }, { "epoch": 0.3479593633172126, "grad_norm": 1.0665955543518066, "learning_rate": 0.0002781603198439992, "loss": 2.1631, "step": 33600 }, { "epoch": 0.3489949566604186, "grad_norm": 0.2818824350833893, "learning_rate": 0.0002780333611777822, "loss": 2.1656, "step": 33700 }, { "epoch": 0.3500305500036246, "grad_norm": 0.2864476442337036, "learning_rate": 0.0002779060637096919, "loss": 2.1487, "step": 33800 }, { "epoch": 0.35106614334683056, "grad_norm": 0.3252882957458496, "learning_rate": 0.00027777842777658283, "loss": 2.1625, "step": 33900 }, { "epoch": 0.3521017366900366, "grad_norm": 0.2838381826877594, "learning_rate": 0.0002776504537162052, "loss": 2.1406, "step": 34000 }, { "epoch": 0.35313733003324255, "grad_norm": 0.2922430634498596, "learning_rate": 0.0002775221418672039, "loss": 2.1559, "step": 34100 }, { "epoch": 0.3541729233764485, "grad_norm": 0.27985620498657227, "learning_rate": 0.0002773934925691179, "loss": 2.1518, "step": 34200 }, { "epoch": 0.35520851671965453, "grad_norm": 0.2863463759422302, "learning_rate": 0.0002772645061623788, "loss": 2.1485, "step": 34300 }, { "epoch": 0.3562441100628605, "grad_norm": 0.28856831789016724, "learning_rate": 0.0002771351829883105, "loss": 2.1389, "step": 34400 }, { "epoch": 0.3572797034060665, "grad_norm": 0.29371294379234314, "learning_rate": 0.000277005523389128, "loss": 2.138, "step": 34500 }, { "epoch": 0.3583152967492725, "grad_norm": 0.26485511660575867, "learning_rate": 0.00027687552770793653, "loss": 2.1567, "step": 34600 }, { "epoch": 0.3593508900924785, "grad_norm": 0.2836166322231293, "learning_rate": 0.0002767451962887305, "loss": 2.146, "step": 34700 }, { "epoch": 0.36038648343568447, "grad_norm": 0.29489925503730774, "learning_rate": 0.000276614529476393, "loss": 2.1344, "step": 34800 }, { "epoch": 0.3614220767788905, "grad_norm": 0.29321590065956116, "learning_rate": 0.0002764835276166947, "loss": 2.1373, "step": 34900 }, { "epoch": 0.36245767012209645, "grad_norm": 0.286069393157959, "learning_rate": 0.00027635219105629253, "loss": 2.1444, "step": 35000 }, { "epoch": 0.3634932634653024, "grad_norm": 0.27905821800231934, "learning_rate": 0.00027622052014272927, "loss": 2.1503, "step": 35100 }, { "epoch": 0.36452885680850844, "grad_norm": 0.26933392882347107, "learning_rate": 0.0002760885152244326, "loss": 2.1443, "step": 35200 }, { "epoch": 0.3655644501517144, "grad_norm": 0.29244914650917053, "learning_rate": 0.0002759561766507138, "loss": 2.1494, "step": 35300 }, { "epoch": 0.3666000434949204, "grad_norm": 0.3004661500453949, "learning_rate": 0.00027582350477176737, "loss": 2.1289, "step": 35400 }, { "epoch": 0.3676356368381264, "grad_norm": 0.2758159637451172, "learning_rate": 0.00027569049993866944, "loss": 2.1592, "step": 35500 }, { "epoch": 0.3686712301813324, "grad_norm": 0.29431262612342834, "learning_rate": 0.0002755571625033775, "loss": 2.1532, "step": 35600 }, { "epoch": 0.3697068235245384, "grad_norm": 0.27346983551979065, "learning_rate": 0.0002754234928187291, "loss": 2.1537, "step": 35700 }, { "epoch": 0.3707424168677444, "grad_norm": 0.2814033627510071, "learning_rate": 0.0002752894912384409, "loss": 2.1471, "step": 35800 }, { "epoch": 0.37177801021095036, "grad_norm": 0.28040292859077454, "learning_rate": 0.0002751551581171078, "loss": 2.1331, "step": 35900 }, { "epoch": 0.3728136035541564, "grad_norm": 0.2969619929790497, "learning_rate": 0.00027502184209147047, "loss": 2.1479, "step": 36000 }, { "epoch": 0.37384919689736235, "grad_norm": 0.3014155328273773, "learning_rate": 0.0002748868502618665, "loss": 2.1525, "step": 36100 }, { "epoch": 0.3748847902405683, "grad_norm": 0.2906414568424225, "learning_rate": 0.0002747515279566862, "loss": 2.1345, "step": 36200 }, { "epoch": 0.37592038358377433, "grad_norm": 0.28534749150276184, "learning_rate": 0.00027461723369114977, "loss": 2.1591, "step": 36300 }, { "epoch": 0.3769559769269803, "grad_norm": 0.2885964512825012, "learning_rate": 0.00027448125480576555, "loss": 2.1405, "step": 36400 }, { "epoch": 0.3779915702701863, "grad_norm": 0.28362932801246643, "learning_rate": 0.0002743449465180917, "loss": 2.1468, "step": 36500 }, { "epoch": 0.3790271636133923, "grad_norm": 0.2700118124485016, "learning_rate": 0.0002742083091888272, "loss": 2.1414, "step": 36600 }, { "epoch": 0.3800627569565983, "grad_norm": 0.2751924395561218, "learning_rate": 0.0002740713431795418, "loss": 2.1335, "step": 36700 }, { "epoch": 0.38109835029980427, "grad_norm": 0.25589242577552795, "learning_rate": 0.00027393404885267487, "loss": 2.1339, "step": 36800 }, { "epoch": 0.3821339436430103, "grad_norm": 0.28398755192756653, "learning_rate": 0.0002737964265715347, "loss": 2.1317, "step": 36900 }, { "epoch": 0.38316953698621625, "grad_norm": 0.264668732881546, "learning_rate": 0.00027365847670029727, "loss": 2.1356, "step": 37000 }, { "epoch": 0.3842051303294222, "grad_norm": 0.2758809030056, "learning_rate": 0.0002735201996040056, "loss": 2.1489, "step": 37100 }, { "epoch": 0.38524072367262824, "grad_norm": 0.5117419958114624, "learning_rate": 0.00027338159564856856, "loss": 2.1165, "step": 37200 }, { "epoch": 0.3862763170158342, "grad_norm": 0.2877856492996216, "learning_rate": 0.00027324266520075996, "loss": 2.1346, "step": 37300 }, { "epoch": 0.3873119103590402, "grad_norm": 0.3041744828224182, "learning_rate": 0.0002731034086282174, "loss": 2.1275, "step": 37400 }, { "epoch": 0.3883475037022462, "grad_norm": 0.27315208315849304, "learning_rate": 0.0002729638262994417, "loss": 2.1346, "step": 37500 }, { "epoch": 0.3893830970454522, "grad_norm": 0.29633215069770813, "learning_rate": 0.0002728239185837956, "loss": 2.1163, "step": 37600 }, { "epoch": 0.3904186903886582, "grad_norm": 0.28749552369117737, "learning_rate": 0.00027268368585150294, "loss": 2.1357, "step": 37700 }, { "epoch": 0.3914542837318642, "grad_norm": 0.3102233409881592, "learning_rate": 0.0002725431284736475, "loss": 2.1549, "step": 37800 }, { "epoch": 0.39248987707507016, "grad_norm": 0.26565811038017273, "learning_rate": 0.0002724022468221722, "loss": 2.1327, "step": 37900 }, { "epoch": 0.3935254704182761, "grad_norm": 0.26212960481643677, "learning_rate": 0.0002722610412698782, "loss": 2.1178, "step": 38000 }, { "epoch": 0.39456106376148214, "grad_norm": 0.29480013251304626, "learning_rate": 0.0002721195121904234, "loss": 2.1236, "step": 38100 }, { "epoch": 0.3955966571046881, "grad_norm": 0.2614869177341461, "learning_rate": 0.00027197765995832216, "loss": 2.13, "step": 38200 }, { "epoch": 0.39663225044789413, "grad_norm": 0.2832598388195038, "learning_rate": 0.00027183548494894385, "loss": 2.1276, "step": 38300 }, { "epoch": 0.3976678437911001, "grad_norm": 0.30248579382896423, "learning_rate": 0.000271692987538512, "loss": 2.1264, "step": 38400 }, { "epoch": 0.3987034371343061, "grad_norm": 0.28193435072898865, "learning_rate": 0.0002715501681041032, "loss": 2.1324, "step": 38500 }, { "epoch": 0.3997390304775121, "grad_norm": 0.26891788840293884, "learning_rate": 0.0002714070270236462, "loss": 2.1182, "step": 38600 }, { "epoch": 0.4007746238207181, "grad_norm": 0.271941602230072, "learning_rate": 0.00027126356467592103, "loss": 2.1321, "step": 38700 }, { "epoch": 0.40181021716392407, "grad_norm": 0.2680422067642212, "learning_rate": 0.0002711197814405576, "loss": 2.1353, "step": 38800 }, { "epoch": 0.4028458105071301, "grad_norm": 0.2849148213863373, "learning_rate": 0.0002709756776980353, "loss": 2.1226, "step": 38900 }, { "epoch": 0.40388140385033605, "grad_norm": 0.27381595969200134, "learning_rate": 0.0002708312538296812, "loss": 2.1026, "step": 39000 }, { "epoch": 0.404916997193542, "grad_norm": 0.26304328441619873, "learning_rate": 0.00027068651021766994, "loss": 2.1176, "step": 39100 }, { "epoch": 0.40595259053674804, "grad_norm": 0.27255985140800476, "learning_rate": 0.00027054144724502195, "loss": 2.1155, "step": 39200 }, { "epoch": 0.406988183879954, "grad_norm": 0.2747509777545929, "learning_rate": 0.00027039606529560286, "loss": 2.1178, "step": 39300 }, { "epoch": 0.40802377722316, "grad_norm": 0.2740117013454437, "learning_rate": 0.0002702518233353027, "loss": 2.1299, "step": 39400 }, { "epoch": 0.409059370566366, "grad_norm": 0.26572299003601074, "learning_rate": 0.0002701058077674675, "loss": 2.1111, "step": 39500 }, { "epoch": 0.410094963909572, "grad_norm": 0.2748086750507355, "learning_rate": 0.00026995947437565037, "loss": 2.1157, "step": 39600 }, { "epoch": 0.411130557252778, "grad_norm": 0.2732029855251312, "learning_rate": 0.00026981282354707876, "loss": 2.0955, "step": 39700 }, { "epoch": 0.412166150595984, "grad_norm": 0.2764507830142975, "learning_rate": 0.0002696658556698201, "loss": 2.1282, "step": 39800 }, { "epoch": 0.41320174393918996, "grad_norm": 0.2752842307090759, "learning_rate": 0.000269520045544338, "loss": 2.1079, "step": 39900 }, { "epoch": 0.4142373372823959, "grad_norm": 0.25819823145866394, "learning_rate": 0.00026937244789803055, "loss": 2.117, "step": 40000 }, { "epoch": 0.41527293062560194, "grad_norm": 0.35628432035446167, "learning_rate": 0.00026922453436835814, "loss": 2.1266, "step": 40100 }, { "epoch": 0.4163085239688079, "grad_norm": 0.260325163602829, "learning_rate": 0.0002690763053467294, "loss": 2.1247, "step": 40200 }, { "epoch": 0.41734411731201393, "grad_norm": 0.29574063420295715, "learning_rate": 0.00026892776122538803, "loss": 2.1205, "step": 40300 }, { "epoch": 0.4183797106552199, "grad_norm": 0.2717191278934479, "learning_rate": 0.0002687789023974114, "loss": 2.1145, "step": 40400 }, { "epoch": 0.4194153039984259, "grad_norm": 0.2918296158313751, "learning_rate": 0.00026862972925670975, "loss": 2.1235, "step": 40500 }, { "epoch": 0.4204508973416319, "grad_norm": 0.2755708396434784, "learning_rate": 0.0002684802421980252, "loss": 2.1095, "step": 40600 }, { "epoch": 0.4214864906848379, "grad_norm": 0.2766462564468384, "learning_rate": 0.0002683304416169301, "loss": 2.1242, "step": 40700 }, { "epoch": 0.42252208402804387, "grad_norm": 0.25984522700309753, "learning_rate": 0.0002681803279098269, "loss": 2.1097, "step": 40800 }, { "epoch": 0.42355767737124983, "grad_norm": 0.2688266634941101, "learning_rate": 0.0002680299014739466, "loss": 2.1035, "step": 40900 }, { "epoch": 0.42459327071445585, "grad_norm": 0.26839035749435425, "learning_rate": 0.00026787916270734743, "loss": 2.1392, "step": 41000 }, { "epoch": 0.4256288640576618, "grad_norm": 0.25181278586387634, "learning_rate": 0.0002677281120089144, "loss": 2.1201, "step": 41100 }, { "epoch": 0.42666445740086784, "grad_norm": 0.3058003783226013, "learning_rate": 0.00026757674977835784, "loss": 2.1044, "step": 41200 }, { "epoch": 0.4277000507440738, "grad_norm": 0.4025069773197174, "learning_rate": 0.00026742659468861814, "loss": 2.1134, "step": 41300 }, { "epoch": 0.4287356440872798, "grad_norm": 0.2872450649738312, "learning_rate": 0.0002672746137015549, "loss": 2.1164, "step": 41400 }, { "epoch": 0.4297712374304858, "grad_norm": 0.26822835206985474, "learning_rate": 0.00026712232238241537, "loss": 2.1082, "step": 41500 }, { "epoch": 0.4308068307736918, "grad_norm": 0.33343052864074707, "learning_rate": 0.00026696972113419276, "loss": 2.1055, "step": 41600 }, { "epoch": 0.43184242411689777, "grad_norm": 0.2828270494937897, "learning_rate": 0.0002668168103607005, "loss": 2.1137, "step": 41700 }, { "epoch": 0.4328780174601038, "grad_norm": 0.28803205490112305, "learning_rate": 0.0002666635904665711, "loss": 2.1109, "step": 41800 }, { "epoch": 0.43391361080330976, "grad_norm": 0.2918589115142822, "learning_rate": 0.00026651006185725496, "loss": 2.1216, "step": 41900 }, { "epoch": 0.4349492041465157, "grad_norm": 0.26440688967704773, "learning_rate": 0.0002663562249390196, "loss": 2.1155, "step": 42000 }, { "epoch": 0.43598479748972174, "grad_norm": 0.26510992646217346, "learning_rate": 0.0002662020801189482, "loss": 2.1008, "step": 42100 }, { "epoch": 0.4370203908329277, "grad_norm": 0.2779260277748108, "learning_rate": 0.0002660476278049388, "loss": 2.1058, "step": 42200 }, { "epoch": 0.43805598417613373, "grad_norm": 0.27170440554618835, "learning_rate": 0.00026589286840570303, "loss": 2.1252, "step": 42300 }, { "epoch": 0.4390915775193397, "grad_norm": 0.2927117943763733, "learning_rate": 0.0002657378023307653, "loss": 2.118, "step": 42400 }, { "epoch": 0.4401271708625457, "grad_norm": 0.2740183174610138, "learning_rate": 0.00026558242999046136, "loss": 2.1201, "step": 42500 }, { "epoch": 0.4411627642057517, "grad_norm": 0.25800156593322754, "learning_rate": 0.00026542675179593755, "loss": 2.0982, "step": 42600 }, { "epoch": 0.4421983575489577, "grad_norm": 0.2769431471824646, "learning_rate": 0.0002652707681591495, "loss": 2.1081, "step": 42700 }, { "epoch": 0.44323395089216366, "grad_norm": 0.280876487493515, "learning_rate": 0.00026511447949286105, "loss": 2.1048, "step": 42800 }, { "epoch": 0.44426954423536963, "grad_norm": 0.27514490485191345, "learning_rate": 0.00026495788621064335, "loss": 2.1067, "step": 42900 }, { "epoch": 0.44530513757857565, "grad_norm": 0.27088895440101624, "learning_rate": 0.0002648009887268734, "loss": 2.1016, "step": 43000 }, { "epoch": 0.4463407309217816, "grad_norm": 0.2773234248161316, "learning_rate": 0.00026464378745673345, "loss": 2.1024, "step": 43100 }, { "epoch": 0.44737632426498763, "grad_norm": 0.2844267785549164, "learning_rate": 0.00026448628281620945, "loss": 2.1164, "step": 43200 }, { "epoch": 0.4484119176081936, "grad_norm": 0.27871444821357727, "learning_rate": 0.0002643284752220901, "loss": 2.1105, "step": 43300 }, { "epoch": 0.4494475109513996, "grad_norm": 0.29704660177230835, "learning_rate": 0.0002641703650919659, "loss": 2.0936, "step": 43400 }, { "epoch": 0.4504831042946056, "grad_norm": 0.2896607518196106, "learning_rate": 0.0002640119528442279, "loss": 2.1342, "step": 43500 }, { "epoch": 0.4515186976378116, "grad_norm": 0.26584675908088684, "learning_rate": 0.00026385323889806657, "loss": 2.103, "step": 43600 }, { "epoch": 0.45255429098101757, "grad_norm": 0.272367388010025, "learning_rate": 0.00026369422367347067, "loss": 2.0987, "step": 43700 }, { "epoch": 0.4535898843242236, "grad_norm": 0.2840176820755005, "learning_rate": 0.0002635349075912264, "loss": 2.1092, "step": 43800 }, { "epoch": 0.45462547766742956, "grad_norm": 0.34640875458717346, "learning_rate": 0.00026337529107291587, "loss": 2.0919, "step": 43900 }, { "epoch": 0.4556610710106355, "grad_norm": 0.28771328926086426, "learning_rate": 0.0002632153745409163, "loss": 2.1032, "step": 44000 }, { "epoch": 0.45669666435384154, "grad_norm": 0.28308072686195374, "learning_rate": 0.0002630551584183989, "loss": 2.1056, "step": 44100 }, { "epoch": 0.4577322576970475, "grad_norm": 0.2610084116458893, "learning_rate": 0.00026289464312932747, "loss": 2.1041, "step": 44200 }, { "epoch": 0.4587678510402535, "grad_norm": 0.26441308856010437, "learning_rate": 0.0002627338290984576, "loss": 2.1045, "step": 44300 }, { "epoch": 0.4598034443834595, "grad_norm": 0.30344316363334656, "learning_rate": 0.00026257271675133535, "loss": 2.1095, "step": 44400 }, { "epoch": 0.4608390377266655, "grad_norm": 0.2567410469055176, "learning_rate": 0.00026241130651429625, "loss": 2.0944, "step": 44500 }, { "epoch": 0.4618746310698715, "grad_norm": 0.25524041056632996, "learning_rate": 0.00026224959881446404, "loss": 2.0829, "step": 44600 }, { "epoch": 0.4629102244130775, "grad_norm": 0.2526829242706299, "learning_rate": 0.0002620875940797496, "loss": 2.0996, "step": 44700 }, { "epoch": 0.46394581775628346, "grad_norm": 0.28170037269592285, "learning_rate": 0.00026192529273884985, "loss": 2.0943, "step": 44800 }, { "epoch": 0.46498141109948943, "grad_norm": 0.2842423617839813, "learning_rate": 0.00026176269522124664, "loss": 2.089, "step": 44900 }, { "epoch": 0.46601700444269545, "grad_norm": 0.2779385447502136, "learning_rate": 0.0002615998019572055, "loss": 2.1022, "step": 45000 }, { "epoch": 0.4670525977859014, "grad_norm": 0.25650814175605774, "learning_rate": 0.0002614366133777745, "loss": 2.0821, "step": 45100 }, { "epoch": 0.46808819112910743, "grad_norm": 0.30221664905548096, "learning_rate": 0.0002612731299147833, "loss": 2.106, "step": 45200 }, { "epoch": 0.4691237844723134, "grad_norm": 0.27854058146476746, "learning_rate": 0.00026110935200084187, "loss": 2.0903, "step": 45300 }, { "epoch": 0.4701593778155194, "grad_norm": 0.2626960873603821, "learning_rate": 0.0002609452800693393, "loss": 2.0919, "step": 45400 }, { "epoch": 0.4711949711587254, "grad_norm": 0.26258763670921326, "learning_rate": 0.00026078091455444265, "loss": 2.079, "step": 45500 }, { "epoch": 0.4722305645019314, "grad_norm": 0.26652756333351135, "learning_rate": 0.00026061625589109605, "loss": 2.0926, "step": 45600 }, { "epoch": 0.47326615784513737, "grad_norm": 0.27656105160713196, "learning_rate": 0.00026045130451501913, "loss": 2.0992, "step": 45700 }, { "epoch": 0.47430175118834333, "grad_norm": 0.2672775685787201, "learning_rate": 0.00026028606086270635, "loss": 2.0811, "step": 45800 }, { "epoch": 0.47533734453154936, "grad_norm": 0.27459150552749634, "learning_rate": 0.0002601205253714253, "loss": 2.1061, "step": 45900 }, { "epoch": 0.4763729378747553, "grad_norm": 0.2768867313861847, "learning_rate": 0.00025995469847921616, "loss": 2.0905, "step": 46000 }, { "epoch": 0.47740853121796134, "grad_norm": 0.2583029866218567, "learning_rate": 0.0002597885806248899, "loss": 2.0954, "step": 46100 }, { "epoch": 0.4784441245611673, "grad_norm": 0.27331164479255676, "learning_rate": 0.00025962217224802757, "loss": 2.0701, "step": 46200 }, { "epoch": 0.4794797179043733, "grad_norm": 0.27349796891212463, "learning_rate": 0.00025945547378897915, "loss": 2.0843, "step": 46300 }, { "epoch": 0.4805153112475793, "grad_norm": 0.25313106179237366, "learning_rate": 0.00025928848568886195, "loss": 2.0694, "step": 46400 }, { "epoch": 0.4815509045907853, "grad_norm": 0.28267520666122437, "learning_rate": 0.00025912120838955996, "loss": 2.0737, "step": 46500 }, { "epoch": 0.4825864979339913, "grad_norm": 0.28459227085113525, "learning_rate": 0.0002589536423337223, "loss": 2.0842, "step": 46600 }, { "epoch": 0.4836220912771973, "grad_norm": 0.2636272609233856, "learning_rate": 0.00025878578796476227, "loss": 2.0773, "step": 46700 }, { "epoch": 0.48465768462040326, "grad_norm": 0.264024019241333, "learning_rate": 0.00025861764572685615, "loss": 2.0706, "step": 46800 }, { "epoch": 0.4856932779636092, "grad_norm": 0.31876304745674133, "learning_rate": 0.0002584492160649419, "loss": 2.0745, "step": 46900 }, { "epoch": 0.48672887130681525, "grad_norm": 0.32105201482772827, "learning_rate": 0.000258280499424718, "loss": 2.0817, "step": 47000 }, { "epoch": 0.4877644646500212, "grad_norm": 0.2878267168998718, "learning_rate": 0.0002581114962526425, "loss": 2.0697, "step": 47100 }, { "epoch": 0.48880005799322723, "grad_norm": 0.2572951316833496, "learning_rate": 0.00025794220699593156, "loss": 2.076, "step": 47200 }, { "epoch": 0.4898356513364332, "grad_norm": 0.25959664583206177, "learning_rate": 0.00025777263210255836, "loss": 2.094, "step": 47300 }, { "epoch": 0.4908712446796392, "grad_norm": 0.27159959077835083, "learning_rate": 0.00025760277202125205, "loss": 2.0846, "step": 47400 }, { "epoch": 0.4919068380228452, "grad_norm": 0.26176223158836365, "learning_rate": 0.00025743262720149634, "loss": 2.0927, "step": 47500 }, { "epoch": 0.4929424313660512, "grad_norm": 0.25360336899757385, "learning_rate": 0.0002572621980935284, "loss": 2.096, "step": 47600 }, { "epoch": 0.49397802470925717, "grad_norm": 0.26628628373146057, "learning_rate": 0.0002570914851483378, "loss": 2.0816, "step": 47700 }, { "epoch": 0.49501361805246313, "grad_norm": 0.26940736174583435, "learning_rate": 0.0002569204888176651, "loss": 2.0756, "step": 47800 }, { "epoch": 0.49604921139566915, "grad_norm": 0.27887284755706787, "learning_rate": 0.00025674920955400074, "loss": 2.0948, "step": 47900 }, { "epoch": 0.4970848047388751, "grad_norm": 0.2778899073600769, "learning_rate": 0.0002565776478105839, "loss": 2.0878, "step": 48000 }, { "epoch": 0.49812039808208114, "grad_norm": 0.2750851511955261, "learning_rate": 0.00025640580404140135, "loss": 2.086, "step": 48100 }, { "epoch": 0.4991559914252871, "grad_norm": 0.25740739703178406, "learning_rate": 0.00025623367870118593, "loss": 2.0699, "step": 48200 }, { "epoch": 0.5001915847684931, "grad_norm": 0.2833239734172821, "learning_rate": 0.0002560612722454158, "loss": 2.0863, "step": 48300 }, { "epoch": 0.5012271781116991, "grad_norm": 0.266124963760376, "learning_rate": 0.00025588858513031287, "loss": 2.0912, "step": 48400 }, { "epoch": 0.502262771454905, "grad_norm": 0.27121883630752563, "learning_rate": 0.0002557156178128418, "loss": 2.0607, "step": 48500 }, { "epoch": 0.5032983647981111, "grad_norm": 0.32747069001197815, "learning_rate": 0.0002555423707507087, "loss": 2.0925, "step": 48600 }, { "epoch": 0.5043339581413171, "grad_norm": 0.261800616979599, "learning_rate": 0.0002553688444023599, "loss": 2.0803, "step": 48700 }, { "epoch": 0.505369551484523, "grad_norm": 0.25316718220710754, "learning_rate": 0.0002551950392269808, "loss": 2.0775, "step": 48800 }, { "epoch": 0.506405144827729, "grad_norm": 0.25740888714790344, "learning_rate": 0.0002550226978963248, "loss": 2.0779, "step": 48900 }, { "epoch": 0.507440738170935, "grad_norm": 0.38132351636886597, "learning_rate": 0.00025484833922417334, "loss": 2.0862, "step": 49000 }, { "epoch": 0.5084763315141411, "grad_norm": 0.259734183549881, "learning_rate": 0.0002546737031023524, "loss": 2.0578, "step": 49100 }, { "epoch": 0.509511924857347, "grad_norm": 0.2478483021259308, "learning_rate": 0.0002544987899929841, "loss": 2.0745, "step": 49200 }, { "epoch": 0.510547518200553, "grad_norm": 0.26957714557647705, "learning_rate": 0.0002543236003589234, "loss": 2.0635, "step": 49300 }, { "epoch": 0.511583111543759, "grad_norm": 0.28369587659835815, "learning_rate": 0.0002541481346637572, "loss": 2.081, "step": 49400 }, { "epoch": 0.512618704886965, "grad_norm": 0.2657583951950073, "learning_rate": 0.00025397239337180273, "loss": 2.0714, "step": 49500 }, { "epoch": 0.513654298230171, "grad_norm": 0.2918652296066284, "learning_rate": 0.00025379813847271703, "loss": 2.0683, "step": 49600 }, { "epoch": 0.514689891573377, "grad_norm": 0.25033167004585266, "learning_rate": 0.00025362185012740564, "loss": 2.0776, "step": 49700 }, { "epoch": 0.515725484916583, "grad_norm": 0.2708190679550171, "learning_rate": 0.0002534452875779602, "loss": 2.0812, "step": 49800 }, { "epoch": 0.5167610782597889, "grad_norm": 0.27673089504241943, "learning_rate": 0.00025326845129160046, "loss": 2.0835, "step": 49900 }, { "epoch": 0.5177966716029949, "grad_norm": 0.2670213580131531, "learning_rate": 0.00025309134173627057, "loss": 2.0835, "step": 50000 }, { "epoch": 0.5188322649462009, "grad_norm": 0.27812016010284424, "learning_rate": 0.0002529139593806379, "loss": 2.0634, "step": 50100 }, { "epoch": 0.519867858289407, "grad_norm": 0.2907336950302124, "learning_rate": 0.0002527363046940915, "loss": 2.0823, "step": 50200 }, { "epoch": 0.5209034516326129, "grad_norm": 0.257276326417923, "learning_rate": 0.0002525583781467413, "loss": 2.0503, "step": 50300 }, { "epoch": 0.5219390449758189, "grad_norm": 0.25246426463127136, "learning_rate": 0.0002523801802094165, "loss": 2.0681, "step": 50400 }, { "epoch": 0.5229746383190249, "grad_norm": 0.29212358593940735, "learning_rate": 0.00025220171135366436, "loss": 2.0902, "step": 50500 }, { "epoch": 0.5240102316622309, "grad_norm": 0.2634747326374054, "learning_rate": 0.0002520229720517494, "loss": 2.0721, "step": 50600 }, { "epoch": 0.5250458250054368, "grad_norm": 0.26756858825683594, "learning_rate": 0.0002518439627766513, "loss": 2.0667, "step": 50700 }, { "epoch": 0.5260814183486429, "grad_norm": 0.25739890336990356, "learning_rate": 0.00025166468400206454, "loss": 2.0639, "step": 50800 }, { "epoch": 0.5271170116918489, "grad_norm": 0.2692582607269287, "learning_rate": 0.0002514851362023965, "loss": 2.0622, "step": 50900 }, { "epoch": 0.5281526050350548, "grad_norm": 0.4750751256942749, "learning_rate": 0.0002513053198527667, "loss": 2.0491, "step": 51000 }, { "epoch": 0.5291881983782608, "grad_norm": 0.2643128037452698, "learning_rate": 0.000251125235429005, "loss": 2.0575, "step": 51100 }, { "epoch": 0.5302237917214668, "grad_norm": 0.27474769949913025, "learning_rate": 0.0002509448834076509, "loss": 2.0801, "step": 51200 }, { "epoch": 0.5312593850646729, "grad_norm": 0.2603282928466797, "learning_rate": 0.00025076426426595195, "loss": 2.0668, "step": 51300 }, { "epoch": 0.5322949784078788, "grad_norm": 0.27214357256889343, "learning_rate": 0.00025058337848186236, "loss": 2.0664, "step": 51400 }, { "epoch": 0.5333305717510848, "grad_norm": 0.25883734226226807, "learning_rate": 0.00025040222653404217, "loss": 2.0888, "step": 51500 }, { "epoch": 0.5343661650942908, "grad_norm": 0.257216215133667, "learning_rate": 0.00025022080890185565, "loss": 2.0649, "step": 51600 }, { "epoch": 0.5354017584374967, "grad_norm": 0.26857179403305054, "learning_rate": 0.00025003912606537013, "loss": 2.0638, "step": 51700 }, { "epoch": 0.5364373517807027, "grad_norm": 0.25013187527656555, "learning_rate": 0.00024985717850535473, "loss": 2.0763, "step": 51800 }, { "epoch": 0.5374729451239088, "grad_norm": 0.2590459883213043, "learning_rate": 0.00024967496670327897, "loss": 2.0583, "step": 51900 }, { "epoch": 0.5385085384671148, "grad_norm": 0.2819346487522125, "learning_rate": 0.0002494924911413119, "loss": 2.059, "step": 52000 }, { "epoch": 0.5395441318103207, "grad_norm": 0.27067631483078003, "learning_rate": 0.0002493097523023202, "loss": 2.0726, "step": 52100 }, { "epoch": 0.5405797251535267, "grad_norm": 0.2522888779640198, "learning_rate": 0.00024912675066986737, "loss": 2.0546, "step": 52200 }, { "epoch": 0.5416153184967327, "grad_norm": 0.3434728682041168, "learning_rate": 0.0002489434867282125, "loss": 2.0906, "step": 52300 }, { "epoch": 0.5426509118399387, "grad_norm": 0.2540159821510315, "learning_rate": 0.00024875996096230835, "loss": 2.085, "step": 52400 }, { "epoch": 0.5436865051831447, "grad_norm": 0.26025110483169556, "learning_rate": 0.000248576173857801, "loss": 2.072, "step": 52500 }, { "epoch": 0.5447220985263507, "grad_norm": 0.27003923058509827, "learning_rate": 0.00024839212590102783, "loss": 2.057, "step": 52600 }, { "epoch": 0.5457576918695567, "grad_norm": 0.2669220566749573, "learning_rate": 0.00024820781757901644, "loss": 2.0784, "step": 52700 }, { "epoch": 0.5467932852127626, "grad_norm": 0.26811477541923523, "learning_rate": 0.0002480232493794836, "loss": 2.0524, "step": 52800 }, { "epoch": 0.5478288785559686, "grad_norm": 0.28776293992996216, "learning_rate": 0.00024783842179083366, "loss": 2.0691, "step": 52900 }, { "epoch": 0.5488644718991746, "grad_norm": 0.2606721520423889, "learning_rate": 0.0002476533353021573, "loss": 2.0578, "step": 53000 }, { "epoch": 0.5499000652423807, "grad_norm": 0.2606494724750519, "learning_rate": 0.0002474679904032304, "loss": 2.0746, "step": 53100 }, { "epoch": 0.5509356585855866, "grad_norm": 0.26623305678367615, "learning_rate": 0.00024728238758451275, "loss": 2.0653, "step": 53200 }, { "epoch": 0.5519712519287926, "grad_norm": 0.26554974913597107, "learning_rate": 0.0002470965273371464, "loss": 2.0567, "step": 53300 }, { "epoch": 0.5530068452719986, "grad_norm": 0.277811735868454, "learning_rate": 0.00024691041015295476, "loss": 2.0768, "step": 53400 }, { "epoch": 0.5540424386152046, "grad_norm": 0.288761705160141, "learning_rate": 0.00024672403652444106, "loss": 2.0546, "step": 53500 }, { "epoch": 0.5550780319584105, "grad_norm": 0.2567208409309387, "learning_rate": 0.00024653740694478724, "loss": 2.0636, "step": 53600 }, { "epoch": 0.5561136253016166, "grad_norm": 0.2469269037246704, "learning_rate": 0.0002463505219078525, "loss": 2.0576, "step": 53700 }, { "epoch": 0.5571492186448226, "grad_norm": 0.2551238536834717, "learning_rate": 0.00024616338190817195, "loss": 2.06, "step": 53800 }, { "epoch": 0.5581848119880285, "grad_norm": 0.2401515692472458, "learning_rate": 0.0002459759874409555, "loss": 2.048, "step": 53900 }, { "epoch": 0.5592204053312345, "grad_norm": 0.25514185428619385, "learning_rate": 0.0002457883390020863, "loss": 2.0593, "step": 54000 }, { "epoch": 0.5602559986744405, "grad_norm": 0.261246919631958, "learning_rate": 0.0002456004370881197, "loss": 2.0455, "step": 54100 }, { "epoch": 0.5612915920176466, "grad_norm": 0.2556232213973999, "learning_rate": 0.00024541228219628183, "loss": 2.0602, "step": 54200 }, { "epoch": 0.5623271853608525, "grad_norm": 0.27437207102775574, "learning_rate": 0.000245223874824468, "loss": 2.0495, "step": 54300 }, { "epoch": 0.5633627787040585, "grad_norm": 0.27209702134132385, "learning_rate": 0.00024503521547124187, "loss": 2.0747, "step": 54400 }, { "epoch": 0.5643983720472645, "grad_norm": 0.24252693355083466, "learning_rate": 0.0002448481949873838, "loss": 2.0647, "step": 54500 }, { "epoch": 0.5654339653904704, "grad_norm": 0.24433889985084534, "learning_rate": 0.00024465903567703575, "loss": 2.0591, "step": 54600 }, { "epoch": 0.5664695587336764, "grad_norm": 0.337563693523407, "learning_rate": 0.0002444696258799527, "loss": 2.0585, "step": 54700 }, { "epoch": 0.5675051520768825, "grad_norm": 0.26767203211784363, "learning_rate": 0.00024427996609735085, "loss": 2.0562, "step": 54800 }, { "epoch": 0.5685407454200885, "grad_norm": 0.25655797123908997, "learning_rate": 0.00024409195715706483, "loss": 2.042, "step": 54900 }, { "epoch": 0.5695763387632944, "grad_norm": 0.24760207533836365, "learning_rate": 0.00024390180139703996, "loss": 2.0376, "step": 55000 }, { "epoch": 0.5706119321065004, "grad_norm": 0.25450366735458374, "learning_rate": 0.0002437113971540734, "loss": 2.0463, "step": 55100 }, { "epoch": 0.5716475254497064, "grad_norm": 0.25239065289497375, "learning_rate": 0.00024352074493201286, "loss": 2.0568, "step": 55200 }, { "epoch": 0.5726831187929124, "grad_norm": 0.2417611926794052, "learning_rate": 0.0002433298452353622, "loss": 2.0606, "step": 55300 }, { "epoch": 0.5737187121361184, "grad_norm": 0.25062090158462524, "learning_rate": 0.0002431386985692802, "loss": 2.0546, "step": 55400 }, { "epoch": 0.5747543054793244, "grad_norm": 0.2620830237865448, "learning_rate": 0.0002429473054395792, "loss": 2.0495, "step": 55500 }, { "epoch": 0.5757898988225304, "grad_norm": 0.24547018110752106, "learning_rate": 0.00024275566635272367, "loss": 2.0413, "step": 55600 }, { "epoch": 0.5768254921657363, "grad_norm": 0.26979315280914307, "learning_rate": 0.00024256378181582887, "loss": 2.0574, "step": 55700 }, { "epoch": 0.5778610855089423, "grad_norm": 0.2580671012401581, "learning_rate": 0.00024237165233665975, "loss": 2.0529, "step": 55800 }, { "epoch": 0.5788966788521483, "grad_norm": 0.26585468649864197, "learning_rate": 0.00024217927842362929, "loss": 2.0504, "step": 55900 }, { "epoch": 0.5799322721953544, "grad_norm": 0.26549628376960754, "learning_rate": 0.00024198666058579735, "loss": 2.0491, "step": 56000 }, { "epoch": 0.5809678655385603, "grad_norm": 0.2863091826438904, "learning_rate": 0.00024179379933286918, "loss": 2.0601, "step": 56100 }, { "epoch": 0.5820034588817663, "grad_norm": 0.24631023406982422, "learning_rate": 0.00024160069517519428, "loss": 2.0451, "step": 56200 }, { "epoch": 0.5830390522249723, "grad_norm": 0.2749427855014801, "learning_rate": 0.00024140734862376488, "loss": 2.0457, "step": 56300 }, { "epoch": 0.5840746455681783, "grad_norm": 0.2617676556110382, "learning_rate": 0.0002412137601902145, "loss": 2.0455, "step": 56400 }, { "epoch": 0.5851102389113843, "grad_norm": 0.3701717257499695, "learning_rate": 0.000241019930386817, "loss": 2.052, "step": 56500 }, { "epoch": 0.5861458322545903, "grad_norm": 0.24550727009773254, "learning_rate": 0.00024082585972648468, "loss": 2.0563, "step": 56600 }, { "epoch": 0.5871814255977963, "grad_norm": 0.2525273859500885, "learning_rate": 0.00024063154872276736, "loss": 2.0451, "step": 56700 }, { "epoch": 0.5882170189410022, "grad_norm": 0.27047526836395264, "learning_rate": 0.0002404369978898508, "loss": 2.0592, "step": 56800 }, { "epoch": 0.5892526122842082, "grad_norm": 0.24413259327411652, "learning_rate": 0.00024024220774255533, "loss": 2.0525, "step": 56900 }, { "epoch": 0.5902882056274142, "grad_norm": 0.2623489201068878, "learning_rate": 0.00024004717879633474, "loss": 2.0421, "step": 57000 }, { "epoch": 0.5913237989706203, "grad_norm": 0.28543058037757874, "learning_rate": 0.00023985191156727454, "loss": 2.0544, "step": 57100 }, { "epoch": 0.5923593923138262, "grad_norm": 0.24446852505207062, "learning_rate": 0.00023965640657209092, "loss": 2.0648, "step": 57200 }, { "epoch": 0.5933949856570322, "grad_norm": 0.25410106778144836, "learning_rate": 0.00023946066432812917, "loss": 2.0364, "step": 57300 }, { "epoch": 0.5944305790002382, "grad_norm": 0.25680220127105713, "learning_rate": 0.00023926468535336235, "loss": 2.0545, "step": 57400 }, { "epoch": 0.5954661723434441, "grad_norm": 0.26186510920524597, "learning_rate": 0.0002390684701663901, "loss": 2.0474, "step": 57500 }, { "epoch": 0.5965017656866501, "grad_norm": 0.2583567202091217, "learning_rate": 0.00023887201928643695, "loss": 2.0378, "step": 57600 }, { "epoch": 0.5975373590298562, "grad_norm": 0.244516059756279, "learning_rate": 0.00023867533323335127, "loss": 2.0404, "step": 57700 }, { "epoch": 0.5985729523730622, "grad_norm": 0.2766864001750946, "learning_rate": 0.00023847841252760366, "loss": 2.0486, "step": 57800 }, { "epoch": 0.5996085457162681, "grad_norm": 0.2514510154724121, "learning_rate": 0.00023828125769028569, "loss": 2.0457, "step": 57900 }, { "epoch": 0.6006441390594741, "grad_norm": 0.2914152443408966, "learning_rate": 0.00023808386924310845, "loss": 2.0312, "step": 58000 }, { "epoch": 0.6016797324026801, "grad_norm": 0.2614096701145172, "learning_rate": 0.00023788624770840127, "loss": 2.0417, "step": 58100 }, { "epoch": 0.6027153257458862, "grad_norm": 0.29130396246910095, "learning_rate": 0.00023768839360911024, "loss": 2.0409, "step": 58200 }, { "epoch": 0.6037509190890921, "grad_norm": 0.2692072093486786, "learning_rate": 0.00023749030746879685, "loss": 2.0489, "step": 58300 }, { "epoch": 0.6047865124322981, "grad_norm": 0.2614396810531616, "learning_rate": 0.0002372919898116367, "loss": 2.0468, "step": 58400 }, { "epoch": 0.6058221057755041, "grad_norm": 0.2660903334617615, "learning_rate": 0.00023709344116241784, "loss": 2.0407, "step": 58500 }, { "epoch": 0.60685769911871, "grad_norm": 0.24488508701324463, "learning_rate": 0.00023689466204653974, "loss": 2.0408, "step": 58600 }, { "epoch": 0.607893292461916, "grad_norm": 0.2617338299751282, "learning_rate": 0.0002366956529900118, "loss": 2.0407, "step": 58700 }, { "epoch": 0.608928885805122, "grad_norm": 0.27952614426612854, "learning_rate": 0.0002364964145194516, "loss": 2.0322, "step": 58800 }, { "epoch": 0.6099644791483281, "grad_norm": 0.25049981474876404, "learning_rate": 0.00023629694716208413, "loss": 2.0422, "step": 58900 }, { "epoch": 0.611000072491534, "grad_norm": 0.2596982419490814, "learning_rate": 0.00023609725144573977, "loss": 2.0398, "step": 59000 }, { "epoch": 0.61203566583474, "grad_norm": 0.24386398494243622, "learning_rate": 0.00023589732789885344, "loss": 2.0531, "step": 59100 }, { "epoch": 0.613071259177946, "grad_norm": 0.27489227056503296, "learning_rate": 0.00023569717705046266, "loss": 2.0468, "step": 59200 }, { "epoch": 0.614106852521152, "grad_norm": 0.24467331171035767, "learning_rate": 0.0002354967994302067, "loss": 2.0419, "step": 59300 }, { "epoch": 0.615142445864358, "grad_norm": 0.24510686099529266, "learning_rate": 0.0002352961955683248, "loss": 2.0373, "step": 59400 }, { "epoch": 0.616178039207564, "grad_norm": 0.2680029571056366, "learning_rate": 0.00023509536599565484, "loss": 2.0341, "step": 59500 }, { "epoch": 0.61721363255077, "grad_norm": 0.26040682196617126, "learning_rate": 0.0002348963229040437, "loss": 2.0328, "step": 59600 }, { "epoch": 0.6182492258939759, "grad_norm": 0.25532248616218567, "learning_rate": 0.0002346950457485369, "loss": 2.0589, "step": 59700 }, { "epoch": 0.6192848192371819, "grad_norm": 0.2541707456111908, "learning_rate": 0.0002344935444730047, "loss": 2.0361, "step": 59800 }, { "epoch": 0.620320412580388, "grad_norm": 0.25915879011154175, "learning_rate": 0.00023429181961065977, "loss": 2.0292, "step": 59900 }, { "epoch": 0.621356005923594, "grad_norm": 0.23977366089820862, "learning_rate": 0.00023408987169530645, "loss": 2.0412, "step": 60000 }, { "epoch": 0.6223915992667999, "grad_norm": 0.258909672498703, "learning_rate": 0.00023388770126133923, "loss": 2.0325, "step": 60100 }, { "epoch": 0.6234271926100059, "grad_norm": 0.2565488815307617, "learning_rate": 0.00023368530884374157, "loss": 2.0231, "step": 60200 }, { "epoch": 0.6244627859532119, "grad_norm": 0.2569873631000519, "learning_rate": 0.0002334826949780842, "loss": 2.0422, "step": 60300 }, { "epoch": 0.6254983792964178, "grad_norm": 0.2474081963300705, "learning_rate": 0.000233279860200524, "loss": 2.0375, "step": 60400 }, { "epoch": 0.6265339726396238, "grad_norm": 0.2544451951980591, "learning_rate": 0.00023307680504780232, "loss": 2.0363, "step": 60500 }, { "epoch": 0.6275695659828299, "grad_norm": 0.26724931597709656, "learning_rate": 0.0002328735300572437, "loss": 2.0241, "step": 60600 }, { "epoch": 0.6286051593260359, "grad_norm": 0.2859784662723541, "learning_rate": 0.00023267003576675437, "loss": 2.0273, "step": 60700 }, { "epoch": 0.6296407526692418, "grad_norm": 0.24628446996212006, "learning_rate": 0.00023246632271482092, "loss": 2.0266, "step": 60800 }, { "epoch": 0.6306763460124478, "grad_norm": 0.2625367343425751, "learning_rate": 0.00023226239144050887, "loss": 2.0176, "step": 60900 }, { "epoch": 0.6317119393556538, "grad_norm": 0.24925480782985687, "learning_rate": 0.0002320582424834611, "loss": 2.017, "step": 61000 }, { "epoch": 0.6327475326988599, "grad_norm": 0.26570290327072144, "learning_rate": 0.00023185387638389662, "loss": 2.0213, "step": 61100 }, { "epoch": 0.6337831260420658, "grad_norm": 0.2579900920391083, "learning_rate": 0.00023164929368260894, "loss": 2.0412, "step": 61200 }, { "epoch": 0.6348187193852718, "grad_norm": 0.25061291456222534, "learning_rate": 0.00023144449492096478, "loss": 2.0318, "step": 61300 }, { "epoch": 0.6358543127284778, "grad_norm": 0.25929582118988037, "learning_rate": 0.00023123948064090262, "loss": 2.044, "step": 61400 }, { "epoch": 0.6368899060716837, "grad_norm": 0.24248668551445007, "learning_rate": 0.0002310342513849313, "loss": 2.0235, "step": 61500 }, { "epoch": 0.6379254994148897, "grad_norm": 0.23884373903274536, "learning_rate": 0.00023082880769612838, "loss": 2.0375, "step": 61600 }, { "epoch": 0.6389610927580958, "grad_norm": 0.24625521898269653, "learning_rate": 0.00023062315011813898, "loss": 2.03, "step": 61700 }, { "epoch": 0.6399966861013018, "grad_norm": 0.26001396775245667, "learning_rate": 0.00023041727919517418, "loss": 2.0111, "step": 61800 }, { "epoch": 0.6410322794445077, "grad_norm": 0.26365765929222107, "learning_rate": 0.00023021119547200959, "loss": 2.0453, "step": 61900 }, { "epoch": 0.6420678727877137, "grad_norm": 0.263185054063797, "learning_rate": 0.00023000489949398395, "loss": 2.0339, "step": 62000 }, { "epoch": 0.6431034661309197, "grad_norm": 0.25443968176841736, "learning_rate": 0.00022979839180699765, "loss": 2.0346, "step": 62100 }, { "epoch": 0.6441390594741258, "grad_norm": 0.256481409072876, "learning_rate": 0.0002295916729575114, "loss": 2.0544, "step": 62200 }, { "epoch": 0.6451746528173317, "grad_norm": 0.2831222414970398, "learning_rate": 0.00022938474349254448, "loss": 2.0245, "step": 62300 }, { "epoch": 0.6462102461605377, "grad_norm": 0.23776544630527496, "learning_rate": 0.0002291776039596737, "loss": 2.0314, "step": 62400 }, { "epoch": 0.6472458395037437, "grad_norm": 0.24850037693977356, "learning_rate": 0.00022897025490703173, "loss": 2.0381, "step": 62500 }, { "epoch": 0.6482814328469496, "grad_norm": 0.2560637891292572, "learning_rate": 0.00022876269688330551, "loss": 2.027, "step": 62600 }, { "epoch": 0.6493170261901556, "grad_norm": 0.25750643014907837, "learning_rate": 0.00022855493043773514, "loss": 2.0219, "step": 62700 }, { "epoch": 0.6503526195333617, "grad_norm": 0.2579807937145233, "learning_rate": 0.00022834695612011213, "loss": 2.0228, "step": 62800 }, { "epoch": 0.6513882128765677, "grad_norm": 0.25793078541755676, "learning_rate": 0.0002281408573216056, "loss": 2.023, "step": 62900 }, { "epoch": 0.6524238062197736, "grad_norm": 0.27499786019325256, "learning_rate": 0.00022793247097643046, "loss": 2.023, "step": 63000 }, { "epoch": 0.6534593995629796, "grad_norm": 0.2614569067955017, "learning_rate": 0.00022772387840635483, "loss": 2.0255, "step": 63100 }, { "epoch": 0.6544949929061856, "grad_norm": 0.24664799869060516, "learning_rate": 0.0002275150801633565, "loss": 2.0213, "step": 63200 }, { "epoch": 0.6555305862493915, "grad_norm": 0.24992604553699493, "learning_rate": 0.00022730607679995722, "loss": 2.0345, "step": 63300 }, { "epoch": 0.6565661795925976, "grad_norm": 0.2893676161766052, "learning_rate": 0.00022709686886922183, "loss": 2.0216, "step": 63400 }, { "epoch": 0.6576017729358036, "grad_norm": 0.24437743425369263, "learning_rate": 0.00022688745692475625, "loss": 2.03, "step": 63500 }, { "epoch": 0.6586373662790096, "grad_norm": 0.2941676080226898, "learning_rate": 0.0002266778415207064, "loss": 2.0105, "step": 63600 }, { "epoch": 0.6596729596222155, "grad_norm": 0.27731597423553467, "learning_rate": 0.00022646802321175653, "loss": 2.0328, "step": 63700 }, { "epoch": 0.6607085529654215, "grad_norm": 0.2532055079936981, "learning_rate": 0.0002262580025531278, "loss": 2.0307, "step": 63800 }, { "epoch": 0.6617441463086275, "grad_norm": 0.24110396206378937, "learning_rate": 0.0002260477801005769, "loss": 2.0357, "step": 63900 }, { "epoch": 0.6627797396518336, "grad_norm": 0.24422571063041687, "learning_rate": 0.00022583735641039458, "loss": 2.0285, "step": 64000 }, { "epoch": 0.6638153329950395, "grad_norm": 0.2425120323896408, "learning_rate": 0.00022562673203940377, "loss": 2.0194, "step": 64100 }, { "epoch": 0.6648509263382455, "grad_norm": 0.23561102151870728, "learning_rate": 0.00022541590754495888, "loss": 2.0207, "step": 64200 }, { "epoch": 0.6658865196814515, "grad_norm": 0.25439807772636414, "learning_rate": 0.00022520488348494352, "loss": 2.0162, "step": 64300 }, { "epoch": 0.6669221130246574, "grad_norm": 0.2837825119495392, "learning_rate": 0.0002249936604177697, "loss": 2.0296, "step": 64400 }, { "epoch": 0.6679577063678634, "grad_norm": 0.24267669022083282, "learning_rate": 0.00022478223890237578, "loss": 2.0146, "step": 64500 }, { "epoch": 0.6689932997110695, "grad_norm": 0.2377946972846985, "learning_rate": 0.00022457061949822534, "loss": 2.0312, "step": 64600 }, { "epoch": 0.6700288930542755, "grad_norm": 0.24502654373645782, "learning_rate": 0.00022435880276530572, "loss": 2.0214, "step": 64700 }, { "epoch": 0.6710644863974814, "grad_norm": 0.2545722424983978, "learning_rate": 0.0002241467892641263, "loss": 2.0084, "step": 64800 }, { "epoch": 0.6721000797406874, "grad_norm": 0.2797817885875702, "learning_rate": 0.0002239345795557172, "loss": 2.0247, "step": 64900 }, { "epoch": 0.6731356730838934, "grad_norm": 0.2518276274204254, "learning_rate": 0.00022372217420162777, "loss": 2.0214, "step": 65000 }, { "epoch": 0.6741712664270995, "grad_norm": 0.24065960943698883, "learning_rate": 0.00022350957376392498, "loss": 2.0276, "step": 65100 }, { "epoch": 0.6752068597703054, "grad_norm": 0.2677326202392578, "learning_rate": 0.00022329677880519218, "loss": 2.0084, "step": 65200 }, { "epoch": 0.6762424531135114, "grad_norm": 0.23652470111846924, "learning_rate": 0.00022308378988852726, "loss": 2.018, "step": 65300 }, { "epoch": 0.6772780464567174, "grad_norm": 0.24972309172153473, "learning_rate": 0.00022287060757754162, "loss": 2.021, "step": 65400 }, { "epoch": 0.6783136397999233, "grad_norm": 0.27198293805122375, "learning_rate": 0.00022265723243635824, "loss": 2.0286, "step": 65500 }, { "epoch": 0.6793492331431293, "grad_norm": 0.26782822608947754, "learning_rate": 0.00022244366502961033, "loss": 2.0193, "step": 65600 }, { "epoch": 0.6803848264863354, "grad_norm": 0.24242490530014038, "learning_rate": 0.00022222990592244, "loss": 2.0172, "step": 65700 }, { "epoch": 0.6814204198295414, "grad_norm": 0.24801382422447205, "learning_rate": 0.00022201595568049662, "loss": 2.0178, "step": 65800 }, { "epoch": 0.6824560131727473, "grad_norm": 0.260474294424057, "learning_rate": 0.00022180395721949528, "loss": 2.0287, "step": 65900 }, { "epoch": 0.6834916065159533, "grad_norm": 0.2730622887611389, "learning_rate": 0.00022158962830418834, "loss": 2.0186, "step": 66000 }, { "epoch": 0.6845271998591593, "grad_norm": 0.2648753225803375, "learning_rate": 0.0002213751099484109, "loss": 2.0429, "step": 66100 }, { "epoch": 0.6855627932023654, "grad_norm": 0.26469871401786804, "learning_rate": 0.00022116040271982142, "loss": 2.0209, "step": 66200 }, { "epoch": 0.6865983865455713, "grad_norm": 0.25764134526252747, "learning_rate": 0.0002209455071865782, "loss": 1.9953, "step": 66300 }, { "epoch": 0.6876339798887773, "grad_norm": 0.2497389167547226, "learning_rate": 0.00022073042391733774, "loss": 2.0253, "step": 66400 }, { "epoch": 0.6886695732319833, "grad_norm": 0.24793919920921326, "learning_rate": 0.0002205151534812534, "loss": 2.0303, "step": 66500 }, { "epoch": 0.6897051665751892, "grad_norm": 0.25411102175712585, "learning_rate": 0.00022029969644797386, "loss": 2.0062, "step": 66600 }, { "epoch": 0.6907407599183952, "grad_norm": 0.2514505386352539, "learning_rate": 0.00022008405338764137, "loss": 2.0081, "step": 66700 }, { "epoch": 0.6917763532616013, "grad_norm": 0.2650257349014282, "learning_rate": 0.00021986822487089067, "loss": 2.0103, "step": 66800 }, { "epoch": 0.6928119466048073, "grad_norm": 0.2810705602169037, "learning_rate": 0.00021965221146884716, "loss": 2.0181, "step": 66900 }, { "epoch": 0.6938475399480132, "grad_norm": 0.5697815418243408, "learning_rate": 0.0002194360137531255, "loss": 2.0088, "step": 67000 }, { "epoch": 0.6948831332912192, "grad_norm": 0.2409239411354065, "learning_rate": 0.00021921963229582807, "loss": 2.017, "step": 67100 }, { "epoch": 0.6959187266344252, "grad_norm": 0.24445517361164093, "learning_rate": 0.00021900306766954342, "loss": 2.0179, "step": 67200 }, { "epoch": 0.6969543199776311, "grad_norm": 0.25933966040611267, "learning_rate": 0.00021878632044734493, "loss": 2.0315, "step": 67300 }, { "epoch": 0.6979899133208372, "grad_norm": 0.25666582584381104, "learning_rate": 0.00021856939120278905, "loss": 2.009, "step": 67400 }, { "epoch": 0.6990255066640432, "grad_norm": 0.2586144506931305, "learning_rate": 0.00021835228050991393, "loss": 2.0078, "step": 67500 }, { "epoch": 0.7000611000072492, "grad_norm": 0.2634826600551605, "learning_rate": 0.0002181349889432379, "loss": 1.9967, "step": 67600 }, { "epoch": 0.7010966933504551, "grad_norm": 0.4144364297389984, "learning_rate": 0.0002179175170777579, "loss": 2.0002, "step": 67700 }, { "epoch": 0.7021322866936611, "grad_norm": 0.24579951167106628, "learning_rate": 0.0002176998654889479, "loss": 2.0194, "step": 67800 }, { "epoch": 0.7031678800368671, "grad_norm": 0.2515772581100464, "learning_rate": 0.00021748203475275752, "loss": 2.0194, "step": 67900 }, { "epoch": 0.7042034733800732, "grad_norm": 0.25677964091300964, "learning_rate": 0.00021726402544561054, "loss": 1.9973, "step": 68000 }, { "epoch": 0.7052390667232791, "grad_norm": 0.2586514353752136, "learning_rate": 0.0002170458381444031, "loss": 2.004, "step": 68100 }, { "epoch": 0.7062746600664851, "grad_norm": 0.24003922939300537, "learning_rate": 0.00021682747342650248, "loss": 2.0248, "step": 68200 }, { "epoch": 0.7073102534096911, "grad_norm": 0.24476157128810883, "learning_rate": 0.00021660893186974537, "loss": 1.9968, "step": 68300 }, { "epoch": 0.708345846752897, "grad_norm": 0.26136067509651184, "learning_rate": 0.00021639021405243635, "loss": 2.0065, "step": 68400 }, { "epoch": 0.709381440096103, "grad_norm": 0.27516236901283264, "learning_rate": 0.0002161713205533466, "loss": 2.0104, "step": 68500 }, { "epoch": 0.7104170334393091, "grad_norm": 0.251360684633255, "learning_rate": 0.00021595225195171198, "loss": 2.0052, "step": 68600 }, { "epoch": 0.7114526267825151, "grad_norm": 0.2523316442966461, "learning_rate": 0.00021573300882723193, "loss": 2.0121, "step": 68700 }, { "epoch": 0.712488220125721, "grad_norm": 0.26253175735473633, "learning_rate": 0.00021551359176006754, "loss": 2.0015, "step": 68800 }, { "epoch": 0.713523813468927, "grad_norm": 0.2567383944988251, "learning_rate": 0.00021529400133084018, "loss": 2.0141, "step": 68900 }, { "epoch": 0.714559406812133, "grad_norm": 0.25148722529411316, "learning_rate": 0.00021507423812063014, "loss": 2.0168, "step": 69000 }, { "epoch": 0.7155950001553391, "grad_norm": 0.2394070029258728, "learning_rate": 0.00021485430271097472, "loss": 1.999, "step": 69100 }, { "epoch": 0.716630593498545, "grad_norm": 0.2716984450817108, "learning_rate": 0.00021463419568386706, "loss": 2.0055, "step": 69200 }, { "epoch": 0.717666186841751, "grad_norm": 0.24565351009368896, "learning_rate": 0.0002144139176217543, "loss": 2.0069, "step": 69300 }, { "epoch": 0.718701780184957, "grad_norm": 0.2614271640777588, "learning_rate": 0.00021419346910753632, "loss": 2.0015, "step": 69400 }, { "epoch": 0.7197373735281629, "grad_norm": 0.26003938913345337, "learning_rate": 0.00021397285072456386, "loss": 2.0131, "step": 69500 }, { "epoch": 0.7207729668713689, "grad_norm": 0.23795704543590546, "learning_rate": 0.00021375206305663735, "loss": 2.0046, "step": 69600 }, { "epoch": 0.721808560214575, "grad_norm": 0.24042952060699463, "learning_rate": 0.00021353110668800512, "loss": 2.0024, "step": 69700 }, { "epoch": 0.722844153557781, "grad_norm": 0.24879732728004456, "learning_rate": 0.00021330998220336182, "loss": 1.9848, "step": 69800 }, { "epoch": 0.7238797469009869, "grad_norm": 0.2585168182849884, "learning_rate": 0.00021308869018784712, "loss": 1.9973, "step": 69900 }, { "epoch": 0.7249153402441929, "grad_norm": 0.24421517550945282, "learning_rate": 0.0002128672312270439, "loss": 2.0058, "step": 70000 }, { "epoch": 0.7259509335873989, "grad_norm": 0.2535378932952881, "learning_rate": 0.00021264560590697688, "loss": 1.9808, "step": 70100 }, { "epoch": 0.7269865269306048, "grad_norm": 0.23428113758563995, "learning_rate": 0.00021242381481411093, "loss": 2.0007, "step": 70200 }, { "epoch": 0.7280221202738109, "grad_norm": 0.23449118435382843, "learning_rate": 0.00021220185853534963, "loss": 2.0007, "step": 70300 }, { "epoch": 0.7290577136170169, "grad_norm": 0.2528586983680725, "learning_rate": 0.00021197973765803377, "loss": 2.0014, "step": 70400 }, { "epoch": 0.7300933069602229, "grad_norm": 0.25187110900878906, "learning_rate": 0.00021175745276993943, "loss": 1.9949, "step": 70500 }, { "epoch": 0.7311289003034288, "grad_norm": 0.2575116753578186, "learning_rate": 0.00021153722974939294, "loss": 2.0008, "step": 70600 }, { "epoch": 0.7321644936466348, "grad_norm": 0.23507162928581238, "learning_rate": 0.00021131462023022917, "loss": 2.0106, "step": 70700 }, { "epoch": 0.7332000869898408, "grad_norm": 0.23946309089660645, "learning_rate": 0.00021109184846032067, "loss": 1.9841, "step": 70800 }, { "epoch": 0.7342356803330469, "grad_norm": 0.23551732301712036, "learning_rate": 0.00021086891502916617, "loss": 1.9878, "step": 70900 }, { "epoch": 0.7352712736762528, "grad_norm": 0.2436358779668808, "learning_rate": 0.00021064582052669208, "loss": 2.0185, "step": 71000 }, { "epoch": 0.7363068670194588, "grad_norm": 0.2541455030441284, "learning_rate": 0.00021042256554325102, "loss": 1.9967, "step": 71100 }, { "epoch": 0.7373424603626648, "grad_norm": 0.24278046190738678, "learning_rate": 0.00021019915066962036, "loss": 1.9972, "step": 71200 }, { "epoch": 0.7383780537058707, "grad_norm": 0.2799344062805176, "learning_rate": 0.0002099755764970005, "loss": 2.0035, "step": 71300 }, { "epoch": 0.7394136470490767, "grad_norm": 0.2736598551273346, "learning_rate": 0.00020975184361701342, "loss": 2.0084, "step": 71400 }, { "epoch": 0.7404492403922828, "grad_norm": 0.24888059496879578, "learning_rate": 0.00020953019231237998, "loss": 2.0127, "step": 71500 }, { "epoch": 0.7414848337354888, "grad_norm": 0.25305336713790894, "learning_rate": 0.00020930614536649725, "loss": 2.0102, "step": 71600 }, { "epoch": 0.7425204270786947, "grad_norm": 0.25712886452674866, "learning_rate": 0.00020908194148469588, "loss": 1.9889, "step": 71700 }, { "epoch": 0.7435560204219007, "grad_norm": 0.23383405804634094, "learning_rate": 0.00020885758126026427, "loss": 1.9825, "step": 71800 }, { "epoch": 0.7445916137651067, "grad_norm": 0.24051252007484436, "learning_rate": 0.00020863306528690428, "loss": 1.9924, "step": 71900 }, { "epoch": 0.7456272071083128, "grad_norm": 0.2587604522705078, "learning_rate": 0.0002084083941587302, "loss": 1.9766, "step": 72000 }, { "epoch": 0.7466628004515187, "grad_norm": 0.2326751947402954, "learning_rate": 0.0002081835684702667, "loss": 1.9881, "step": 72100 }, { "epoch": 0.7476983937947247, "grad_norm": 0.23553723096847534, "learning_rate": 0.0002079585888164475, "loss": 1.995, "step": 72200 }, { "epoch": 0.7487339871379307, "grad_norm": 0.2775946259498596, "learning_rate": 0.0002077334557926138, "loss": 1.9852, "step": 72300 }, { "epoch": 0.7497695804811366, "grad_norm": 0.2542034685611725, "learning_rate": 0.0002075081699945125, "loss": 1.9967, "step": 72400 }, { "epoch": 0.7508051738243426, "grad_norm": 0.24794547259807587, "learning_rate": 0.00020728273201829492, "loss": 1.998, "step": 72500 }, { "epoch": 0.7518407671675487, "grad_norm": 0.2581765651702881, "learning_rate": 0.00020705714246051506, "loss": 1.9734, "step": 72600 }, { "epoch": 0.7528763605107547, "grad_norm": 0.26272204518318176, "learning_rate": 0.00020683140191812794, "loss": 1.9937, "step": 72700 }, { "epoch": 0.7539119538539606, "grad_norm": 0.2397502362728119, "learning_rate": 0.0002066055109884882, "loss": 1.9834, "step": 72800 }, { "epoch": 0.7549475471971666, "grad_norm": 0.2633424699306488, "learning_rate": 0.00020637947026934836, "loss": 1.9898, "step": 72900 }, { "epoch": 0.7559831405403726, "grad_norm": 0.2585466802120209, "learning_rate": 0.0002061532803588574, "loss": 1.9861, "step": 73000 }, { "epoch": 0.7570187338835785, "grad_norm": 0.24099628627300262, "learning_rate": 0.00020592694185555906, "loss": 1.983, "step": 73100 }, { "epoch": 0.7580543272267846, "grad_norm": 0.2721065878868103, "learning_rate": 0.0002057004553583903, "loss": 1.9928, "step": 73200 }, { "epoch": 0.7590899205699906, "grad_norm": 0.24477939307689667, "learning_rate": 0.00020547382146667955, "loss": 1.9895, "step": 73300 }, { "epoch": 0.7601255139131966, "grad_norm": 0.2544732391834259, "learning_rate": 0.00020524704078014555, "loss": 1.9802, "step": 73400 }, { "epoch": 0.7611611072564025, "grad_norm": 0.23969247937202454, "learning_rate": 0.00020502011389889524, "loss": 1.9913, "step": 73500 }, { "epoch": 0.7621967005996085, "grad_norm": 0.2460978776216507, "learning_rate": 0.00020479304142342252, "loss": 2.0015, "step": 73600 }, { "epoch": 0.7632322939428146, "grad_norm": 0.241098552942276, "learning_rate": 0.00020456582395460664, "loss": 1.9768, "step": 73700 }, { "epoch": 0.7642678872860206, "grad_norm": 0.26376426219940186, "learning_rate": 0.00020433846209371042, "loss": 1.9925, "step": 73800 }, { "epoch": 0.7653034806292265, "grad_norm": 0.2637104094028473, "learning_rate": 0.00020411095644237877, "loss": 1.9977, "step": 73900 }, { "epoch": 0.7663390739724325, "grad_norm": 0.23449744284152985, "learning_rate": 0.00020388330760263715, "loss": 1.9905, "step": 74000 }, { "epoch": 0.7673746673156385, "grad_norm": 0.25932568311691284, "learning_rate": 0.00020365551617688993, "loss": 1.9766, "step": 74100 }, { "epoch": 0.7684102606588444, "grad_norm": 0.2501455247402191, "learning_rate": 0.00020342758276791875, "loss": 2.0043, "step": 74200 }, { "epoch": 0.7694458540020505, "grad_norm": 0.24867907166481018, "learning_rate": 0.000203199507978881, "loss": 1.9807, "step": 74300 }, { "epoch": 0.7704814473452565, "grad_norm": 0.2546774446964264, "learning_rate": 0.00020297129241330817, "loss": 1.9995, "step": 74400 }, { "epoch": 0.7715170406884625, "grad_norm": 0.25409987568855286, "learning_rate": 0.00020274293667510426, "loss": 1.9734, "step": 74500 }, { "epoch": 0.7725526340316684, "grad_norm": 0.23935672640800476, "learning_rate": 0.0002025144413685442, "loss": 1.9891, "step": 74600 }, { "epoch": 0.7735882273748744, "grad_norm": 0.24716085195541382, "learning_rate": 0.00020228580709827227, "loss": 1.9811, "step": 74700 }, { "epoch": 0.7746238207180804, "grad_norm": 0.24609538912773132, "learning_rate": 0.00020205703446930047, "loss": 1.9914, "step": 74800 }, { "epoch": 0.7756594140612865, "grad_norm": 0.25824907422065735, "learning_rate": 0.00020182812408700692, "loss": 1.9934, "step": 74900 }, { "epoch": 0.7766950074044924, "grad_norm": 0.2460188865661621, "learning_rate": 0.0002015990765571343, "loss": 1.9789, "step": 75000 }, { "epoch": 0.7777306007476984, "grad_norm": 0.2536432147026062, "learning_rate": 0.0002013721850003908, "loss": 1.9967, "step": 75100 }, { "epoch": 0.7787661940909044, "grad_norm": 0.2792772352695465, "learning_rate": 0.00020114286635038482, "loss": 1.9815, "step": 75200 }, { "epoch": 0.7798017874341103, "grad_norm": 0.23500213027000427, "learning_rate": 0.0002009134123661287, "loss": 1.9847, "step": 75300 }, { "epoch": 0.7808373807773163, "grad_norm": 0.24085991084575653, "learning_rate": 0.00020068382365480357, "loss": 1.9984, "step": 75400 }, { "epoch": 0.7818729741205224, "grad_norm": 0.2575315535068512, "learning_rate": 0.0002004541008239471, "loss": 1.9921, "step": 75500 }, { "epoch": 0.7829085674637284, "grad_norm": 0.24535280466079712, "learning_rate": 0.00020022424448145175, "loss": 1.983, "step": 75600 }, { "epoch": 0.7839441608069343, "grad_norm": 0.2506721317768097, "learning_rate": 0.00019999425523556345, "loss": 1.9923, "step": 75700 }, { "epoch": 0.7849797541501403, "grad_norm": 0.2742637097835541, "learning_rate": 0.0001997641336948797, "loss": 1.9909, "step": 75800 }, { "epoch": 0.7860153474933463, "grad_norm": 0.24554753303527832, "learning_rate": 0.00019953388046834808, "loss": 1.9896, "step": 75900 }, { "epoch": 0.7870509408365522, "grad_norm": 0.23163291811943054, "learning_rate": 0.0001993034961652647, "loss": 1.9651, "step": 76000 }, { "epoch": 0.7880865341797583, "grad_norm": 0.2596520185470581, "learning_rate": 0.00019907298139527243, "loss": 1.9825, "step": 76100 }, { "epoch": 0.7891221275229643, "grad_norm": 0.2425704300403595, "learning_rate": 0.00019884233676835956, "loss": 1.9673, "step": 76200 }, { "epoch": 0.7901577208661703, "grad_norm": 0.2547663152217865, "learning_rate": 0.00019861156289485775, "loss": 1.9843, "step": 76300 }, { "epoch": 0.7911933142093762, "grad_norm": 0.25040435791015625, "learning_rate": 0.00019838066038544095, "loss": 1.9905, "step": 76400 }, { "epoch": 0.7922289075525822, "grad_norm": 0.232724130153656, "learning_rate": 0.00019814962985112325, "loss": 1.9842, "step": 76500 }, { "epoch": 0.7932645008957883, "grad_norm": 0.2375490516424179, "learning_rate": 0.00019792078411142526, "loss": 1.9864, "step": 76600 }, { "epoch": 0.7943000942389943, "grad_norm": 0.2362678200006485, "learning_rate": 0.00019768950062669157, "loss": 1.978, "step": 76700 }, { "epoch": 0.7953356875822002, "grad_norm": 0.2464078664779663, "learning_rate": 0.000197458090946004, "loss": 1.976, "step": 76800 }, { "epoch": 0.7963712809254062, "grad_norm": 0.32281023263931274, "learning_rate": 0.00019722655568171876, "loss": 1.9793, "step": 76900 }, { "epoch": 0.7974068742686122, "grad_norm": 0.25067856907844543, "learning_rate": 0.00019699489544652442, "loss": 1.9769, "step": 77000 }, { "epoch": 0.7984424676118181, "grad_norm": 0.23218156397342682, "learning_rate": 0.0001967631108534404, "loss": 1.9794, "step": 77100 }, { "epoch": 0.7994780609550242, "grad_norm": 0.22952504456043243, "learning_rate": 0.00019653120251581497, "loss": 1.9813, "step": 77200 }, { "epoch": 0.8005136542982302, "grad_norm": 0.24321745336055756, "learning_rate": 0.00019629917104732402, "loss": 1.9874, "step": 77300 }, { "epoch": 0.8015492476414362, "grad_norm": 0.24674613773822784, "learning_rate": 0.00019606701706196918, "loss": 1.9906, "step": 77400 }, { "epoch": 0.8025848409846421, "grad_norm": 0.23949798941612244, "learning_rate": 0.00019583474117407633, "loss": 1.9938, "step": 77500 }, { "epoch": 0.8036204343278481, "grad_norm": 0.24143968522548676, "learning_rate": 0.00019560234399829395, "loss": 1.9814, "step": 77600 }, { "epoch": 0.8046560276710542, "grad_norm": 0.2421048879623413, "learning_rate": 0.00019536982614959132, "loss": 1.9712, "step": 77700 }, { "epoch": 0.8056916210142602, "grad_norm": 0.25979506969451904, "learning_rate": 0.00019513718824325723, "loss": 1.974, "step": 77800 }, { "epoch": 0.8067272143574661, "grad_norm": 0.23263593018054962, "learning_rate": 0.00019490443089489816, "loss": 1.9544, "step": 77900 }, { "epoch": 0.8077628077006721, "grad_norm": 0.25045648217201233, "learning_rate": 0.0001946715547204364, "loss": 1.9644, "step": 78000 }, { "epoch": 0.8087984010438781, "grad_norm": 0.2449960708618164, "learning_rate": 0.00019443856033610912, "loss": 1.9776, "step": 78100 }, { "epoch": 0.809833994387084, "grad_norm": 0.2566138207912445, "learning_rate": 0.00019420544835846583, "loss": 1.966, "step": 78200 }, { "epoch": 0.81086958773029, "grad_norm": 0.23505167663097382, "learning_rate": 0.00019397221940436758, "loss": 1.9692, "step": 78300 }, { "epoch": 0.8119051810734961, "grad_norm": 0.24771443009376526, "learning_rate": 0.00019373887409098475, "loss": 1.9659, "step": 78400 }, { "epoch": 0.8129407744167021, "grad_norm": 0.23590005934238434, "learning_rate": 0.00019350541303579568, "loss": 1.9829, "step": 78500 }, { "epoch": 0.813976367759908, "grad_norm": 0.23944394290447235, "learning_rate": 0.00019327183685658505, "loss": 1.955, "step": 78600 }, { "epoch": 0.815011961103114, "grad_norm": 0.274551659822464, "learning_rate": 0.00019303814617144211, "loss": 1.9831, "step": 78700 }, { "epoch": 0.81604755444632, "grad_norm": 0.2385895699262619, "learning_rate": 0.00019280434159875913, "loss": 1.9824, "step": 78800 }, { "epoch": 0.817083147789526, "grad_norm": 0.24645298719406128, "learning_rate": 0.00019257042375722975, "loss": 1.9825, "step": 78900 }, { "epoch": 0.818118741132732, "grad_norm": 0.24062161147594452, "learning_rate": 0.0001923363932658474, "loss": 1.9812, "step": 79000 }, { "epoch": 0.819154334475938, "grad_norm": 0.5793871879577637, "learning_rate": 0.0001921022507439035, "loss": 1.9621, "step": 79100 }, { "epoch": 0.820189927819144, "grad_norm": 0.24148312211036682, "learning_rate": 0.00019186799681098597, "loss": 1.9651, "step": 79200 }, { "epoch": 0.8212255211623499, "grad_norm": 0.23721380531787872, "learning_rate": 0.0001916336320869776, "loss": 1.9896, "step": 79300 }, { "epoch": 0.822261114505556, "grad_norm": 0.2557445168495178, "learning_rate": 0.00019139915719205424, "loss": 1.9711, "step": 79400 }, { "epoch": 0.823296707848762, "grad_norm": 0.2520018517971039, "learning_rate": 0.00019116457274668348, "loss": 1.975, "step": 79500 }, { "epoch": 0.824332301191968, "grad_norm": 0.24257516860961914, "learning_rate": 0.00019092987937162256, "loss": 1.9558, "step": 79600 }, { "epoch": 0.8253678945351739, "grad_norm": 0.2335796356201172, "learning_rate": 0.00019069507768791706, "loss": 1.9811, "step": 79700 }, { "epoch": 0.8264034878783799, "grad_norm": 0.24471502006053925, "learning_rate": 0.00019046016831689928, "loss": 1.968, "step": 79800 }, { "epoch": 0.8274390812215859, "grad_norm": 0.2673594653606415, "learning_rate": 0.00019022515188018627, "loss": 1.9678, "step": 79900 }, { "epoch": 0.8284746745647918, "grad_norm": 0.2933145761489868, "learning_rate": 0.00018999002899967857, "loss": 1.9793, "step": 80000 }, { "epoch": 0.8295102679079979, "grad_norm": 0.24044671654701233, "learning_rate": 0.00018975480029755833, "loss": 1.9704, "step": 80100 }, { "epoch": 0.8305458612512039, "grad_norm": 0.2411203682422638, "learning_rate": 0.00018951946639628775, "loss": 1.954, "step": 80200 }, { "epoch": 0.8315814545944099, "grad_norm": 0.2325044423341751, "learning_rate": 0.00018928402791860735, "loss": 1.9797, "step": 80300 }, { "epoch": 0.8326170479376158, "grad_norm": 0.21887433528900146, "learning_rate": 0.0001890484854875344, "loss": 1.9542, "step": 80400 }, { "epoch": 0.8336526412808218, "grad_norm": 0.2547757029533386, "learning_rate": 0.00018881283972636137, "loss": 1.9822, "step": 80500 }, { "epoch": 0.8346882346240279, "grad_norm": 0.24561412632465363, "learning_rate": 0.00018857944924968018, "loss": 1.9808, "step": 80600 }, { "epoch": 0.8357238279672339, "grad_norm": 0.2566587030887604, "learning_rate": 0.00018834359971701408, "loss": 1.9645, "step": 80700 }, { "epoch": 0.8367594213104398, "grad_norm": 0.23394763469696045, "learning_rate": 0.00018810764871951655, "loss": 1.953, "step": 80800 }, { "epoch": 0.8377950146536458, "grad_norm": 0.23540134727954865, "learning_rate": 0.000187871596881561, "loss": 1.9648, "step": 80900 }, { "epoch": 0.8388306079968518, "grad_norm": 0.25846412777900696, "learning_rate": 0.00018763544482778793, "loss": 1.9768, "step": 81000 }, { "epoch": 0.8398662013400577, "grad_norm": 0.2589336037635803, "learning_rate": 0.00018739919318310277, "loss": 1.9775, "step": 81100 }, { "epoch": 0.8409017946832638, "grad_norm": 0.26688146591186523, "learning_rate": 0.0001871628425726747, "loss": 1.9579, "step": 81200 }, { "epoch": 0.8419373880264698, "grad_norm": 0.2547050714492798, "learning_rate": 0.00018692639362193463, "loss": 1.9696, "step": 81300 }, { "epoch": 0.8429729813696758, "grad_norm": 0.27237093448638916, "learning_rate": 0.0001866898469565738, "loss": 1.9752, "step": 81400 }, { "epoch": 0.8440085747128817, "grad_norm": 0.24073271453380585, "learning_rate": 0.00018645320320254198, "loss": 1.9655, "step": 81500 }, { "epoch": 0.8450441680560877, "grad_norm": 0.24638384580612183, "learning_rate": 0.00018621646298604583, "loss": 1.9692, "step": 81600 }, { "epoch": 0.8460797613992938, "grad_norm": 0.24088312685489655, "learning_rate": 0.00018597962693354736, "loss": 1.9662, "step": 81700 }, { "epoch": 0.8471153547424997, "grad_norm": 0.27659064531326294, "learning_rate": 0.00018574269567176198, "loss": 1.9726, "step": 81800 }, { "epoch": 0.8481509480857057, "grad_norm": 0.2347290813922882, "learning_rate": 0.00018550566982765738, "loss": 1.956, "step": 81900 }, { "epoch": 0.8491865414289117, "grad_norm": 0.24576643109321594, "learning_rate": 0.0001852709216894609, "loss": 1.9516, "step": 82000 }, { "epoch": 0.8502221347721177, "grad_norm": 0.2404729276895523, "learning_rate": 0.0001850337094927892, "loss": 1.9564, "step": 82100 }, { "epoch": 0.8512577281153236, "grad_norm": 0.23867496848106384, "learning_rate": 0.00018479640458991733, "loss": 1.9476, "step": 82200 }, { "epoch": 0.8522933214585297, "grad_norm": 0.24426643550395966, "learning_rate": 0.0001845590076088015, "loss": 1.953, "step": 82300 }, { "epoch": 0.8533289148017357, "grad_norm": 0.23025211691856384, "learning_rate": 0.00018432151917764167, "loss": 1.9681, "step": 82400 }, { "epoch": 0.8543645081449417, "grad_norm": 0.2465585321187973, "learning_rate": 0.0001840839399248797, "loss": 1.958, "step": 82500 }, { "epoch": 0.8554001014881476, "grad_norm": 0.24563215672969818, "learning_rate": 0.00018384627047919773, "loss": 1.9733, "step": 82600 }, { "epoch": 0.8564356948313536, "grad_norm": 0.25759199261665344, "learning_rate": 0.0001836085114695167, "loss": 1.9629, "step": 82700 }, { "epoch": 0.8574712881745596, "grad_norm": 0.2293255478143692, "learning_rate": 0.00018337066352499444, "loss": 1.9605, "step": 82800 }, { "epoch": 0.8585068815177656, "grad_norm": 0.24663272500038147, "learning_rate": 0.00018313272727502424, "loss": 1.9544, "step": 82900 }, { "epoch": 0.8595424748609716, "grad_norm": 0.25857609510421753, "learning_rate": 0.0001828947033492329, "loss": 1.9564, "step": 83000 }, { "epoch": 0.8605780682041776, "grad_norm": 0.24891872704029083, "learning_rate": 0.00018265659237747943, "loss": 1.9752, "step": 83100 }, { "epoch": 0.8616136615473836, "grad_norm": 0.2388416975736618, "learning_rate": 0.00018241839498985295, "loss": 1.9538, "step": 83200 }, { "epoch": 0.8626492548905895, "grad_norm": 0.26303911209106445, "learning_rate": 0.00018218011181667145, "loss": 1.9643, "step": 83300 }, { "epoch": 0.8636848482337955, "grad_norm": 0.2458028942346573, "learning_rate": 0.00018194174348847985, "loss": 1.9439, "step": 83400 }, { "epoch": 0.8647204415770016, "grad_norm": 0.23554839193820953, "learning_rate": 0.00018170329063604834, "loss": 1.9659, "step": 83500 }, { "epoch": 0.8657560349202076, "grad_norm": 0.23672832548618317, "learning_rate": 0.00018146475389037094, "loss": 1.9615, "step": 83600 }, { "epoch": 0.8667916282634135, "grad_norm": 0.25055161118507385, "learning_rate": 0.00018122613388266346, "loss": 1.9792, "step": 83700 }, { "epoch": 0.8678272216066195, "grad_norm": 0.24413429200649261, "learning_rate": 0.00018098743124436226, "loss": 1.9573, "step": 83800 }, { "epoch": 0.8688628149498255, "grad_norm": 0.25972703099250793, "learning_rate": 0.00018074864660712223, "loss": 1.9697, "step": 83900 }, { "epoch": 0.8698984082930314, "grad_norm": 0.24612946808338165, "learning_rate": 0.0001805097806028152, "loss": 1.9596, "step": 84000 }, { "epoch": 0.8709340016362375, "grad_norm": 0.22612716257572174, "learning_rate": 0.00018027083386352842, "loss": 1.9691, "step": 84100 }, { "epoch": 0.8719695949794435, "grad_norm": 0.292486310005188, "learning_rate": 0.00018003180702156276, "loss": 1.9341, "step": 84200 }, { "epoch": 0.8730051883226495, "grad_norm": 0.23603379726409912, "learning_rate": 0.000179792700709431, "loss": 1.9749, "step": 84300 }, { "epoch": 0.8740407816658554, "grad_norm": 0.24144618213176727, "learning_rate": 0.00017955351555985622, "loss": 1.9473, "step": 84400 }, { "epoch": 0.8750763750090614, "grad_norm": 0.24716521799564362, "learning_rate": 0.0001793142522057702, "loss": 1.9469, "step": 84500 }, { "epoch": 0.8761119683522675, "grad_norm": 0.23078346252441406, "learning_rate": 0.00017907491128031164, "loss": 1.9646, "step": 84600 }, { "epoch": 0.8771475616954735, "grad_norm": 0.24642567336559296, "learning_rate": 0.00017883549341682443, "loss": 1.9531, "step": 84700 }, { "epoch": 0.8781831550386794, "grad_norm": 0.25259271264076233, "learning_rate": 0.00017859599924885614, "loss": 1.9555, "step": 84800 }, { "epoch": 0.8792187483818854, "grad_norm": 0.24995897710323334, "learning_rate": 0.00017835642941015615, "loss": 1.9616, "step": 84900 }, { "epoch": 0.8802543417250914, "grad_norm": 0.25117799639701843, "learning_rate": 0.00017811678453467429, "loss": 1.9542, "step": 85000 }, { "epoch": 0.8812899350682973, "grad_norm": 0.2627142071723938, "learning_rate": 0.0001778770652565587, "loss": 1.9515, "step": 85100 }, { "epoch": 0.8823255284115034, "grad_norm": 0.25060367584228516, "learning_rate": 0.00017763727221015456, "loss": 1.9554, "step": 85200 }, { "epoch": 0.8833611217547094, "grad_norm": 0.24538376927375793, "learning_rate": 0.0001773974060300023, "loss": 1.9449, "step": 85300 }, { "epoch": 0.8843967150979154, "grad_norm": 0.25490647554397583, "learning_rate": 0.0001771574673508356, "loss": 1.9623, "step": 85400 }, { "epoch": 0.8854323084411213, "grad_norm": 0.28633439540863037, "learning_rate": 0.00017691745680758033, "loss": 1.9615, "step": 85500 }, { "epoch": 0.8864679017843273, "grad_norm": 0.251254677772522, "learning_rate": 0.00017667737503535233, "loss": 1.9355, "step": 85600 }, { "epoch": 0.8875034951275333, "grad_norm": 0.2436380237340927, "learning_rate": 0.000176437222669456, "loss": 1.9578, "step": 85700 }, { "epoch": 0.8885390884707393, "grad_norm": 0.24810588359832764, "learning_rate": 0.00017619700034538248, "loss": 1.9441, "step": 85800 }, { "epoch": 0.8895746818139453, "grad_norm": 0.3414151966571808, "learning_rate": 0.00017595670869880803, "loss": 1.9319, "step": 85900 }, { "epoch": 0.8906102751571513, "grad_norm": 0.24401932954788208, "learning_rate": 0.00017571634836559253, "loss": 1.9571, "step": 86000 }, { "epoch": 0.8916458685003573, "grad_norm": 0.24051256477832794, "learning_rate": 0.00017547591998177733, "loss": 1.9547, "step": 86100 }, { "epoch": 0.8926814618435632, "grad_norm": 0.2365662008523941, "learning_rate": 0.00017523542418358413, "loss": 1.9524, "step": 86200 }, { "epoch": 0.8937170551867692, "grad_norm": 0.253713995218277, "learning_rate": 0.00017499486160741284, "loss": 1.9523, "step": 86300 }, { "epoch": 0.8947526485299753, "grad_norm": 0.23283855617046356, "learning_rate": 0.0001747542328898401, "loss": 1.9541, "step": 86400 }, { "epoch": 0.8957882418731813, "grad_norm": 0.24103082716464996, "learning_rate": 0.0001745135386676177, "loss": 1.9571, "step": 86500 }, { "epoch": 0.8968238352163872, "grad_norm": 0.23687419295310974, "learning_rate": 0.00017427277957767057, "loss": 1.9598, "step": 86600 }, { "epoch": 0.8978594285595932, "grad_norm": 0.2780131995677948, "learning_rate": 0.00017403195625709548, "loss": 1.9468, "step": 86700 }, { "epoch": 0.8988950219027992, "grad_norm": 0.24298681318759918, "learning_rate": 0.00017379106934315905, "loss": 1.9564, "step": 86800 }, { "epoch": 0.8999306152460051, "grad_norm": 0.25848403573036194, "learning_rate": 0.0001735501194732963, "loss": 1.9453, "step": 86900 }, { "epoch": 0.9009662085892112, "grad_norm": 0.23628826439380646, "learning_rate": 0.0001733091072851086, "loss": 1.9485, "step": 87000 }, { "epoch": 0.9020018019324172, "grad_norm": 0.23663002252578735, "learning_rate": 0.0001730680334163625, "loss": 1.9304, "step": 87100 }, { "epoch": 0.9030373952756232, "grad_norm": 0.22020183503627777, "learning_rate": 0.0001728268985049877, "loss": 1.9587, "step": 87200 }, { "epoch": 0.9040729886188291, "grad_norm": 0.293626606464386, "learning_rate": 0.00017258570318907528, "loss": 1.9469, "step": 87300 }, { "epoch": 0.9051085819620351, "grad_norm": 0.2634652554988861, "learning_rate": 0.00017234444810687634, "loss": 1.9469, "step": 87400 }, { "epoch": 0.9061441753052412, "grad_norm": 0.2406974732875824, "learning_rate": 0.00017210313389680006, "loss": 1.9346, "step": 87500 }, { "epoch": 0.9071797686484472, "grad_norm": 0.2431829273700714, "learning_rate": 0.00017186176119741208, "loss": 1.9447, "step": 87600 }, { "epoch": 0.9082153619916531, "grad_norm": 0.2331622689962387, "learning_rate": 0.00017162033064743281, "loss": 1.9443, "step": 87700 }, { "epoch": 0.9092509553348591, "grad_norm": 0.2355496734380722, "learning_rate": 0.00017137884288573573, "loss": 1.9484, "step": 87800 }, { "epoch": 0.9102865486780651, "grad_norm": 0.25062501430511475, "learning_rate": 0.0001711372985513458, "loss": 1.9511, "step": 87900 }, { "epoch": 0.911322142021271, "grad_norm": 0.22969716787338257, "learning_rate": 0.00017089569828343762, "loss": 1.9524, "step": 88000 }, { "epoch": 0.9123577353644771, "grad_norm": Infinity, "learning_rate": 0.00017065645954856154, "loss": 1.9562, "step": 88100 }, { "epoch": 0.9133933287076831, "grad_norm": 0.22948400676250458, "learning_rate": 0.00017041474987511244, "loss": 1.9383, "step": 88200 }, { "epoch": 0.9144289220508891, "grad_norm": 0.24315734207630157, "learning_rate": 0.00017017298618015328, "loss": 1.9442, "step": 88300 }, { "epoch": 0.915464515394095, "grad_norm": 0.235651433467865, "learning_rate": 0.00016993116910343924, "loss": 1.9552, "step": 88400 }, { "epoch": 0.916500108737301, "grad_norm": 0.23327066004276276, "learning_rate": 0.00016968929928486663, "loss": 1.9495, "step": 88500 }, { "epoch": 0.917535702080507, "grad_norm": 0.2551262378692627, "learning_rate": 0.0001694473773644713, "loss": 1.953, "step": 88600 }, { "epoch": 0.918571295423713, "grad_norm": 0.2437225729227066, "learning_rate": 0.00016920540398242712, "loss": 1.9602, "step": 88700 }, { "epoch": 0.919606888766919, "grad_norm": 0.23832499980926514, "learning_rate": 0.0001689633797790439, "loss": 1.9477, "step": 88800 }, { "epoch": 0.920642482110125, "grad_norm": 0.23777015507221222, "learning_rate": 0.0001687213053947662, "loss": 1.9658, "step": 88900 }, { "epoch": 0.921678075453331, "grad_norm": 0.2510216534137726, "learning_rate": 0.00016848160295253802, "loss": 1.9379, "step": 89000 }, { "epoch": 0.9227136687965369, "grad_norm": 0.2538154423236847, "learning_rate": 0.00016823943061415817, "loss": 1.938, "step": 89100 }, { "epoch": 0.923749262139743, "grad_norm": 0.2405252903699875, "learning_rate": 0.00016799721001059802, "loss": 1.9381, "step": 89200 }, { "epoch": 0.924784855482949, "grad_norm": 0.24780845642089844, "learning_rate": 0.00016775494178282173, "loss": 1.9305, "step": 89300 }, { "epoch": 0.925820448826155, "grad_norm": 0.2316620647907257, "learning_rate": 0.0001675126265719195, "loss": 1.9525, "step": 89400 }, { "epoch": 0.9268560421693609, "grad_norm": 0.22902575135231018, "learning_rate": 0.00016727026501910572, "loss": 1.9499, "step": 89500 }, { "epoch": 0.9278916355125669, "grad_norm": 0.23176421225070953, "learning_rate": 0.00016702785776571765, "loss": 1.9272, "step": 89600 }, { "epoch": 0.928927228855773, "grad_norm": 0.23081515729427338, "learning_rate": 0.00016678540545321332, "loss": 1.9399, "step": 89700 }, { "epoch": 0.9299628221989789, "grad_norm": 0.22756661474704742, "learning_rate": 0.00016654290872317, "loss": 1.9449, "step": 89800 }, { "epoch": 0.9309984155421849, "grad_norm": 0.24266347289085388, "learning_rate": 0.0001663003682172825, "loss": 1.9295, "step": 89900 }, { "epoch": 0.9320340088853909, "grad_norm": 0.22966596484184265, "learning_rate": 0.00016605778457736147, "loss": 1.9285, "step": 90000 }, { "epoch": 0.9330696022285969, "grad_norm": 0.29232585430145264, "learning_rate": 0.00016581515844533187, "loss": 1.9459, "step": 90100 }, { "epoch": 0.9341051955718028, "grad_norm": 0.23304279148578644, "learning_rate": 0.00016557249046323077, "loss": 1.9439, "step": 90200 }, { "epoch": 0.9351407889150088, "grad_norm": 0.2505282461643219, "learning_rate": 0.00016532978127320632, "loss": 1.9627, "step": 90300 }, { "epoch": 0.9361763822582149, "grad_norm": 0.23868633806705475, "learning_rate": 0.0001650870315175155, "loss": 1.9418, "step": 90400 }, { "epoch": 0.9372119756014209, "grad_norm": 0.2486335188150406, "learning_rate": 0.0001648442418385227, "loss": 1.9519, "step": 90500 }, { "epoch": 0.9382475689446268, "grad_norm": 0.22507202625274658, "learning_rate": 0.00016460141287869798, "loss": 1.9406, "step": 90600 }, { "epoch": 0.9392831622878328, "grad_norm": 0.2548425793647766, "learning_rate": 0.00016435854528061526, "loss": 1.9478, "step": 90700 }, { "epoch": 0.9403187556310388, "grad_norm": 0.23417390882968903, "learning_rate": 0.00016411563968695087, "loss": 1.9287, "step": 90800 }, { "epoch": 0.9413543489742447, "grad_norm": 0.268239825963974, "learning_rate": 0.0001638726967404815, "loss": 1.9353, "step": 90900 }, { "epoch": 0.9423899423174508, "grad_norm": 0.2342870682477951, "learning_rate": 0.00016362971708408275, "loss": 1.9221, "step": 91000 }, { "epoch": 0.9434255356606568, "grad_norm": 0.252216100692749, "learning_rate": 0.0001633867013607274, "loss": 1.9299, "step": 91100 }, { "epoch": 0.9444611290038628, "grad_norm": 0.2492091804742813, "learning_rate": 0.00016314365021348365, "loss": 1.9234, "step": 91200 }, { "epoch": 0.9454967223470687, "grad_norm": 0.233814999461174, "learning_rate": 0.00016290056428551345, "loss": 1.9292, "step": 91300 }, { "epoch": 0.9465323156902747, "grad_norm": 0.23142579197883606, "learning_rate": 0.0001626574442200707, "loss": 1.9488, "step": 91400 }, { "epoch": 0.9475679090334808, "grad_norm": 0.24332857131958008, "learning_rate": 0.0001624142906604998, "loss": 1.9366, "step": 91500 }, { "epoch": 0.9486035023766867, "grad_norm": 0.24913480877876282, "learning_rate": 0.00016217110425023363, "loss": 1.9437, "step": 91600 }, { "epoch": 0.9496390957198927, "grad_norm": 0.24253283441066742, "learning_rate": 0.00016192788563279203, "loss": 1.9356, "step": 91700 }, { "epoch": 0.9506746890630987, "grad_norm": 0.24340420961380005, "learning_rate": 0.00016168463545178016, "loss": 1.9457, "step": 91800 }, { "epoch": 0.9517102824063047, "grad_norm": 0.2424236685037613, "learning_rate": 0.00016144135435088657, "loss": 1.9377, "step": 91900 }, { "epoch": 0.9527458757495106, "grad_norm": 0.23764050006866455, "learning_rate": 0.00016119804297388176, "loss": 1.9426, "step": 92000 }, { "epoch": 0.9537814690927167, "grad_norm": 0.23119772970676422, "learning_rate": 0.0001609547019646162, "loss": 1.9408, "step": 92100 }, { "epoch": 0.9548170624359227, "grad_norm": 0.23783893883228302, "learning_rate": 0.00016071133196701896, "loss": 1.9359, "step": 92200 }, { "epoch": 0.9558526557791287, "grad_norm": 0.23280414938926697, "learning_rate": 0.00016046793362509566, "loss": 1.9452, "step": 92300 }, { "epoch": 0.9568882491223346, "grad_norm": 0.2365211844444275, "learning_rate": 0.0001602245075829269, "loss": 1.949, "step": 92400 }, { "epoch": 0.9579238424655406, "grad_norm": 0.24333146214485168, "learning_rate": 0.00015998105448466682, "loss": 1.9327, "step": 92500 }, { "epoch": 0.9589594358087467, "grad_norm": 0.2442314326763153, "learning_rate": 0.00015973757497454084, "loss": 1.9429, "step": 92600 }, { "epoch": 0.9599950291519526, "grad_norm": 0.24195174872875214, "learning_rate": 0.00015949406969684453, "loss": 1.925, "step": 92700 }, { "epoch": 0.9610306224951586, "grad_norm": 0.3703487515449524, "learning_rate": 0.00015925053929594153, "loss": 1.924, "step": 92800 }, { "epoch": 0.9620662158383646, "grad_norm": 0.23762178421020508, "learning_rate": 0.00015900698441626191, "loss": 1.9363, "step": 92900 }, { "epoch": 0.9631018091815706, "grad_norm": 0.3592216670513153, "learning_rate": 0.00015876584160530362, "loss": 1.9426, "step": 93000 }, { "epoch": 0.9641374025247765, "grad_norm": 0.2345329076051712, "learning_rate": 0.00015852223993032514, "loss": 1.9293, "step": 93100 }, { "epoch": 0.9651729958679826, "grad_norm": 0.2400391548871994, "learning_rate": 0.00015828105205557456, "loss": 1.9211, "step": 93200 }, { "epoch": 0.9662085892111886, "grad_norm": 0.22153514623641968, "learning_rate": 0.00015803740613805017, "loss": 1.9441, "step": 93300 }, { "epoch": 0.9672441825543946, "grad_norm": 0.23999086022377014, "learning_rate": 0.00015779373895194216, "loss": 1.929, "step": 93400 }, { "epoch": 0.9682797758976005, "grad_norm": 0.24127469956874847, "learning_rate": 0.0001575500511420425, "loss": 1.9224, "step": 93500 }, { "epoch": 0.9693153692408065, "grad_norm": 0.23712804913520813, "learning_rate": 0.00015730634335319797, "loss": 1.9271, "step": 93600 }, { "epoch": 0.9703509625840125, "grad_norm": 0.23177595436573029, "learning_rate": 0.00015706261623030804, "loss": 1.9465, "step": 93700 }, { "epoch": 0.9713865559272185, "grad_norm": 0.23145994544029236, "learning_rate": 0.0001568188704183234, "loss": 1.9442, "step": 93800 }, { "epoch": 0.9724221492704245, "grad_norm": 0.2583818733692169, "learning_rate": 0.0001565751065622443, "loss": 1.9302, "step": 93900 }, { "epoch": 0.9734577426136305, "grad_norm": 0.24953649938106537, "learning_rate": 0.00015633132530711857, "loss": 1.9207, "step": 94000 }, { "epoch": 0.9744933359568365, "grad_norm": 0.28383931517601013, "learning_rate": 0.00015608752729804012, "loss": 1.9308, "step": 94100 }, { "epoch": 0.9755289293000424, "grad_norm": 0.2383178174495697, "learning_rate": 0.0001558437131801473, "loss": 1.9346, "step": 94200 }, { "epoch": 0.9765645226432484, "grad_norm": 0.23528681695461273, "learning_rate": 0.00015559988359862097, "loss": 1.919, "step": 94300 }, { "epoch": 0.9776001159864545, "grad_norm": 0.2361796349287033, "learning_rate": 0.00015535603919868298, "loss": 1.9316, "step": 94400 }, { "epoch": 0.9786357093296604, "grad_norm": 0.22073882818222046, "learning_rate": 0.00015511218062559432, "loss": 1.9187, "step": 94500 }, { "epoch": 0.9796713026728664, "grad_norm": 0.23669646680355072, "learning_rate": 0.0001548683085246536, "loss": 1.9435, "step": 94600 }, { "epoch": 0.9807068960160724, "grad_norm": 0.23834872245788574, "learning_rate": 0.0001546244235411951, "loss": 1.9247, "step": 94700 }, { "epoch": 0.9817424893592784, "grad_norm": 0.25243905186653137, "learning_rate": 0.00015438052632058731, "loss": 1.9512, "step": 94800 }, { "epoch": 0.9827780827024843, "grad_norm": 0.22087784111499786, "learning_rate": 0.00015413661750823102, "loss": 1.9254, "step": 94900 }, { "epoch": 0.9838136760456904, "grad_norm": 0.23028938472270966, "learning_rate": 0.00015389269774955766, "loss": 1.9352, "step": 95000 }, { "epoch": 0.9848492693888964, "grad_norm": 0.2522309720516205, "learning_rate": 0.00015364876769002778, "loss": 1.9261, "step": 95100 }, { "epoch": 0.9858848627321024, "grad_norm": 0.23186306655406952, "learning_rate": 0.00015340482797512902, "loss": 1.9353, "step": 95200 }, { "epoch": 0.9869204560753083, "grad_norm": 0.2499614655971527, "learning_rate": 0.0001531608792503747, "loss": 1.9304, "step": 95300 }, { "epoch": 0.9879560494185143, "grad_norm": 0.22175639867782593, "learning_rate": 0.0001529169221613018, "loss": 1.9228, "step": 95400 }, { "epoch": 0.9889916427617204, "grad_norm": 0.25355827808380127, "learning_rate": 0.0001526729573534697, "loss": 1.9367, "step": 95500 }, { "epoch": 0.9900272361049263, "grad_norm": 0.23084016144275665, "learning_rate": 0.00015243142522416053, "loss": 1.9247, "step": 95600 }, { "epoch": 0.9910628294481323, "grad_norm": 0.230432391166687, "learning_rate": 0.00015218744697664761, "loss": 1.9213, "step": 95700 }, { "epoch": 0.9920984227913383, "grad_norm": 0.2483115792274475, "learning_rate": 0.00015194346294071265, "loss": 1.9254, "step": 95800 }, { "epoch": 0.9931340161345443, "grad_norm": 0.29111093282699585, "learning_rate": 0.00015169947376198617, "loss": 1.9245, "step": 95900 }, { "epoch": 0.9941696094777502, "grad_norm": 0.22843021154403687, "learning_rate": 0.00015145548008611227, "loss": 1.9142, "step": 96000 }, { "epoch": 0.9952052028209563, "grad_norm": 0.22067636251449585, "learning_rate": 0.00015121148255874706, "loss": 1.9225, "step": 96100 }, { "epoch": 0.9962407961641623, "grad_norm": 0.23021863400936127, "learning_rate": 0.00015096748182555678, "loss": 1.9181, "step": 96200 }, { "epoch": 0.9972763895073683, "grad_norm": 0.23256871104240417, "learning_rate": 0.00015072347853221617, "loss": 1.9098, "step": 96300 }, { "epoch": 0.9983119828505742, "grad_norm": 0.2889832556247711, "learning_rate": 0.0001504794733244067, "loss": 1.9334, "step": 96400 }, { "epoch": 0.9993475761937802, "grad_norm": 0.2328086793422699, "learning_rate": 0.0001502354668478149, "loss": 1.924, "step": 96500 }, { "epoch": 1.0003831695369862, "grad_norm": 0.2343384027481079, "learning_rate": 0.0001499914597481308, "loss": 1.9101, "step": 96600 }, { "epoch": 1.0014187628801923, "grad_norm": 0.25923722982406616, "learning_rate": 0.00014974745267104584, "loss": 1.8861, "step": 96700 }, { "epoch": 1.0024543562233983, "grad_norm": 0.25080233812332153, "learning_rate": 0.00014950344626225167, "loss": 1.8765, "step": 96800 }, { "epoch": 1.003489949566604, "grad_norm": 0.2270934134721756, "learning_rate": 0.00014925944116743795, "loss": 1.8784, "step": 96900 }, { "epoch": 1.00452554290981, "grad_norm": 0.2468741238117218, "learning_rate": 0.00014901543803229098, "loss": 1.8657, "step": 97000 }, { "epoch": 1.0055611362530161, "grad_norm": 0.251769095659256, "learning_rate": 0.00014877143750249185, "loss": 1.9014, "step": 97100 }, { "epoch": 1.0065967295962222, "grad_norm": 0.23185782134532928, "learning_rate": 0.00014852744022371468, "loss": 1.8908, "step": 97200 }, { "epoch": 1.0076323229394282, "grad_norm": 0.2621222138404846, "learning_rate": 0.00014828344684162517, "loss": 1.8734, "step": 97300 }, { "epoch": 1.0086679162826342, "grad_norm": 0.24131488800048828, "learning_rate": 0.0001480394580018785, "loss": 1.8881, "step": 97400 }, { "epoch": 1.0097035096258402, "grad_norm": 0.24724990129470825, "learning_rate": 0.00014779547435011799, "loss": 1.8813, "step": 97500 }, { "epoch": 1.010739102969046, "grad_norm": 0.2644301950931549, "learning_rate": 0.0001475514965319731, "loss": 1.8891, "step": 97600 }, { "epoch": 1.011774696312252, "grad_norm": 0.2379751205444336, "learning_rate": 0.00014730752519305797, "loss": 1.9028, "step": 97700 }, { "epoch": 1.012810289655458, "grad_norm": 0.24513430893421173, "learning_rate": 0.00014706356097896948, "loss": 1.8918, "step": 97800 }, { "epoch": 1.013845882998664, "grad_norm": 0.24254558980464935, "learning_rate": 0.0001468196045352858, "loss": 1.8888, "step": 97900 }, { "epoch": 1.01488147634187, "grad_norm": 0.2363414168357849, "learning_rate": 0.00014657565650756448, "loss": 1.8837, "step": 98000 }, { "epoch": 1.0159170696850761, "grad_norm": 0.24372878670692444, "learning_rate": 0.00014633171754134064, "loss": 1.8788, "step": 98100 }, { "epoch": 1.0169526630282821, "grad_norm": 0.2519483268260956, "learning_rate": 0.00014608778828212568, "loss": 1.9207, "step": 98200 }, { "epoch": 1.017988256371488, "grad_norm": 0.2332720160484314, "learning_rate": 0.00014584386937540513, "loss": 1.8755, "step": 98300 }, { "epoch": 1.019023849714694, "grad_norm": 0.23432657122612, "learning_rate": 0.00014559996146663724, "loss": 1.8816, "step": 98400 }, { "epoch": 1.0200594430579, "grad_norm": 0.22759872674942017, "learning_rate": 0.00014535606520125105, "loss": 1.8848, "step": 98500 }, { "epoch": 1.021095036401106, "grad_norm": 0.23556314408779144, "learning_rate": 0.00014511218122464486, "loss": 1.9031, "step": 98600 }, { "epoch": 1.022130629744312, "grad_norm": 0.23162241280078888, "learning_rate": 0.00014486831018218444, "loss": 1.8897, "step": 98700 }, { "epoch": 1.023166223087518, "grad_norm": 0.2409902811050415, "learning_rate": 0.00014462689122449396, "loss": 1.8798, "step": 98800 }, { "epoch": 1.024201816430724, "grad_norm": 0.2467098981142044, "learning_rate": 0.00014438304784084193, "loss": 1.8743, "step": 98900 }, { "epoch": 1.02523740977393, "grad_norm": 0.2389885038137436, "learning_rate": 0.00014413921932076847, "loss": 1.8704, "step": 99000 }, { "epoch": 1.0262730031171359, "grad_norm": 0.23866841197013855, "learning_rate": 0.0001438954063094925, "loss": 1.881, "step": 99100 }, { "epoch": 1.027308596460342, "grad_norm": 0.26028040051460266, "learning_rate": 0.00014365160945219198, "loss": 1.8846, "step": 99200 }, { "epoch": 1.028344189803548, "grad_norm": 0.23288673162460327, "learning_rate": 0.00014340782939400223, "loss": 1.8921, "step": 99300 }, { "epoch": 1.029379783146754, "grad_norm": 0.25385284423828125, "learning_rate": 0.00014316406678001393, "loss": 1.8881, "step": 99400 }, { "epoch": 1.03041537648996, "grad_norm": 0.23623453080654144, "learning_rate": 0.00014292032225527183, "loss": 1.8779, "step": 99500 }, { "epoch": 1.031450969833166, "grad_norm": 0.23074805736541748, "learning_rate": 0.00014267659646477243, "loss": 1.8757, "step": 99600 }, { "epoch": 1.032486563176372, "grad_norm": 0.25624775886535645, "learning_rate": 0.0001424328900534632, "loss": 1.8799, "step": 99700 }, { "epoch": 1.0335221565195778, "grad_norm": 0.23896069824695587, "learning_rate": 0.00014218920366623986, "loss": 1.8897, "step": 99800 }, { "epoch": 1.0345577498627838, "grad_norm": 0.24150870740413666, "learning_rate": 0.00014194553794794534, "loss": 1.8849, "step": 99900 }, { "epoch": 1.0355933432059898, "grad_norm": 0.24040450155735016, "learning_rate": 0.00014170189354336784, "loss": 1.8944, "step": 100000 }, { "epoch": 1.0366289365491959, "grad_norm": 0.2770671844482422, "learning_rate": 0.0001414582710972392, "loss": 1.8867, "step": 100100 }, { "epoch": 1.0376645298924019, "grad_norm": 0.25982925295829773, "learning_rate": 0.00014121467125423318, "loss": 1.8971, "step": 100200 }, { "epoch": 1.038700123235608, "grad_norm": 0.24029162526130676, "learning_rate": 0.0001409710946589635, "loss": 1.8927, "step": 100300 }, { "epoch": 1.039735716578814, "grad_norm": 0.2328944057226181, "learning_rate": 0.00014072754195598272, "loss": 1.8908, "step": 100400 }, { "epoch": 1.0407713099220197, "grad_norm": 0.23486821353435516, "learning_rate": 0.00014048401378977983, "loss": 1.8816, "step": 100500 }, { "epoch": 1.0418069032652257, "grad_norm": 0.24510391056537628, "learning_rate": 0.00014024051080477917, "loss": 1.871, "step": 100600 }, { "epoch": 1.0428424966084318, "grad_norm": 0.25957804918289185, "learning_rate": 0.00013999703364533827, "loss": 1.8769, "step": 100700 }, { "epoch": 1.0438780899516378, "grad_norm": 0.2522658407688141, "learning_rate": 0.0001397535829557464, "loss": 1.887, "step": 100800 }, { "epoch": 1.0449136832948438, "grad_norm": 0.24027973413467407, "learning_rate": 0.00013951015938022272, "loss": 1.881, "step": 100900 }, { "epoch": 1.0459492766380498, "grad_norm": 0.2546592950820923, "learning_rate": 0.00013926676356291475, "loss": 1.8785, "step": 101000 }, { "epoch": 1.0469848699812558, "grad_norm": 0.22534821927547455, "learning_rate": 0.00013902339614789641, "loss": 1.8893, "step": 101100 }, { "epoch": 1.0480204633244616, "grad_norm": 0.25759249925613403, "learning_rate": 0.00013878005777916665, "loss": 1.8817, "step": 101200 }, { "epoch": 1.0490560566676677, "grad_norm": 0.24517479538917542, "learning_rate": 0.00013853674910064735, "loss": 1.8827, "step": 101300 }, { "epoch": 1.0500916500108737, "grad_norm": 0.2415887713432312, "learning_rate": 0.0001382934707561819, "loss": 1.8927, "step": 101400 }, { "epoch": 1.0511272433540797, "grad_norm": 0.25100499391555786, "learning_rate": 0.00013805022338953354, "loss": 1.8861, "step": 101500 }, { "epoch": 1.0521628366972857, "grad_norm": 0.260999858379364, "learning_rate": 0.00013780700764438335, "loss": 1.8824, "step": 101600 }, { "epoch": 1.0531984300404917, "grad_norm": 0.28245481848716736, "learning_rate": 0.0001375638241643289, "loss": 1.8871, "step": 101700 }, { "epoch": 1.0542340233836978, "grad_norm": 0.2494635134935379, "learning_rate": 0.00013732067359288223, "loss": 1.8877, "step": 101800 }, { "epoch": 1.0552696167269038, "grad_norm": 0.24767793715000153, "learning_rate": 0.00013707755657346845, "loss": 1.8809, "step": 101900 }, { "epoch": 1.0563052100701096, "grad_norm": 0.2375461459159851, "learning_rate": 0.00013683447374942368, "loss": 1.8863, "step": 102000 }, { "epoch": 1.0573408034133156, "grad_norm": 0.23268288373947144, "learning_rate": 0.00013659142576399384, "loss": 1.8678, "step": 102100 }, { "epoch": 1.0583763967565216, "grad_norm": 0.23155255615711212, "learning_rate": 0.00013634841326033234, "loss": 1.8678, "step": 102200 }, { "epoch": 1.0594119900997276, "grad_norm": 0.22982390224933624, "learning_rate": 0.00013610543688149894, "loss": 1.8896, "step": 102300 }, { "epoch": 1.0604475834429337, "grad_norm": 0.24036739766597748, "learning_rate": 0.00013586249727045772, "loss": 1.8769, "step": 102400 }, { "epoch": 1.0614831767861397, "grad_norm": 0.23711708188056946, "learning_rate": 0.00013561959507007543, "loss": 1.8823, "step": 102500 }, { "epoch": 1.0625187701293457, "grad_norm": 0.23596128821372986, "learning_rate": 0.0001353767309231199, "loss": 1.8787, "step": 102600 }, { "epoch": 1.0635543634725515, "grad_norm": 0.24728642404079437, "learning_rate": 0.00013513633353311113, "loss": 1.8881, "step": 102700 }, { "epoch": 1.0645899568157575, "grad_norm": 0.24226045608520508, "learning_rate": 0.00013489354702434102, "loss": 1.8799, "step": 102800 }, { "epoch": 1.0656255501589635, "grad_norm": 0.25156956911087036, "learning_rate": 0.00013465080049026586, "loss": 1.8762, "step": 102900 }, { "epoch": 1.0666611435021696, "grad_norm": 0.25591355562210083, "learning_rate": 0.00013440809457324166, "loss": 1.8755, "step": 103000 }, { "epoch": 1.0676967368453756, "grad_norm": 0.22271765768527985, "learning_rate": 0.0001341654299155167, "loss": 1.8758, "step": 103100 }, { "epoch": 1.0687323301885816, "grad_norm": 0.24468670785427094, "learning_rate": 0.00013392280715923026, "loss": 1.8988, "step": 103200 }, { "epoch": 1.0697679235317876, "grad_norm": 0.2655929625034332, "learning_rate": 0.00013368022694641065, "loss": 1.887, "step": 103300 }, { "epoch": 1.0708035168749936, "grad_norm": 0.26419320702552795, "learning_rate": 0.00013343768991897357, "loss": 1.8694, "step": 103400 }, { "epoch": 1.0718391102181994, "grad_norm": 0.2383660525083542, "learning_rate": 0.00013319519671872057, "loss": 1.8671, "step": 103500 }, { "epoch": 1.0728747035614055, "grad_norm": 0.2357158064842224, "learning_rate": 0.000132952747987337, "loss": 1.866, "step": 103600 }, { "epoch": 1.0739102969046115, "grad_norm": 0.23546268045902252, "learning_rate": 0.00013271034436639084, "loss": 1.8697, "step": 103700 }, { "epoch": 1.0749458902478175, "grad_norm": 0.2389373630285263, "learning_rate": 0.0001324679864973304, "loss": 1.8821, "step": 103800 }, { "epoch": 1.0759814835910235, "grad_norm": 0.2421797662973404, "learning_rate": 0.00013222567502148318, "loss": 1.852, "step": 103900 }, { "epoch": 1.0770170769342295, "grad_norm": 0.24131540954113007, "learning_rate": 0.0001319834105800537, "loss": 1.8719, "step": 104000 }, { "epoch": 1.0780526702774353, "grad_norm": 0.25092560052871704, "learning_rate": 0.0001317411938141222, "loss": 1.882, "step": 104100 }, { "epoch": 1.0790882636206414, "grad_norm": 0.23742741346359253, "learning_rate": 0.00013149902536464258, "loss": 1.8912, "step": 104200 }, { "epoch": 1.0801238569638474, "grad_norm": 0.2396973818540573, "learning_rate": 0.00013125690587244108, "loss": 1.8769, "step": 104300 }, { "epoch": 1.0811594503070534, "grad_norm": 0.23528113961219788, "learning_rate": 0.00013101483597821413, "loss": 1.873, "step": 104400 }, { "epoch": 1.0821950436502594, "grad_norm": 0.22962799668312073, "learning_rate": 0.0001307728163225273, "loss": 1.8543, "step": 104500 }, { "epoch": 1.0832306369934654, "grad_norm": 0.25876128673553467, "learning_rate": 0.0001305308475458128, "loss": 1.8742, "step": 104600 }, { "epoch": 1.0842662303366715, "grad_norm": 0.23361794650554657, "learning_rate": 0.00013028893028836842, "loss": 1.8784, "step": 104700 }, { "epoch": 1.0853018236798775, "grad_norm": 0.2388489991426468, "learning_rate": 0.0001300470651903557, "loss": 1.8611, "step": 104800 }, { "epoch": 1.0863374170230833, "grad_norm": 0.23678728938102722, "learning_rate": 0.0001298052528917979, "loss": 1.8765, "step": 104900 }, { "epoch": 1.0873730103662893, "grad_norm": 0.2365022897720337, "learning_rate": 0.00012956349403257886, "loss": 1.8681, "step": 105000 }, { "epoch": 1.0884086037094953, "grad_norm": 0.23612427711486816, "learning_rate": 0.0001293217892524407, "loss": 1.8883, "step": 105100 }, { "epoch": 1.0894441970527013, "grad_norm": 0.2390425056219101, "learning_rate": 0.00012908013919098277, "loss": 1.8835, "step": 105200 }, { "epoch": 1.0904797903959074, "grad_norm": 0.23704111576080322, "learning_rate": 0.0001288385444876593, "loss": 1.8722, "step": 105300 }, { "epoch": 1.0915153837391134, "grad_norm": 0.2381979376077652, "learning_rate": 0.00012859700578177828, "loss": 1.8678, "step": 105400 }, { "epoch": 1.0925509770823194, "grad_norm": 0.23762637376785278, "learning_rate": 0.00012835552371249937, "loss": 1.8792, "step": 105500 }, { "epoch": 1.0935865704255252, "grad_norm": 0.23250162601470947, "learning_rate": 0.0001281140989188324, "loss": 1.8599, "step": 105600 }, { "epoch": 1.0946221637687312, "grad_norm": 0.23538027703762054, "learning_rate": 0.00012787273203963576, "loss": 1.8574, "step": 105700 }, { "epoch": 1.0956577571119372, "grad_norm": 0.2512263059616089, "learning_rate": 0.00012763142371361428, "loss": 1.8603, "step": 105800 }, { "epoch": 1.0966933504551433, "grad_norm": 0.2455640733242035, "learning_rate": 0.00012739017457931822, "loss": 1.8701, "step": 105900 }, { "epoch": 1.0977289437983493, "grad_norm": 0.2362925410270691, "learning_rate": 0.00012714898527514088, "loss": 1.8834, "step": 106000 }, { "epoch": 1.0987645371415553, "grad_norm": 0.2549925148487091, "learning_rate": 0.00012690785643931746, "loss": 1.8756, "step": 106100 }, { "epoch": 1.0998001304847613, "grad_norm": 0.23583611845970154, "learning_rate": 0.00012666678870992305, "loss": 1.8859, "step": 106200 }, { "epoch": 1.1008357238279673, "grad_norm": 0.2888440489768982, "learning_rate": 0.0001264257827248711, "loss": 1.8728, "step": 106300 }, { "epoch": 1.1018713171711731, "grad_norm": 0.24092243611812592, "learning_rate": 0.0001261848391219116, "loss": 1.8788, "step": 106400 }, { "epoch": 1.1029069105143792, "grad_norm": 0.27669841051101685, "learning_rate": 0.0001259439585386295, "loss": 1.8522, "step": 106500 }, { "epoch": 1.1039425038575852, "grad_norm": 0.22967857122421265, "learning_rate": 0.000125703141612443, "loss": 1.8712, "step": 106600 }, { "epoch": 1.1049780972007912, "grad_norm": 0.24176256358623505, "learning_rate": 0.00012546238898060186, "loss": 1.851, "step": 106700 }, { "epoch": 1.1060136905439972, "grad_norm": 0.23579706251621246, "learning_rate": 0.00012522170128018563, "loss": 1.8781, "step": 106800 }, { "epoch": 1.1070492838872033, "grad_norm": 0.24712516367435455, "learning_rate": 0.00012498107914810208, "loss": 1.8577, "step": 106900 }, { "epoch": 1.108084877230409, "grad_norm": 0.2389284372329712, "learning_rate": 0.00012474052322108555, "loss": 1.8692, "step": 107000 }, { "epoch": 1.109120470573615, "grad_norm": 0.2463691383600235, "learning_rate": 0.00012450003413569505, "loss": 1.8786, "step": 107100 }, { "epoch": 1.110156063916821, "grad_norm": 0.24113106727600098, "learning_rate": 0.0001242620164082814, "loss": 1.8725, "step": 107200 }, { "epoch": 1.111191657260027, "grad_norm": 0.23971356451511383, "learning_rate": 0.00012402406543337254, "loss": 1.8526, "step": 107300 }, { "epoch": 1.1122272506032331, "grad_norm": 0.23869900405406952, "learning_rate": 0.00012378377930920102, "loss": 1.8714, "step": 107400 }, { "epoch": 1.1132628439464392, "grad_norm": 0.2511535584926605, "learning_rate": 0.00012354596438041382, "loss": 1.8663, "step": 107500 }, { "epoch": 1.1142984372896452, "grad_norm": 0.23651407659053802, "learning_rate": 0.00012330581693539117, "loss": 1.8778, "step": 107600 }, { "epoch": 1.1153340306328512, "grad_norm": 0.23462238907814026, "learning_rate": 0.00012306574012851428, "loss": 1.8727, "step": 107700 }, { "epoch": 1.116369623976057, "grad_norm": 0.2549104690551758, "learning_rate": 0.00012282573459507438, "loss": 1.8636, "step": 107800 }, { "epoch": 1.117405217319263, "grad_norm": 0.24617476761341095, "learning_rate": 0.00012258580097017417, "loss": 1.8602, "step": 107900 }, { "epoch": 1.118440810662469, "grad_norm": 0.24401159584522247, "learning_rate": 0.0001223459398887258, "loss": 1.8801, "step": 108000 }, { "epoch": 1.119476404005675, "grad_norm": 0.24245227873325348, "learning_rate": 0.0001221061519854499, "loss": 1.8926, "step": 108100 }, { "epoch": 1.120511997348881, "grad_norm": 0.26296266913414, "learning_rate": 0.00012186643789487298, "loss": 1.875, "step": 108200 }, { "epoch": 1.121547590692087, "grad_norm": 0.2296479344367981, "learning_rate": 0.00012162679825132662, "loss": 1.857, "step": 108300 }, { "epoch": 1.1225831840352931, "grad_norm": 0.24976491928100586, "learning_rate": 0.00012138723368894512, "loss": 1.8842, "step": 108400 }, { "epoch": 1.123618777378499, "grad_norm": 0.26477983593940735, "learning_rate": 0.00012114774484166428, "loss": 1.879, "step": 108500 }, { "epoch": 1.124654370721705, "grad_norm": 0.23889628052711487, "learning_rate": 0.00012090833234321945, "loss": 1.8696, "step": 108600 }, { "epoch": 1.125689964064911, "grad_norm": 0.23572321236133575, "learning_rate": 0.00012066899682714399, "loss": 1.8754, "step": 108700 }, { "epoch": 1.126725557408117, "grad_norm": 0.26930326223373413, "learning_rate": 0.00012042973892676748, "loss": 1.8523, "step": 108800 }, { "epoch": 1.127761150751323, "grad_norm": 0.22739994525909424, "learning_rate": 0.00012019055927521426, "loss": 1.8639, "step": 108900 }, { "epoch": 1.128796744094529, "grad_norm": 0.23319287598133087, "learning_rate": 0.00011995145850540142, "loss": 1.8783, "step": 109000 }, { "epoch": 1.129832337437735, "grad_norm": 0.22787228226661682, "learning_rate": 0.0001197124372500374, "loss": 1.8764, "step": 109100 }, { "epoch": 1.130867930780941, "grad_norm": 0.2509264051914215, "learning_rate": 0.0001194734961416203, "loss": 1.8749, "step": 109200 }, { "epoch": 1.1319035241241469, "grad_norm": 0.22076621651649475, "learning_rate": 0.00011923463581243599, "loss": 1.8611, "step": 109300 }, { "epoch": 1.1329391174673529, "grad_norm": 0.2421676516532898, "learning_rate": 0.00011899585689455676, "loss": 1.8648, "step": 109400 }, { "epoch": 1.133974710810559, "grad_norm": 0.22357791662216187, "learning_rate": 0.00011875716001983925, "loss": 1.8596, "step": 109500 }, { "epoch": 1.135010304153765, "grad_norm": 0.2394237071275711, "learning_rate": 0.00011851854581992325, "loss": 1.8792, "step": 109600 }, { "epoch": 1.136045897496971, "grad_norm": 0.2515692412853241, "learning_rate": 0.00011828001492622947, "loss": 1.8558, "step": 109700 }, { "epoch": 1.137081490840177, "grad_norm": 0.24260202050209045, "learning_rate": 0.00011804156796995854, "loss": 1.8619, "step": 109800 }, { "epoch": 1.1381170841833828, "grad_norm": 0.22991013526916504, "learning_rate": 0.00011780320558208866, "loss": 1.8625, "step": 109900 }, { "epoch": 1.1391526775265888, "grad_norm": 0.2327783852815628, "learning_rate": 0.0001175649283933744, "loss": 1.8753, "step": 110000 }, { "epoch": 1.1401882708697948, "grad_norm": 0.29044440388679504, "learning_rate": 0.00011732673703434486, "loss": 1.8574, "step": 110100 }, { "epoch": 1.1412238642130008, "grad_norm": 0.2645598351955414, "learning_rate": 0.00011708863213530194, "loss": 1.8555, "step": 110200 }, { "epoch": 1.1422594575562068, "grad_norm": 0.2337404489517212, "learning_rate": 0.00011685061432631892, "loss": 1.8642, "step": 110300 }, { "epoch": 1.1432950508994129, "grad_norm": 0.23765882849693298, "learning_rate": 0.00011661268423723835, "loss": 1.8794, "step": 110400 }, { "epoch": 1.1443306442426189, "grad_norm": 0.2535068392753601, "learning_rate": 0.00011637484249767103, "loss": 1.8617, "step": 110500 }, { "epoch": 1.145366237585825, "grad_norm": 0.22886718809604645, "learning_rate": 0.00011613708973699357, "loss": 1.8702, "step": 110600 }, { "epoch": 1.1464018309290307, "grad_norm": 0.2459760159254074, "learning_rate": 0.0001158994265843474, "loss": 1.8642, "step": 110700 }, { "epoch": 1.1474374242722367, "grad_norm": 0.23509031534194946, "learning_rate": 0.00011566185366863673, "loss": 1.8452, "step": 110800 }, { "epoch": 1.1484730176154427, "grad_norm": 0.23424768447875977, "learning_rate": 0.00011542437161852699, "loss": 1.864, "step": 110900 }, { "epoch": 1.1495086109586488, "grad_norm": 0.23628224432468414, "learning_rate": 0.00011518698106244316, "loss": 1.8751, "step": 111000 }, { "epoch": 1.1505442043018548, "grad_norm": 0.22464323043823242, "learning_rate": 0.00011494968262856817, "loss": 1.8672, "step": 111100 }, { "epoch": 1.1515797976450608, "grad_norm": 0.2577066719532013, "learning_rate": 0.00011471247694484109, "loss": 1.8511, "step": 111200 }, { "epoch": 1.1526153909882668, "grad_norm": 0.24172638356685638, "learning_rate": 0.00011447536463895554, "loss": 1.862, "step": 111300 }, { "epoch": 1.1536509843314726, "grad_norm": 0.23986095190048218, "learning_rate": 0.00011423834633835814, "loss": 1.8603, "step": 111400 }, { "epoch": 1.1546865776746786, "grad_norm": 0.23520790040493011, "learning_rate": 0.00011400142267024668, "loss": 1.8441, "step": 111500 }, { "epoch": 1.1557221710178847, "grad_norm": 0.2556486129760742, "learning_rate": 0.00011376696207206322, "loss": 1.8704, "step": 111600 }, { "epoch": 1.1567577643610907, "grad_norm": 0.23670066893100739, "learning_rate": 0.00011353022858755118, "loss": 1.8682, "step": 111700 }, { "epoch": 1.1577933577042967, "grad_norm": 0.24926917254924774, "learning_rate": 0.00011329359160934627, "loss": 1.8699, "step": 111800 }, { "epoch": 1.1588289510475027, "grad_norm": 0.24650341272354126, "learning_rate": 0.0001130570517636372, "loss": 1.8713, "step": 111900 }, { "epoch": 1.1598645443907087, "grad_norm": 0.2391950786113739, "learning_rate": 0.0001128206096763558, "loss": 1.8781, "step": 112000 }, { "epoch": 1.1609001377339148, "grad_norm": 0.22863338887691498, "learning_rate": 0.00011258426597317493, "loss": 1.8707, "step": 112100 }, { "epoch": 1.1619357310771206, "grad_norm": 0.2276660054922104, "learning_rate": 0.00011234802127950747, "loss": 1.8556, "step": 112200 }, { "epoch": 1.1629713244203266, "grad_norm": 0.2393030971288681, "learning_rate": 0.00011211187622050399, "loss": 1.8624, "step": 112300 }, { "epoch": 1.1640069177635326, "grad_norm": 0.22766567766666412, "learning_rate": 0.00011187583142105156, "loss": 1.8569, "step": 112400 }, { "epoch": 1.1650425111067386, "grad_norm": 0.2281782329082489, "learning_rate": 0.00011163988750577195, "loss": 1.8659, "step": 112500 }, { "epoch": 1.1660781044499446, "grad_norm": 0.2515377402305603, "learning_rate": 0.00011140404509901989, "loss": 1.8563, "step": 112600 }, { "epoch": 1.1671136977931507, "grad_norm": 0.22784626483917236, "learning_rate": 0.00011116830482488162, "loss": 1.857, "step": 112700 }, { "epoch": 1.1681492911363565, "grad_norm": 0.2499839961528778, "learning_rate": 0.00011093266730717291, "loss": 1.8828, "step": 112800 }, { "epoch": 1.1691848844795625, "grad_norm": 0.23827239871025085, "learning_rate": 0.00011069713316943791, "loss": 1.8575, "step": 112900 }, { "epoch": 1.1702204778227685, "grad_norm": 0.2503950595855713, "learning_rate": 0.00011046170303494688, "loss": 1.8479, "step": 113000 }, { "epoch": 1.1712560711659745, "grad_norm": 0.23695173859596252, "learning_rate": 0.00011022637752669515, "loss": 1.8343, "step": 113100 }, { "epoch": 1.1722916645091805, "grad_norm": 0.23354941606521606, "learning_rate": 0.000109991157267401, "loss": 1.8623, "step": 113200 }, { "epoch": 1.1733272578523866, "grad_norm": 0.2438482940196991, "learning_rate": 0.0001097560428795043, "loss": 1.8526, "step": 113300 }, { "epoch": 1.1743628511955926, "grad_norm": 0.3564881980419159, "learning_rate": 0.00010952103498516477, "loss": 1.8506, "step": 113400 }, { "epoch": 1.1753984445387986, "grad_norm": 0.2541187107563019, "learning_rate": 0.00010928848268178658, "loss": 1.8662, "step": 113500 }, { "epoch": 1.1764340378820044, "grad_norm": 0.25882384181022644, "learning_rate": 0.00010905368855946517, "loss": 1.8448, "step": 113600 }, { "epoch": 1.1774696312252104, "grad_norm": 0.2368178367614746, "learning_rate": 0.00010881900278927084, "loss": 1.8448, "step": 113700 }, { "epoch": 1.1785052245684164, "grad_norm": 0.23330169916152954, "learning_rate": 0.00010858442599222907, "loss": 1.8572, "step": 113800 }, { "epoch": 1.1795408179116225, "grad_norm": 0.27821797132492065, "learning_rate": 0.00010834995878907694, "loss": 1.8654, "step": 113900 }, { "epoch": 1.1805764112548285, "grad_norm": 0.2460658997297287, "learning_rate": 0.00010811560180026162, "loss": 1.8435, "step": 114000 }, { "epoch": 1.1816120045980345, "grad_norm": 0.23841862380504608, "learning_rate": 0.00010788135564593847, "loss": 1.8492, "step": 114100 }, { "epoch": 1.1826475979412405, "grad_norm": 0.2332884967327118, "learning_rate": 0.00010764722094596975, "loss": 1.8322, "step": 114200 }, { "epoch": 1.1836831912844463, "grad_norm": 0.23410117626190186, "learning_rate": 0.00010741319831992256, "loss": 1.8589, "step": 114300 }, { "epoch": 1.1847187846276523, "grad_norm": 0.241334930062294, "learning_rate": 0.0001071792883870677, "loss": 1.8768, "step": 114400 }, { "epoch": 1.1857543779708584, "grad_norm": 0.23932315409183502, "learning_rate": 0.00010694549176637755, "loss": 1.8683, "step": 114500 }, { "epoch": 1.1867899713140644, "grad_norm": 0.2460791915655136, "learning_rate": 0.00010671180907652468, "loss": 1.8548, "step": 114600 }, { "epoch": 1.1878255646572704, "grad_norm": 0.23848845064640045, "learning_rate": 0.00010647824093588029, "loss": 1.8426, "step": 114700 }, { "epoch": 1.1888611580004764, "grad_norm": 0.23898836970329285, "learning_rate": 0.00010624478796251232, "loss": 1.8485, "step": 114800 }, { "epoch": 1.1898967513436824, "grad_norm": 0.23634620010852814, "learning_rate": 0.00010601145077418408, "loss": 1.8514, "step": 114900 }, { "epoch": 1.1909323446868885, "grad_norm": 0.23557230830192566, "learning_rate": 0.0001057782299883523, "loss": 1.8562, "step": 115000 }, { "epoch": 1.1919679380300943, "grad_norm": 0.23655882477760315, "learning_rate": 0.00010554512622216597, "loss": 1.8564, "step": 115100 }, { "epoch": 1.1930035313733003, "grad_norm": 0.2394469678401947, "learning_rate": 0.00010531214009246416, "loss": 1.8731, "step": 115200 }, { "epoch": 1.1940391247165063, "grad_norm": 0.2503461539745331, "learning_rate": 0.00010507927221577478, "loss": 1.8623, "step": 115300 }, { "epoch": 1.1950747180597123, "grad_norm": 0.2489718645811081, "learning_rate": 0.00010484652320831278, "loss": 1.8547, "step": 115400 }, { "epoch": 1.1961103114029183, "grad_norm": 0.23475505411624908, "learning_rate": 0.00010461389368597862, "loss": 1.8667, "step": 115500 }, { "epoch": 1.1971459047461244, "grad_norm": 0.2368912398815155, "learning_rate": 0.00010438138426435647, "loss": 1.8494, "step": 115600 }, { "epoch": 1.1981814980893302, "grad_norm": 0.23997525870800018, "learning_rate": 0.00010414899555871277, "loss": 1.8497, "step": 115700 }, { "epoch": 1.1992170914325362, "grad_norm": 0.23326918482780457, "learning_rate": 0.00010391672818399455, "loss": 1.8421, "step": 115800 }, { "epoch": 1.2002526847757422, "grad_norm": 0.23169055581092834, "learning_rate": 0.00010368458275482762, "loss": 1.8565, "step": 115900 }, { "epoch": 1.2012882781189482, "grad_norm": 0.23461148142814636, "learning_rate": 0.00010345255988551528, "loss": 1.8326, "step": 116000 }, { "epoch": 1.2023238714621542, "grad_norm": 0.24423667788505554, "learning_rate": 0.00010322066019003639, "loss": 1.8585, "step": 116100 }, { "epoch": 1.2033594648053603, "grad_norm": 0.23528601229190826, "learning_rate": 0.00010298888428204395, "loss": 1.8605, "step": 116200 }, { "epoch": 1.2043950581485663, "grad_norm": 0.23885977268218994, "learning_rate": 0.00010275723277486331, "loss": 1.858, "step": 116300 }, { "epoch": 1.2054306514917723, "grad_norm": 0.2440977841615677, "learning_rate": 0.00010252570628149076, "loss": 1.8579, "step": 116400 }, { "epoch": 1.206466244834978, "grad_norm": 0.2509276568889618, "learning_rate": 0.00010229430541459157, "loss": 1.8579, "step": 116500 }, { "epoch": 1.2075018381781841, "grad_norm": 0.28732961416244507, "learning_rate": 0.00010206303078649883, "loss": 1.8557, "step": 116600 }, { "epoch": 1.2085374315213901, "grad_norm": 0.2555074095726013, "learning_rate": 0.00010183188300921143, "loss": 1.8426, "step": 116700 }, { "epoch": 1.2095730248645962, "grad_norm": 0.26921746134757996, "learning_rate": 0.00010160086269439246, "loss": 1.8414, "step": 116800 }, { "epoch": 1.2106086182078022, "grad_norm": 0.24056534469127655, "learning_rate": 0.00010136997045336803, "loss": 1.8423, "step": 116900 }, { "epoch": 1.2116442115510082, "grad_norm": 0.23481959104537964, "learning_rate": 0.00010113920689712506, "loss": 1.8505, "step": 117000 }, { "epoch": 1.2126798048942142, "grad_norm": 0.2427511364221573, "learning_rate": 0.00010090857263631016, "loss": 1.8582, "step": 117100 }, { "epoch": 1.21371539823742, "grad_norm": 0.26116400957107544, "learning_rate": 0.00010067806828122762, "loss": 1.8538, "step": 117200 }, { "epoch": 1.214750991580626, "grad_norm": 0.23690100014209747, "learning_rate": 0.0001004476944418381, "loss": 1.8664, "step": 117300 }, { "epoch": 1.215786584923832, "grad_norm": 0.22852669656276703, "learning_rate": 0.00010021745172775682, "loss": 1.8485, "step": 117400 }, { "epoch": 1.216822178267038, "grad_norm": 0.23307280242443085, "learning_rate": 9.998734074825209e-05, "loss": 1.8471, "step": 117500 }, { "epoch": 1.217857771610244, "grad_norm": 0.23883013427257538, "learning_rate": 9.975736211224344e-05, "loss": 1.8538, "step": 117600 }, { "epoch": 1.2188933649534501, "grad_norm": 0.2350393384695053, "learning_rate": 9.952751642830047e-05, "loss": 1.8462, "step": 117700 }, { "epoch": 1.2199289582966562, "grad_norm": 0.23245997726917267, "learning_rate": 9.929780430464075e-05, "loss": 1.8326, "step": 117800 }, { "epoch": 1.2209645516398622, "grad_norm": 0.2444833666086197, "learning_rate": 9.906822634912843e-05, "loss": 1.843, "step": 117900 }, { "epoch": 1.222000144983068, "grad_norm": 0.24783264100551605, "learning_rate": 9.883878316927282e-05, "loss": 1.8467, "step": 118000 }, { "epoch": 1.223035738326274, "grad_norm": 0.2404903918504715, "learning_rate": 9.860947537222631e-05, "loss": 1.848, "step": 118100 }, { "epoch": 1.22407133166948, "grad_norm": 0.25085940957069397, "learning_rate": 9.838030356478333e-05, "loss": 1.8367, "step": 118200 }, { "epoch": 1.225106925012686, "grad_norm": 0.231583833694458, "learning_rate": 9.81512683533781e-05, "loss": 1.843, "step": 118300 }, { "epoch": 1.226142518355892, "grad_norm": 0.240120992064476, "learning_rate": 9.792237034408381e-05, "loss": 1.8563, "step": 118400 }, { "epoch": 1.227178111699098, "grad_norm": 0.22982698678970337, "learning_rate": 9.769361014261021e-05, "loss": 1.8406, "step": 118500 }, { "epoch": 1.2282137050423039, "grad_norm": 0.25001078844070435, "learning_rate": 9.746727388505374e-05, "loss": 1.8626, "step": 118600 }, { "epoch": 1.22924929838551, "grad_norm": 0.25120049715042114, "learning_rate": 9.723878972171611e-05, "loss": 1.856, "step": 118700 }, { "epoch": 1.230284891728716, "grad_norm": 0.24044248461723328, "learning_rate": 9.70104451750903e-05, "loss": 1.8367, "step": 118800 }, { "epoch": 1.231320485071922, "grad_norm": 0.23974071443080902, "learning_rate": 9.678224084942154e-05, "loss": 1.8394, "step": 118900 }, { "epoch": 1.232356078415128, "grad_norm": 0.2431887537240982, "learning_rate": 9.655645728452762e-05, "loss": 1.8411, "step": 119000 }, { "epoch": 1.233391671758334, "grad_norm": 0.24696439504623413, "learning_rate": 9.63285337947537e-05, "loss": 1.8523, "step": 119100 }, { "epoch": 1.23442726510154, "grad_norm": 0.23212474584579468, "learning_rate": 9.610075233041069e-05, "loss": 1.8628, "step": 119200 }, { "epoch": 1.235462858444746, "grad_norm": 0.23137667775154114, "learning_rate": 9.587311349425391e-05, "loss": 1.8321, "step": 119300 }, { "epoch": 1.2364984517879518, "grad_norm": 0.2395658791065216, "learning_rate": 9.564561788866121e-05, "loss": 1.8561, "step": 119400 }, { "epoch": 1.2375340451311578, "grad_norm": 0.2744392156600952, "learning_rate": 9.54182661156315e-05, "loss": 1.8303, "step": 119500 }, { "epoch": 1.2385696384743639, "grad_norm": 0.2632865011692047, "learning_rate": 9.519105877678295e-05, "loss": 1.8573, "step": 119600 }, { "epoch": 1.2396052318175699, "grad_norm": 0.24812206625938416, "learning_rate": 9.496399647335173e-05, "loss": 1.8333, "step": 119700 }, { "epoch": 1.240640825160776, "grad_norm": 0.24596838653087616, "learning_rate": 9.473707980618992e-05, "loss": 1.8534, "step": 119800 }, { "epoch": 1.241676418503982, "grad_norm": 0.2314409464597702, "learning_rate": 9.451030937576458e-05, "loss": 1.8349, "step": 119900 }, { "epoch": 1.242712011847188, "grad_norm": 0.25256529450416565, "learning_rate": 9.428368578215549e-05, "loss": 1.8537, "step": 120000 }, { "epoch": 1.2437476051903937, "grad_norm": 0.2499077320098877, "learning_rate": 9.405720962505398e-05, "loss": 1.8475, "step": 120100 }, { "epoch": 1.2447831985335998, "grad_norm": 0.26463133096694946, "learning_rate": 9.383088150376132e-05, "loss": 1.8488, "step": 120200 }, { "epoch": 1.2458187918768058, "grad_norm": 0.23381349444389343, "learning_rate": 9.360470201718692e-05, "loss": 1.841, "step": 120300 }, { "epoch": 1.2468543852200118, "grad_norm": 0.2297975718975067, "learning_rate": 9.337867176384698e-05, "loss": 1.8451, "step": 120400 }, { "epoch": 1.2478899785632178, "grad_norm": 0.2347477227449417, "learning_rate": 9.315279134186265e-05, "loss": 1.8393, "step": 120500 }, { "epoch": 1.2489255719064238, "grad_norm": 0.24396130442619324, "learning_rate": 9.29270613489588e-05, "loss": 1.8314, "step": 120600 }, { "epoch": 1.2499611652496299, "grad_norm": 0.23611082136631012, "learning_rate": 9.270148238246202e-05, "loss": 1.8244, "step": 120700 }, { "epoch": 1.2509967585928359, "grad_norm": 0.23190715909004211, "learning_rate": 9.247605503929945e-05, "loss": 1.8414, "step": 120800 }, { "epoch": 1.2520323519360417, "grad_norm": 0.24579507112503052, "learning_rate": 9.225077991599684e-05, "loss": 1.8328, "step": 120900 }, { "epoch": 1.2530679452792477, "grad_norm": 0.22648024559020996, "learning_rate": 9.202565760867726e-05, "loss": 1.8467, "step": 121000 }, { "epoch": 1.2541035386224537, "grad_norm": 0.23870104551315308, "learning_rate": 9.180068871305938e-05, "loss": 1.8199, "step": 121100 }, { "epoch": 1.2551391319656597, "grad_norm": 0.25623899698257446, "learning_rate": 9.157587382445572e-05, "loss": 1.8267, "step": 121200 }, { "epoch": 1.2561747253088658, "grad_norm": 0.24756823480129242, "learning_rate": 9.135121353777164e-05, "loss": 1.8444, "step": 121300 }, { "epoch": 1.2572103186520718, "grad_norm": 0.23233170807361603, "learning_rate": 9.112670844750302e-05, "loss": 1.8405, "step": 121400 }, { "epoch": 1.2582459119952776, "grad_norm": 0.23981282114982605, "learning_rate": 9.090235914773527e-05, "loss": 1.851, "step": 121500 }, { "epoch": 1.2592815053384836, "grad_norm": 0.24224667251110077, "learning_rate": 9.067816623214148e-05, "loss": 1.8521, "step": 121600 }, { "epoch": 1.2603170986816896, "grad_norm": 0.23124560713768005, "learning_rate": 9.045413029398099e-05, "loss": 1.8444, "step": 121700 }, { "epoch": 1.2613526920248956, "grad_norm": 0.2451067417860031, "learning_rate": 9.02302519260976e-05, "loss": 1.8382, "step": 121800 }, { "epoch": 1.2623882853681017, "grad_norm": 0.23551911115646362, "learning_rate": 9.00065317209183e-05, "loss": 1.8365, "step": 121900 }, { "epoch": 1.2634238787113077, "grad_norm": 0.22939565777778625, "learning_rate": 8.978297027045144e-05, "loss": 1.8311, "step": 122000 }, { "epoch": 1.2644594720545137, "grad_norm": 0.24728162586688995, "learning_rate": 8.95595681662854e-05, "loss": 1.8368, "step": 122100 }, { "epoch": 1.2654950653977197, "grad_norm": 0.23099398612976074, "learning_rate": 8.933632599958673e-05, "loss": 1.8402, "step": 122200 }, { "epoch": 1.2665306587409257, "grad_norm": 0.24729430675506592, "learning_rate": 8.911324436109885e-05, "loss": 1.8424, "step": 122300 }, { "epoch": 1.2675662520841315, "grad_norm": 0.2498714029788971, "learning_rate": 8.88903238411404e-05, "loss": 1.8371, "step": 122400 }, { "epoch": 1.2686018454273376, "grad_norm": 0.23077557981014252, "learning_rate": 8.86675650296036e-05, "loss": 1.833, "step": 122500 }, { "epoch": 1.2696374387705436, "grad_norm": 0.24345731735229492, "learning_rate": 8.844496851595291e-05, "loss": 1.8328, "step": 122600 }, { "epoch": 1.2706730321137496, "grad_norm": 0.23817458748817444, "learning_rate": 8.82225348892231e-05, "loss": 1.8301, "step": 122700 }, { "epoch": 1.2717086254569556, "grad_norm": 0.23893265426158905, "learning_rate": 8.800026473801814e-05, "loss": 1.8466, "step": 122800 }, { "epoch": 1.2727442188001614, "grad_norm": 0.23662212491035461, "learning_rate": 8.777815865050914e-05, "loss": 1.8337, "step": 122900 }, { "epoch": 1.2737798121433674, "grad_norm": 0.2303624451160431, "learning_rate": 8.755621721443337e-05, "loss": 1.8455, "step": 123000 }, { "epoch": 1.2748154054865735, "grad_norm": 0.2392922192811966, "learning_rate": 8.733444101709212e-05, "loss": 1.8293, "step": 123100 }, { "epoch": 1.2758509988297795, "grad_norm": 0.22916318476200104, "learning_rate": 8.711504592630449e-05, "loss": 1.8315, "step": 123200 }, { "epoch": 1.2768865921729855, "grad_norm": 0.24341729283332825, "learning_rate": 8.689360029956446e-05, "loss": 1.8241, "step": 123300 }, { "epoch": 1.2779221855161915, "grad_norm": 0.24398744106292725, "learning_rate": 8.667232166497586e-05, "loss": 1.8465, "step": 123400 }, { "epoch": 1.2789577788593975, "grad_norm": 0.24452991783618927, "learning_rate": 8.645121060808624e-05, "loss": 1.8355, "step": 123500 }, { "epoch": 1.2799933722026036, "grad_norm": 0.24184665083885193, "learning_rate": 8.623026771399956e-05, "loss": 1.8435, "step": 123600 }, { "epoch": 1.2810289655458096, "grad_norm": 0.24016697704792023, "learning_rate": 8.600949356737495e-05, "loss": 1.8256, "step": 123700 }, { "epoch": 1.2820645588890154, "grad_norm": 0.24616780877113342, "learning_rate": 8.578888875242494e-05, "loss": 1.8471, "step": 123800 }, { "epoch": 1.2831001522322214, "grad_norm": 0.2502405643463135, "learning_rate": 8.556845385291406e-05, "loss": 1.8186, "step": 123900 }, { "epoch": 1.2841357455754274, "grad_norm": 0.23341676592826843, "learning_rate": 8.534818945215699e-05, "loss": 1.8365, "step": 124000 }, { "epoch": 1.2851713389186334, "grad_norm": 0.23874728381633759, "learning_rate": 8.512809613301747e-05, "loss": 1.8259, "step": 124100 }, { "epoch": 1.2862069322618395, "grad_norm": 0.24095730483531952, "learning_rate": 8.490817447790637e-05, "loss": 1.8366, "step": 124200 }, { "epoch": 1.2872425256050455, "grad_norm": 0.2649783492088318, "learning_rate": 8.468842506878039e-05, "loss": 1.8129, "step": 124300 }, { "epoch": 1.2882781189482513, "grad_norm": 0.24592097103595734, "learning_rate": 8.446884848714043e-05, "loss": 1.8309, "step": 124400 }, { "epoch": 1.2893137122914573, "grad_norm": 0.3560178875923157, "learning_rate": 8.424944531402997e-05, "loss": 1.8296, "step": 124500 }, { "epoch": 1.2903493056346633, "grad_norm": 0.2414814978837967, "learning_rate": 8.403021613003374e-05, "loss": 1.8351, "step": 124600 }, { "epoch": 1.2913848989778693, "grad_norm": 0.24838803708553314, "learning_rate": 8.381116151527589e-05, "loss": 1.8251, "step": 124700 }, { "epoch": 1.2924204923210754, "grad_norm": 0.2467070072889328, "learning_rate": 8.359228204941886e-05, "loss": 1.8337, "step": 124800 }, { "epoch": 1.2934560856642814, "grad_norm": 0.220038503408432, "learning_rate": 8.337357831166138e-05, "loss": 1.8215, "step": 124900 }, { "epoch": 1.2944916790074874, "grad_norm": 0.24743519723415375, "learning_rate": 8.315505088073738e-05, "loss": 1.8247, "step": 125000 }, { "epoch": 1.2955272723506934, "grad_norm": 0.23501497507095337, "learning_rate": 8.293670033491402e-05, "loss": 1.8122, "step": 125100 }, { "epoch": 1.2965628656938994, "grad_norm": 0.24010924994945526, "learning_rate": 8.271852725199052e-05, "loss": 1.8286, "step": 125200 }, { "epoch": 1.2975984590371052, "grad_norm": 0.24931704998016357, "learning_rate": 8.250053220929652e-05, "loss": 1.8207, "step": 125300 }, { "epoch": 1.2986340523803113, "grad_norm": 0.256802499294281, "learning_rate": 8.228271578369052e-05, "loss": 1.829, "step": 125400 }, { "epoch": 1.2996696457235173, "grad_norm": 0.23014986515045166, "learning_rate": 8.206507855155832e-05, "loss": 1.835, "step": 125500 }, { "epoch": 1.3007052390667233, "grad_norm": 0.24331720173358917, "learning_rate": 8.184762108881145e-05, "loss": 1.8234, "step": 125600 }, { "epoch": 1.3017408324099293, "grad_norm": 0.23897334933280945, "learning_rate": 8.16303439708859e-05, "loss": 1.8319, "step": 125700 }, { "epoch": 1.3027764257531351, "grad_norm": 0.2967206537723541, "learning_rate": 8.14132477727404e-05, "loss": 1.8168, "step": 125800 }, { "epoch": 1.3038120190963411, "grad_norm": 0.2319442629814148, "learning_rate": 8.119850131561205e-05, "loss": 1.8195, "step": 125900 }, { "epoch": 1.3048476124395472, "grad_norm": 0.24683183431625366, "learning_rate": 8.098176685646383e-05, "loss": 1.8332, "step": 126000 }, { "epoch": 1.3058832057827532, "grad_norm": 0.23843346536159515, "learning_rate": 8.076521503336037e-05, "loss": 1.8198, "step": 126100 }, { "epoch": 1.3069187991259592, "grad_norm": 0.23205947875976562, "learning_rate": 8.05488464193411e-05, "loss": 1.8175, "step": 126200 }, { "epoch": 1.3079543924691652, "grad_norm": 0.2476016879081726, "learning_rate": 8.033266158696067e-05, "loss": 1.8349, "step": 126300 }, { "epoch": 1.3089899858123712, "grad_norm": 0.22830481827259064, "learning_rate": 8.011666110828724e-05, "loss": 1.8228, "step": 126400 }, { "epoch": 1.3100255791555773, "grad_norm": 0.26087597012519836, "learning_rate": 7.990084555490141e-05, "loss": 1.8261, "step": 126500 }, { "epoch": 1.3110611724987833, "grad_norm": 0.26444897055625916, "learning_rate": 7.96852154978942e-05, "loss": 1.819, "step": 126600 }, { "epoch": 1.312096765841989, "grad_norm": 0.22989632189273834, "learning_rate": 7.946977150786582e-05, "loss": 1.8272, "step": 126700 }, { "epoch": 1.313132359185195, "grad_norm": 0.2406211495399475, "learning_rate": 7.925451415492428e-05, "loss": 1.8303, "step": 126800 }, { "epoch": 1.3141679525284011, "grad_norm": 0.22954131662845612, "learning_rate": 7.903944400868344e-05, "loss": 1.8319, "step": 126900 }, { "epoch": 1.3152035458716071, "grad_norm": 0.23300564289093018, "learning_rate": 7.882456163826202e-05, "loss": 1.8095, "step": 127000 }, { "epoch": 1.3162391392148132, "grad_norm": 0.23716017603874207, "learning_rate": 7.860986761228157e-05, "loss": 1.8243, "step": 127100 }, { "epoch": 1.3172747325580192, "grad_norm": 0.23768557608127594, "learning_rate": 7.839536249886563e-05, "loss": 1.8198, "step": 127200 }, { "epoch": 1.318310325901225, "grad_norm": 0.24191433191299438, "learning_rate": 7.818104686563746e-05, "loss": 1.8192, "step": 127300 }, { "epoch": 1.319345919244431, "grad_norm": 0.24112993478775024, "learning_rate": 7.796692127971917e-05, "loss": 1.8257, "step": 127400 }, { "epoch": 1.320381512587637, "grad_norm": 0.23616741597652435, "learning_rate": 7.775298630772978e-05, "loss": 1.8197, "step": 127500 }, { "epoch": 1.321417105930843, "grad_norm": 0.2344331294298172, "learning_rate": 7.753924251578404e-05, "loss": 1.8179, "step": 127600 }, { "epoch": 1.322452699274049, "grad_norm": 0.2370009571313858, "learning_rate": 7.732569046949085e-05, "loss": 1.8057, "step": 127700 }, { "epoch": 1.323488292617255, "grad_norm": 0.24072659015655518, "learning_rate": 7.711233073395147e-05, "loss": 1.8304, "step": 127800 }, { "epoch": 1.3245238859604611, "grad_norm": 0.24806253612041473, "learning_rate": 7.689916387375855e-05, "loss": 1.8281, "step": 127900 }, { "epoch": 1.3255594793036671, "grad_norm": 0.26090526580810547, "learning_rate": 7.668619045299412e-05, "loss": 1.8225, "step": 128000 }, { "epoch": 1.3265950726468732, "grad_norm": 0.24256770312786102, "learning_rate": 7.64734110352285e-05, "loss": 1.8249, "step": 128100 }, { "epoch": 1.327630665990079, "grad_norm": 0.23833513259887695, "learning_rate": 7.626082618351854e-05, "loss": 1.8276, "step": 128200 }, { "epoch": 1.328666259333285, "grad_norm": 0.24293087422847748, "learning_rate": 7.604843646040634e-05, "loss": 1.8132, "step": 128300 }, { "epoch": 1.329701852676491, "grad_norm": 0.2451597899198532, "learning_rate": 7.583624242791745e-05, "loss": 1.8337, "step": 128400 }, { "epoch": 1.330737446019697, "grad_norm": 0.25867322087287903, "learning_rate": 7.562424464755976e-05, "loss": 1.8235, "step": 128500 }, { "epoch": 1.331773039362903, "grad_norm": 0.23849640786647797, "learning_rate": 7.541244368032183e-05, "loss": 1.8127, "step": 128600 }, { "epoch": 1.3328086327061088, "grad_norm": 0.2913399636745453, "learning_rate": 7.520084008667137e-05, "loss": 1.8284, "step": 128700 }, { "epoch": 1.3338442260493149, "grad_norm": 0.246061310172081, "learning_rate": 7.498943442655377e-05, "loss": 1.8367, "step": 128800 }, { "epoch": 1.3348798193925209, "grad_norm": 0.23302733898162842, "learning_rate": 7.47782272593906e-05, "loss": 1.8178, "step": 128900 }, { "epoch": 1.335915412735727, "grad_norm": 0.24501657485961914, "learning_rate": 7.456721914407832e-05, "loss": 1.8152, "step": 129000 }, { "epoch": 1.336951006078933, "grad_norm": 0.25347840785980225, "learning_rate": 7.43564106389866e-05, "loss": 1.8327, "step": 129100 }, { "epoch": 1.337986599422139, "grad_norm": 0.2331436276435852, "learning_rate": 7.414790739266496e-05, "loss": 1.8556, "step": 129200 }, { "epoch": 1.339022192765345, "grad_norm": 0.22294217348098755, "learning_rate": 7.393749777099832e-05, "loss": 1.8246, "step": 129300 }, { "epoch": 1.340057786108551, "grad_norm": 0.23584097623825073, "learning_rate": 7.3727289425921e-05, "loss": 1.8149, "step": 129400 }, { "epoch": 1.341093379451757, "grad_norm": 0.23438577353954315, "learning_rate": 7.351728291368607e-05, "loss": 1.8323, "step": 129500 }, { "epoch": 1.3421289727949628, "grad_norm": 0.23464149236679077, "learning_rate": 7.330957582760294e-05, "loss": 1.8026, "step": 129600 }, { "epoch": 1.3431645661381688, "grad_norm": 0.25033196806907654, "learning_rate": 7.309997261549137e-05, "loss": 1.815, "step": 129700 }, { "epoch": 1.3442001594813748, "grad_norm": 0.2338024526834488, "learning_rate": 7.289057289622811e-05, "loss": 1.8215, "step": 129800 }, { "epoch": 1.3452357528245809, "grad_norm": 0.2362312227487564, "learning_rate": 7.26813772239266e-05, "loss": 1.8346, "step": 129900 }, { "epoch": 1.3462713461677869, "grad_norm": 0.24308595061302185, "learning_rate": 7.247238615216032e-05, "loss": 1.8262, "step": 130000 }, { "epoch": 1.347306939510993, "grad_norm": 0.2429390400648117, "learning_rate": 7.226360023396166e-05, "loss": 1.808, "step": 130100 }, { "epoch": 1.3483425328541987, "grad_norm": 0.2305445820093155, "learning_rate": 7.205502002181972e-05, "loss": 1.8141, "step": 130200 }, { "epoch": 1.3493781261974047, "grad_norm": 0.2584957480430603, "learning_rate": 7.184664606767958e-05, "loss": 1.8112, "step": 130300 }, { "epoch": 1.3504137195406107, "grad_norm": 0.24177442491054535, "learning_rate": 7.163847892294025e-05, "loss": 1.8219, "step": 130400 }, { "epoch": 1.3514493128838168, "grad_norm": 0.24516649544239044, "learning_rate": 7.143051913845371e-05, "loss": 1.8354, "step": 130500 }, { "epoch": 1.3524849062270228, "grad_norm": 0.25100788474082947, "learning_rate": 7.122276726452309e-05, "loss": 1.8037, "step": 130600 }, { "epoch": 1.3535204995702288, "grad_norm": 0.2580251395702362, "learning_rate": 7.101522385090146e-05, "loss": 1.8176, "step": 130700 }, { "epoch": 1.3545560929134348, "grad_norm": 0.24578389525413513, "learning_rate": 7.080788944679009e-05, "loss": 1.827, "step": 130800 }, { "epoch": 1.3555916862566408, "grad_norm": 0.24453017115592957, "learning_rate": 7.060076460083738e-05, "loss": 1.8281, "step": 130900 }, { "epoch": 1.3566272795998469, "grad_norm": 0.2459542155265808, "learning_rate": 7.039384986113694e-05, "loss": 1.8131, "step": 131000 }, { "epoch": 1.3576628729430527, "grad_norm": 0.23702123761177063, "learning_rate": 7.018714577522663e-05, "loss": 1.8232, "step": 131100 }, { "epoch": 1.3586984662862587, "grad_norm": 0.23571869730949402, "learning_rate": 6.998065289008682e-05, "loss": 1.8073, "step": 131200 }, { "epoch": 1.3597340596294647, "grad_norm": 0.24064995348453522, "learning_rate": 6.97743717521389e-05, "loss": 1.8271, "step": 131300 }, { "epoch": 1.3607696529726707, "grad_norm": 0.24271716177463531, "learning_rate": 6.956830290724403e-05, "loss": 1.8207, "step": 131400 }, { "epoch": 1.3618052463158767, "grad_norm": 0.24056479334831238, "learning_rate": 6.936244690070159e-05, "loss": 1.81, "step": 131500 }, { "epoch": 1.3628408396590825, "grad_norm": 0.24118492007255554, "learning_rate": 6.91568042772478e-05, "loss": 1.8165, "step": 131600 }, { "epoch": 1.3638764330022886, "grad_norm": 0.24606949090957642, "learning_rate": 6.895137558105401e-05, "loss": 1.8259, "step": 131700 }, { "epoch": 1.3649120263454946, "grad_norm": 0.2296934276819229, "learning_rate": 6.874616135572576e-05, "loss": 1.8034, "step": 131800 }, { "epoch": 1.3659476196887006, "grad_norm": 0.29468992352485657, "learning_rate": 6.854116214430081e-05, "loss": 1.8063, "step": 131900 }, { "epoch": 1.3669832130319066, "grad_norm": 0.24799683690071106, "learning_rate": 6.833637848924813e-05, "loss": 1.8317, "step": 132000 }, { "epoch": 1.3680188063751126, "grad_norm": 0.23728682100772858, "learning_rate": 6.813181093246624e-05, "loss": 1.8137, "step": 132100 }, { "epoch": 1.3690543997183187, "grad_norm": 0.24109381437301636, "learning_rate": 6.79274600152817e-05, "loss": 1.8145, "step": 132200 }, { "epoch": 1.3700899930615247, "grad_norm": 0.23594675958156586, "learning_rate": 6.772332627844804e-05, "loss": 1.7994, "step": 132300 }, { "epoch": 1.3711255864047307, "grad_norm": 0.2506548762321472, "learning_rate": 6.751941026214373e-05, "loss": 1.8272, "step": 132400 }, { "epoch": 1.3721611797479365, "grad_norm": 0.2326715886592865, "learning_rate": 6.731571250597157e-05, "loss": 1.8368, "step": 132500 }, { "epoch": 1.3731967730911425, "grad_norm": 0.236336350440979, "learning_rate": 6.711223354895637e-05, "loss": 1.8081, "step": 132600 }, { "epoch": 1.3742323664343485, "grad_norm": 0.25050321221351624, "learning_rate": 6.690897392954425e-05, "loss": 1.8085, "step": 132700 }, { "epoch": 1.3752679597775546, "grad_norm": 0.8197193145751953, "learning_rate": 6.67059341856007e-05, "loss": 1.8035, "step": 132800 }, { "epoch": 1.3763035531207606, "grad_norm": 0.23058059811592102, "learning_rate": 6.650311485440952e-05, "loss": 1.8112, "step": 132900 }, { "epoch": 1.3773391464639666, "grad_norm": 0.24961444735527039, "learning_rate": 6.630051647267123e-05, "loss": 1.7953, "step": 133000 }, { "epoch": 1.3783747398071724, "grad_norm": 0.23716160655021667, "learning_rate": 6.60981395765017e-05, "loss": 1.8195, "step": 133100 }, { "epoch": 1.3794103331503784, "grad_norm": 0.23246720433235168, "learning_rate": 6.589598470143063e-05, "loss": 1.8013, "step": 133200 }, { "epoch": 1.3804459264935844, "grad_norm": 0.24064932763576508, "learning_rate": 6.569405238240015e-05, "loss": 1.8254, "step": 133300 }, { "epoch": 1.3814815198367905, "grad_norm": 0.25524094700813293, "learning_rate": 6.549234315376363e-05, "loss": 1.8107, "step": 133400 }, { "epoch": 1.3825171131799965, "grad_norm": 0.23669590055942535, "learning_rate": 6.5290857549284e-05, "loss": 1.8086, "step": 133500 }, { "epoch": 1.3835527065232025, "grad_norm": 0.23468001186847687, "learning_rate": 6.508959610213251e-05, "loss": 1.8292, "step": 133600 }, { "epoch": 1.3845882998664085, "grad_norm": 0.2441907674074173, "learning_rate": 6.488855934488712e-05, "loss": 1.8107, "step": 133700 }, { "epoch": 1.3856238932096145, "grad_norm": 0.24992570281028748, "learning_rate": 6.46877478095313e-05, "loss": 1.8151, "step": 133800 }, { "epoch": 1.3866594865528206, "grad_norm": 0.23866009712219238, "learning_rate": 6.448716202745254e-05, "loss": 1.7957, "step": 133900 }, { "epoch": 1.3876950798960264, "grad_norm": 0.2343151718378067, "learning_rate": 6.428880500257358e-05, "loss": 1.8146, "step": 134000 }, { "epoch": 1.3887306732392324, "grad_norm": 0.2239018827676773, "learning_rate": 6.408867004805533e-05, "loss": 1.7844, "step": 134100 }, { "epoch": 1.3897662665824384, "grad_norm": 0.2355658859014511, "learning_rate": 6.388876243209373e-05, "loss": 1.8247, "step": 134200 }, { "epoch": 1.3908018599256444, "grad_norm": 0.24689391255378723, "learning_rate": 6.36890826836842e-05, "loss": 1.7997, "step": 134300 }, { "epoch": 1.3918374532688504, "grad_norm": 0.23993763327598572, "learning_rate": 6.348963133121932e-05, "loss": 1.8189, "step": 134400 }, { "epoch": 1.3928730466120562, "grad_norm": 0.2346121370792389, "learning_rate": 6.329040890248734e-05, "loss": 1.8094, "step": 134500 }, { "epoch": 1.3939086399552623, "grad_norm": 0.24085171520709991, "learning_rate": 6.309141592467047e-05, "loss": 1.8059, "step": 134600 }, { "epoch": 1.3949442332984683, "grad_norm": 0.2389710694551468, "learning_rate": 6.289265292434406e-05, "loss": 1.8142, "step": 134700 }, { "epoch": 1.3959798266416743, "grad_norm": 0.24992823600769043, "learning_rate": 6.26941204274746e-05, "loss": 1.8055, "step": 134800 }, { "epoch": 1.3970154199848803, "grad_norm": 0.2472657710313797, "learning_rate": 6.249581895941897e-05, "loss": 1.8171, "step": 134900 }, { "epoch": 1.3980510133280863, "grad_norm": 0.22984735667705536, "learning_rate": 6.229774904492235e-05, "loss": 1.7997, "step": 135000 }, { "epoch": 1.3990866066712924, "grad_norm": 0.23540957272052765, "learning_rate": 6.209991120811744e-05, "loss": 1.8262, "step": 135100 }, { "epoch": 1.4001222000144984, "grad_norm": 0.23757830262184143, "learning_rate": 6.190230597252259e-05, "loss": 1.8203, "step": 135200 }, { "epoch": 1.4011577933577044, "grad_norm": 0.24814659357070923, "learning_rate": 6.170493386104086e-05, "loss": 1.8237, "step": 135300 }, { "epoch": 1.4021933867009102, "grad_norm": 0.2580169141292572, "learning_rate": 6.15077953959583e-05, "loss": 1.8198, "step": 135400 }, { "epoch": 1.4032289800441162, "grad_norm": 0.24208001792430878, "learning_rate": 6.131089109894269e-05, "loss": 1.7969, "step": 135500 }, { "epoch": 1.4042645733873222, "grad_norm": 0.2293931245803833, "learning_rate": 6.111422149104221e-05, "loss": 1.8016, "step": 135600 }, { "epoch": 1.4053001667305283, "grad_norm": 0.2415732741355896, "learning_rate": 6.091778709268389e-05, "loss": 1.8176, "step": 135700 }, { "epoch": 1.4063357600737343, "grad_norm": 0.2541373372077942, "learning_rate": 6.072158842367248e-05, "loss": 1.8016, "step": 135800 }, { "epoch": 1.4073713534169403, "grad_norm": 0.29783177375793457, "learning_rate": 6.052562600318886e-05, "loss": 1.794, "step": 135900 }, { "epoch": 1.408406946760146, "grad_norm": 0.22567243874073029, "learning_rate": 6.032990034978885e-05, "loss": 1.8002, "step": 136000 }, { "epoch": 1.4094425401033521, "grad_norm": 0.26253440976142883, "learning_rate": 6.013441198140155e-05, "loss": 1.794, "step": 136100 }, { "epoch": 1.4104781334465581, "grad_norm": 0.24990223348140717, "learning_rate": 5.993916141532832e-05, "loss": 1.806, "step": 136200 }, { "epoch": 1.4115137267897642, "grad_norm": 0.25153446197509766, "learning_rate": 5.974414916824121e-05, "loss": 1.8057, "step": 136300 }, { "epoch": 1.4125493201329702, "grad_norm": 0.24260573089122772, "learning_rate": 5.954937575618166e-05, "loss": 1.7934, "step": 136400 }, { "epoch": 1.4135849134761762, "grad_norm": 0.24124100804328918, "learning_rate": 5.935484169455903e-05, "loss": 1.7905, "step": 136500 }, { "epoch": 1.4146205068193822, "grad_norm": 0.2359512746334076, "learning_rate": 5.9160547498149244e-05, "loss": 1.8098, "step": 136600 }, { "epoch": 1.4156561001625882, "grad_norm": 0.23579341173171997, "learning_rate": 5.896649368109369e-05, "loss": 1.806, "step": 136700 }, { "epoch": 1.4166916935057943, "grad_norm": 0.2555437982082367, "learning_rate": 5.877268075689755e-05, "loss": 1.8241, "step": 136800 }, { "epoch": 1.417727286849, "grad_norm": 0.23964126408100128, "learning_rate": 5.8579109238428635e-05, "loss": 1.8127, "step": 136900 }, { "epoch": 1.418762880192206, "grad_norm": 0.2559371888637543, "learning_rate": 5.838577963791577e-05, "loss": 1.8269, "step": 137000 }, { "epoch": 1.419798473535412, "grad_norm": 0.23887130618095398, "learning_rate": 5.8194622136953044e-05, "loss": 1.8132, "step": 137100 }, { "epoch": 1.4208340668786181, "grad_norm": 0.2338080257177353, "learning_rate": 5.800177547454517e-05, "loss": 1.7929, "step": 137200 }, { "epoch": 1.4218696602218241, "grad_norm": 0.23470595479011536, "learning_rate": 5.780917225783406e-05, "loss": 1.8035, "step": 137300 }, { "epoch": 1.42290525356503, "grad_norm": 0.24527758359909058, "learning_rate": 5.761681299648623e-05, "loss": 1.8173, "step": 137400 }, { "epoch": 1.423940846908236, "grad_norm": 0.2501756250858307, "learning_rate": 5.7424698199522806e-05, "loss": 1.8014, "step": 137500 }, { "epoch": 1.424976440251442, "grad_norm": 0.24957391619682312, "learning_rate": 5.7232828375317875e-05, "loss": 1.8056, "step": 137600 }, { "epoch": 1.426012033594648, "grad_norm": 0.24483001232147217, "learning_rate": 5.704120403159738e-05, "loss": 1.8064, "step": 137700 }, { "epoch": 1.427047626937854, "grad_norm": 0.25191882252693176, "learning_rate": 5.684982567543774e-05, "loss": 1.7879, "step": 137800 }, { "epoch": 1.42808322028106, "grad_norm": 0.22922788560390472, "learning_rate": 5.665869381326421e-05, "loss": 1.8109, "step": 137900 }, { "epoch": 1.429118813624266, "grad_norm": 0.2467735856771469, "learning_rate": 5.646780895085004e-05, "loss": 1.82, "step": 138000 }, { "epoch": 1.430154406967472, "grad_norm": 0.23988230526447296, "learning_rate": 5.627717159331463e-05, "loss": 1.7823, "step": 138100 }, { "epoch": 1.4311900003106781, "grad_norm": 0.2461780458688736, "learning_rate": 5.60867822451226e-05, "loss": 1.7982, "step": 138200 }, { "epoch": 1.432225593653884, "grad_norm": 0.24287040531635284, "learning_rate": 5.589664141008224e-05, "loss": 1.808, "step": 138300 }, { "epoch": 1.43326118699709, "grad_norm": 0.25252607464790344, "learning_rate": 5.570674959134424e-05, "loss": 1.8123, "step": 138400 }, { "epoch": 1.434296780340296, "grad_norm": 0.24427302181720734, "learning_rate": 5.5517107291400255e-05, "loss": 1.7801, "step": 138500 }, { "epoch": 1.435332373683502, "grad_norm": 0.2358008623123169, "learning_rate": 5.5327715012081736e-05, "loss": 1.7793, "step": 138600 }, { "epoch": 1.436367967026708, "grad_norm": 0.24484987556934357, "learning_rate": 5.513857325455863e-05, "loss": 1.7887, "step": 138700 }, { "epoch": 1.437403560369914, "grad_norm": 0.2484014481306076, "learning_rate": 5.49496825193377e-05, "loss": 1.7802, "step": 138800 }, { "epoch": 1.4384391537131198, "grad_norm": 0.2412441223859787, "learning_rate": 5.476104330626171e-05, "loss": 1.7913, "step": 138900 }, { "epoch": 1.4394747470563258, "grad_norm": 0.24294838309288025, "learning_rate": 5.457265611450763e-05, "loss": 1.7857, "step": 139000 }, { "epoch": 1.4405103403995319, "grad_norm": 0.24046778678894043, "learning_rate": 5.4384521442585704e-05, "loss": 1.784, "step": 139100 }, { "epoch": 1.4415459337427379, "grad_norm": 0.23786774277687073, "learning_rate": 5.419663978833789e-05, "loss": 1.8045, "step": 139200 }, { "epoch": 1.442581527085944, "grad_norm": 0.25582677125930786, "learning_rate": 5.4009011648936634e-05, "loss": 1.8149, "step": 139300 }, { "epoch": 1.44361712042915, "grad_norm": 0.23847201466560364, "learning_rate": 5.382163752088344e-05, "loss": 1.8006, "step": 139400 }, { "epoch": 1.444652713772356, "grad_norm": 0.2423977106809616, "learning_rate": 5.363451790000781e-05, "loss": 1.7898, "step": 139500 }, { "epoch": 1.445688307115562, "grad_norm": 0.24309056997299194, "learning_rate": 5.344765328146549e-05, "loss": 1.7917, "step": 139600 }, { "epoch": 1.446723900458768, "grad_norm": 0.2543005347251892, "learning_rate": 5.326104415973786e-05, "loss": 1.7904, "step": 139700 }, { "epoch": 1.4477594938019738, "grad_norm": 0.2338806390762329, "learning_rate": 5.3074691028629873e-05, "loss": 1.804, "step": 139800 }, { "epoch": 1.4487950871451798, "grad_norm": 0.24888822436332703, "learning_rate": 5.288859438126909e-05, "loss": 1.8073, "step": 139900 }, { "epoch": 1.4498306804883858, "grad_norm": 0.23515139520168304, "learning_rate": 5.270275471010453e-05, "loss": 1.7871, "step": 140000 }, { "epoch": 1.4508662738315918, "grad_norm": 0.2794770300388336, "learning_rate": 5.2517172506905095e-05, "loss": 1.7973, "step": 140100 }, { "epoch": 1.4519018671747979, "grad_norm": 0.2449367195367813, "learning_rate": 5.233184826275847e-05, "loss": 1.7984, "step": 140200 }, { "epoch": 1.4529374605180037, "grad_norm": 0.24057435989379883, "learning_rate": 5.214678246806956e-05, "loss": 1.7887, "step": 140300 }, { "epoch": 1.4539730538612097, "grad_norm": 0.24254485964775085, "learning_rate": 5.196197561255955e-05, "loss": 1.7983, "step": 140400 }, { "epoch": 1.4550086472044157, "grad_norm": 0.23187781870365143, "learning_rate": 5.177742818526423e-05, "loss": 1.8008, "step": 140500 }, { "epoch": 1.4560442405476217, "grad_norm": 0.22786159813404083, "learning_rate": 5.159314067453305e-05, "loss": 1.7933, "step": 140600 }, { "epoch": 1.4570798338908277, "grad_norm": 0.33232197165489197, "learning_rate": 5.1410952548492195e-05, "loss": 1.7899, "step": 140700 }, { "epoch": 1.4581154272340338, "grad_norm": 0.2314329445362091, "learning_rate": 5.122718372186482e-05, "loss": 1.7751, "step": 140800 }, { "epoch": 1.4591510205772398, "grad_norm": 0.24640795588493347, "learning_rate": 5.104367626785847e-05, "loss": 1.7918, "step": 140900 }, { "epoch": 1.4601866139204458, "grad_norm": 0.24772091209888458, "learning_rate": 5.086043067207043e-05, "loss": 1.7938, "step": 141000 }, { "epoch": 1.4612222072636518, "grad_norm": 0.24709665775299072, "learning_rate": 5.0677447419405395e-05, "loss": 1.7861, "step": 141100 }, { "epoch": 1.4622578006068576, "grad_norm": 0.25762251019477844, "learning_rate": 5.049472699407348e-05, "loss": 1.7944, "step": 141200 }, { "epoch": 1.4632933939500636, "grad_norm": 0.24278350174427032, "learning_rate": 5.031226987958959e-05, "loss": 1.7778, "step": 141300 }, { "epoch": 1.4643289872932697, "grad_norm": 0.23248988389968872, "learning_rate": 5.013007655877161e-05, "loss": 1.7864, "step": 141400 }, { "epoch": 1.4653645806364757, "grad_norm": 0.2553103566169739, "learning_rate": 4.994814751373955e-05, "loss": 1.7774, "step": 141500 }, { "epoch": 1.4664001739796817, "grad_norm": 0.24579177796840668, "learning_rate": 4.976648322591405e-05, "loss": 1.8075, "step": 141600 }, { "epoch": 1.4674357673228877, "grad_norm": 0.24161840975284576, "learning_rate": 4.958508417601517e-05, "loss": 1.7776, "step": 141700 }, { "epoch": 1.4684713606660935, "grad_norm": 0.29268351197242737, "learning_rate": 4.940395084406097e-05, "loss": 1.7831, "step": 141800 }, { "epoch": 1.4695069540092995, "grad_norm": 0.24907058477401733, "learning_rate": 4.922308370936656e-05, "loss": 1.7935, "step": 141900 }, { "epoch": 1.4705425473525056, "grad_norm": 0.2521110773086548, "learning_rate": 4.904248325054247e-05, "loss": 1.7873, "step": 142000 }, { "epoch": 1.4715781406957116, "grad_norm": 0.28933998942375183, "learning_rate": 4.8862149945493644e-05, "loss": 1.8004, "step": 142100 }, { "epoch": 1.4726137340389176, "grad_norm": 0.2380319982767105, "learning_rate": 4.868208427141812e-05, "loss": 1.8069, "step": 142200 }, { "epoch": 1.4736493273821236, "grad_norm": 0.2450261116027832, "learning_rate": 4.850228670480558e-05, "loss": 1.7803, "step": 142300 }, { "epoch": 1.4746849207253296, "grad_norm": 0.2685863673686981, "learning_rate": 4.832275772143638e-05, "loss": 1.7918, "step": 142400 }, { "epoch": 1.4757205140685357, "grad_norm": 0.25436270236968994, "learning_rate": 4.814349779638011e-05, "loss": 1.7779, "step": 142500 }, { "epoch": 1.4767561074117417, "grad_norm": 0.24914506077766418, "learning_rate": 4.796450740399442e-05, "loss": 1.7839, "step": 142600 }, { "epoch": 1.4777917007549475, "grad_norm": 0.23973897099494934, "learning_rate": 4.778578701792358e-05, "loss": 1.7846, "step": 142700 }, { "epoch": 1.4788272940981535, "grad_norm": 0.23286214470863342, "learning_rate": 4.760733711109757e-05, "loss": 1.7884, "step": 142800 }, { "epoch": 1.4798628874413595, "grad_norm": 0.24671752750873566, "learning_rate": 4.7429158155730404e-05, "loss": 1.7942, "step": 142900 }, { "epoch": 1.4808984807845655, "grad_norm": 0.2548174560070038, "learning_rate": 4.725125062331931e-05, "loss": 1.802, "step": 143000 }, { "epoch": 1.4819340741277716, "grad_norm": 0.24280865490436554, "learning_rate": 4.70736149846432e-05, "loss": 1.7976, "step": 143100 }, { "epoch": 1.4829696674709774, "grad_norm": 0.2529030442237854, "learning_rate": 4.689625170976142e-05, "loss": 1.7688, "step": 143200 }, { "epoch": 1.4840052608141834, "grad_norm": 0.25887125730514526, "learning_rate": 4.671916126801275e-05, "loss": 1.7814, "step": 143300 }, { "epoch": 1.4850408541573894, "grad_norm": 0.2425132393836975, "learning_rate": 4.6542344128013746e-05, "loss": 1.7741, "step": 143400 }, { "epoch": 1.4860764475005954, "grad_norm": 0.25903186202049255, "learning_rate": 4.636580075765813e-05, "loss": 1.7863, "step": 143500 }, { "epoch": 1.4871120408438014, "grad_norm": 0.24791219830513, "learning_rate": 4.6189531624114814e-05, "loss": 1.7846, "step": 143600 }, { "epoch": 1.4881476341870075, "grad_norm": 0.24058634042739868, "learning_rate": 4.601353719382725e-05, "loss": 1.7746, "step": 143700 }, { "epoch": 1.4891832275302135, "grad_norm": 0.2484053373336792, "learning_rate": 4.5837817932511795e-05, "loss": 1.7841, "step": 143800 }, { "epoch": 1.4902188208734195, "grad_norm": 0.2500329613685608, "learning_rate": 4.566237430515679e-05, "loss": 1.7868, "step": 143900 }, { "epoch": 1.4912544142166255, "grad_norm": 0.23086591064929962, "learning_rate": 4.5487206776021185e-05, "loss": 1.7723, "step": 144000 }, { "epoch": 1.4922900075598315, "grad_norm": 0.23380248248577118, "learning_rate": 4.531406334780633e-05, "loss": 1.7835, "step": 144100 }, { "epoch": 1.4933256009030373, "grad_norm": 0.22701475024223328, "learning_rate": 4.513944663242872e-05, "loss": 1.7828, "step": 144200 }, { "epoch": 1.4943611942462434, "grad_norm": 0.2667444050312042, "learning_rate": 4.496510739904162e-05, "loss": 1.7876, "step": 144300 }, { "epoch": 1.4953967875894494, "grad_norm": 0.2382553368806839, "learning_rate": 4.479104610898155e-05, "loss": 1.7726, "step": 144400 }, { "epoch": 1.4964323809326554, "grad_norm": 0.24219807982444763, "learning_rate": 4.461726322284948e-05, "loss": 1.8045, "step": 144500 }, { "epoch": 1.4974679742758614, "grad_norm": 0.24782179296016693, "learning_rate": 4.4443759200509725e-05, "loss": 1.8004, "step": 144600 }, { "epoch": 1.4985035676190672, "grad_norm": 0.23586855828762054, "learning_rate": 4.4270534501088526e-05, "loss": 1.783, "step": 144700 }, { "epoch": 1.4995391609622732, "grad_norm": 0.23829668760299683, "learning_rate": 4.4097589582973176e-05, "loss": 1.7859, "step": 144800 }, { "epoch": 1.5005747543054793, "grad_norm": 0.23418094217777252, "learning_rate": 4.392492490381034e-05, "loss": 1.7707, "step": 144900 }, { "epoch": 1.5016103476486853, "grad_norm": 0.24407939612865448, "learning_rate": 4.3752540920505494e-05, "loss": 1.7964, "step": 145000 }, { "epoch": 1.5026459409918913, "grad_norm": 0.23413275182247162, "learning_rate": 4.358043808922097e-05, "loss": 1.7729, "step": 145100 }, { "epoch": 1.5036815343350973, "grad_norm": 0.24184474349021912, "learning_rate": 4.340861686537538e-05, "loss": 1.7842, "step": 145200 }, { "epoch": 1.5047171276783033, "grad_norm": 0.23974494636058807, "learning_rate": 4.323707770364195e-05, "loss": 1.7787, "step": 145300 }, { "epoch": 1.5057527210215094, "grad_norm": 0.23322482407093048, "learning_rate": 4.306582105794761e-05, "loss": 1.7845, "step": 145400 }, { "epoch": 1.5067883143647154, "grad_norm": 0.24521291255950928, "learning_rate": 4.2894847381471754e-05, "loss": 1.7833, "step": 145500 }, { "epoch": 1.5078239077079214, "grad_norm": 0.24620464444160461, "learning_rate": 4.2724157126644845e-05, "loss": 1.7892, "step": 145600 }, { "epoch": 1.5088595010511272, "grad_norm": 0.24380235373973846, "learning_rate": 4.2553750745147476e-05, "loss": 1.7943, "step": 145700 }, { "epoch": 1.5098950943943332, "grad_norm": 0.23417936265468597, "learning_rate": 4.238362868790895e-05, "loss": 1.7798, "step": 145800 }, { "epoch": 1.5109306877375392, "grad_norm": 0.2358783632516861, "learning_rate": 4.221379140510625e-05, "loss": 1.7996, "step": 145900 }, { "epoch": 1.5119662810807453, "grad_norm": 0.2538042366504669, "learning_rate": 4.204423934616283e-05, "loss": 1.7706, "step": 146000 }, { "epoch": 1.513001874423951, "grad_norm": 0.23899398744106293, "learning_rate": 4.187497295974736e-05, "loss": 1.7854, "step": 146100 }, { "epoch": 1.514037467767157, "grad_norm": 0.24248957633972168, "learning_rate": 4.170599269377245e-05, "loss": 1.7863, "step": 146200 }, { "epoch": 1.515073061110363, "grad_norm": 0.23839721083641052, "learning_rate": 4.153729899539372e-05, "loss": 1.7836, "step": 146300 }, { "epoch": 1.5161086544535691, "grad_norm": 0.2508465647697449, "learning_rate": 4.136889231100847e-05, "loss": 1.7838, "step": 146400 }, { "epoch": 1.5171442477967751, "grad_norm": 0.2585816979408264, "learning_rate": 4.120077308625439e-05, "loss": 1.7666, "step": 146500 }, { "epoch": 1.5181798411399812, "grad_norm": 0.2363991141319275, "learning_rate": 4.103294176600861e-05, "loss": 1.7938, "step": 146600 }, { "epoch": 1.5192154344831872, "grad_norm": 0.2528984546661377, "learning_rate": 4.0865398794386304e-05, "loss": 1.7831, "step": 146700 }, { "epoch": 1.5202510278263932, "grad_norm": 0.258318156003952, "learning_rate": 4.0698144614739714e-05, "loss": 1.7822, "step": 146800 }, { "epoch": 1.5212866211695992, "grad_norm": 0.2890702784061432, "learning_rate": 4.053117966965682e-05, "loss": 1.7982, "step": 146900 }, { "epoch": 1.5223222145128052, "grad_norm": 0.2451907992362976, "learning_rate": 4.036450440096034e-05, "loss": 1.7897, "step": 147000 }, { "epoch": 1.523357807856011, "grad_norm": 0.2518916726112366, "learning_rate": 4.019811924970623e-05, "loss": 1.7957, "step": 147100 }, { "epoch": 1.524393401199217, "grad_norm": 0.2348620742559433, "learning_rate": 4.003202465618291e-05, "loss": 1.7717, "step": 147200 }, { "epoch": 1.525428994542423, "grad_norm": 0.24071717262268066, "learning_rate": 3.986622105990992e-05, "loss": 1.7938, "step": 147300 }, { "epoch": 1.526464587885629, "grad_norm": 0.25559526681900024, "learning_rate": 3.970070889963675e-05, "loss": 1.7805, "step": 147400 }, { "epoch": 1.527500181228835, "grad_norm": 0.24412083625793457, "learning_rate": 3.953548861334161e-05, "loss": 1.7954, "step": 147500 }, { "epoch": 1.528535774572041, "grad_norm": 0.23359213769435883, "learning_rate": 3.93705606382304e-05, "loss": 1.7666, "step": 147600 }, { "epoch": 1.529571367915247, "grad_norm": 0.2603493630886078, "learning_rate": 3.920592541073555e-05, "loss": 1.7809, "step": 147700 }, { "epoch": 1.530606961258453, "grad_norm": 0.23860156536102295, "learning_rate": 3.904158336651478e-05, "loss": 1.7825, "step": 147800 }, { "epoch": 1.531642554601659, "grad_norm": 0.24123388528823853, "learning_rate": 3.8877534940450064e-05, "loss": 1.7707, "step": 147900 }, { "epoch": 1.532678147944865, "grad_norm": 0.25643640756607056, "learning_rate": 3.871378056664622e-05, "loss": 1.779, "step": 148000 }, { "epoch": 1.533713741288071, "grad_norm": 0.23661455512046814, "learning_rate": 3.8550320678430176e-05, "loss": 1.7645, "step": 148100 }, { "epoch": 1.534749334631277, "grad_norm": 0.24772033095359802, "learning_rate": 3.838715570834934e-05, "loss": 1.7673, "step": 148200 }, { "epoch": 1.535784927974483, "grad_norm": 0.246613547205925, "learning_rate": 3.8224286088171015e-05, "loss": 1.7783, "step": 148300 }, { "epoch": 1.536820521317689, "grad_norm": 0.2527683973312378, "learning_rate": 3.806171224888066e-05, "loss": 1.7722, "step": 148400 }, { "epoch": 1.5378561146608951, "grad_norm": 0.2445138841867447, "learning_rate": 3.789943462068124e-05, "loss": 1.7748, "step": 148500 }, { "epoch": 1.538891708004101, "grad_norm": 0.23080067336559296, "learning_rate": 3.773745363299172e-05, "loss": 1.7715, "step": 148600 }, { "epoch": 1.539927301347307, "grad_norm": 0.24305923283100128, "learning_rate": 3.757576971444624e-05, "loss": 1.7654, "step": 148700 }, { "epoch": 1.540962894690513, "grad_norm": 0.23672178387641907, "learning_rate": 3.741438329289279e-05, "loss": 1.781, "step": 148800 }, { "epoch": 1.541998488033719, "grad_norm": 0.2409089356660843, "learning_rate": 3.725329479539204e-05, "loss": 1.7757, "step": 148900 }, { "epoch": 1.5430340813769248, "grad_norm": 0.25203487277030945, "learning_rate": 3.7092504648216454e-05, "loss": 1.7772, "step": 149000 }, { "epoch": 1.5440696747201308, "grad_norm": 0.2366684079170227, "learning_rate": 3.693201327684883e-05, "loss": 1.7666, "step": 149100 }, { "epoch": 1.5451052680633368, "grad_norm": 0.24480077624320984, "learning_rate": 3.677182110598144e-05, "loss": 1.7821, "step": 149200 }, { "epoch": 1.5461408614065428, "grad_norm": 0.24038438498973846, "learning_rate": 3.661352600044876e-05, "loss": 1.7755, "step": 149300 }, { "epoch": 1.5471764547497489, "grad_norm": 0.25305137038230896, "learning_rate": 3.6453930498923747e-05, "loss": 1.782, "step": 149400 }, { "epoch": 1.5482120480929549, "grad_norm": 0.2378205507993698, "learning_rate": 3.6294635463001543e-05, "loss": 1.7696, "step": 149500 }, { "epoch": 1.549247641436161, "grad_norm": 0.24967072904109955, "learning_rate": 3.613564131420879e-05, "loss": 1.771, "step": 149600 }, { "epoch": 1.550283234779367, "grad_norm": 0.23712900280952454, "learning_rate": 3.597694847327568e-05, "loss": 1.7637, "step": 149700 }, { "epoch": 1.551318828122573, "grad_norm": 0.2321092188358307, "learning_rate": 3.581855736013523e-05, "loss": 1.7802, "step": 149800 }, { "epoch": 1.552354421465779, "grad_norm": 0.2432144582271576, "learning_rate": 3.56604683939221e-05, "loss": 1.7837, "step": 149900 }, { "epoch": 1.5533900148089848, "grad_norm": 0.2497885823249817, "learning_rate": 3.550268199297114e-05, "loss": 1.7729, "step": 150000 }, { "epoch": 1.5544256081521908, "grad_norm": 0.24619407951831818, "learning_rate": 3.534519857481682e-05, "loss": 1.7663, "step": 150100 }, { "epoch": 1.5554612014953968, "grad_norm": 0.2394361048936844, "learning_rate": 3.5188018556191735e-05, "loss": 1.7525, "step": 150200 }, { "epoch": 1.5564967948386028, "grad_norm": 0.2429407685995102, "learning_rate": 3.503114235302566e-05, "loss": 1.7646, "step": 150300 }, { "epoch": 1.5575323881818086, "grad_norm": 0.24376606941223145, "learning_rate": 3.487457038044434e-05, "loss": 1.7844, "step": 150400 }, { "epoch": 1.5585679815250146, "grad_norm": 0.24951650202274323, "learning_rate": 3.4718303052768585e-05, "loss": 1.7673, "step": 150500 }, { "epoch": 1.5596035748682207, "grad_norm": 0.2464444786310196, "learning_rate": 3.4562340783512926e-05, "loss": 1.7814, "step": 150600 }, { "epoch": 1.5606391682114267, "grad_norm": 0.24928238987922668, "learning_rate": 3.4406683985384724e-05, "loss": 1.7613, "step": 150700 }, { "epoch": 1.5616747615546327, "grad_norm": 0.24191848933696747, "learning_rate": 3.4251333070283066e-05, "loss": 1.7752, "step": 150800 }, { "epoch": 1.5627103548978387, "grad_norm": 0.25779271125793457, "learning_rate": 3.409628844929743e-05, "loss": 1.7747, "step": 150900 }, { "epoch": 1.5637459482410447, "grad_norm": 0.23987020552158356, "learning_rate": 3.394155053270695e-05, "loss": 1.7552, "step": 151000 }, { "epoch": 1.5647815415842508, "grad_norm": 0.24696967005729675, "learning_rate": 3.378711972997902e-05, "loss": 1.7799, "step": 151100 }, { "epoch": 1.5658171349274568, "grad_norm": 0.2517462968826294, "learning_rate": 3.3632996449768536e-05, "loss": 1.7721, "step": 151200 }, { "epoch": 1.5668527282706628, "grad_norm": 0.24277019500732422, "learning_rate": 3.34791810999164e-05, "loss": 1.7684, "step": 151300 }, { "epoch": 1.5678883216138688, "grad_norm": 0.23690778017044067, "learning_rate": 3.332720762996904e-05, "loss": 1.7658, "step": 151400 }, { "epoch": 1.5689239149570746, "grad_norm": 0.251127153635025, "learning_rate": 3.3174006271652297e-05, "loss": 1.7693, "step": 151500 }, { "epoch": 1.5699595083002806, "grad_norm": 0.25691816210746765, "learning_rate": 3.3021114058273725e-05, "loss": 1.7678, "step": 151600 }, { "epoch": 1.5709951016434867, "grad_norm": 0.2550021708011627, "learning_rate": 3.2868531394416715e-05, "loss": 1.7718, "step": 151700 }, { "epoch": 1.5720306949866927, "grad_norm": 0.25155770778656006, "learning_rate": 3.271777987535873e-05, "loss": 1.774, "step": 151800 }, { "epoch": 1.5730662883298985, "grad_norm": 0.25734907388687134, "learning_rate": 3.256581441546317e-05, "loss": 1.7855, "step": 151900 }, { "epoch": 1.5741018816731045, "grad_norm": 0.2586398422718048, "learning_rate": 3.2414159709903036e-05, "loss": 1.7595, "step": 152000 }, { "epoch": 1.5751374750163105, "grad_norm": 0.24072247743606567, "learning_rate": 3.2262816159987017e-05, "loss": 1.7731, "step": 152100 }, { "epoch": 1.5761730683595165, "grad_norm": 0.2409285604953766, "learning_rate": 3.211178416620039e-05, "loss": 1.7737, "step": 152200 }, { "epoch": 1.5772086617027226, "grad_norm": 0.2313113808631897, "learning_rate": 3.1961064128204064e-05, "loss": 1.7743, "step": 152300 }, { "epoch": 1.5782442550459286, "grad_norm": 0.24274508655071259, "learning_rate": 3.1810656444833315e-05, "loss": 1.76, "step": 152400 }, { "epoch": 1.5792798483891346, "grad_norm": 0.24286475777626038, "learning_rate": 3.166056151409698e-05, "loss": 1.7934, "step": 152500 }, { "epoch": 1.5803154417323406, "grad_norm": 0.24502409994602203, "learning_rate": 3.151077973317627e-05, "loss": 1.7597, "step": 152600 }, { "epoch": 1.5813510350755466, "grad_norm": 0.24534182250499725, "learning_rate": 3.1361311498423755e-05, "loss": 1.7792, "step": 152700 }, { "epoch": 1.5823866284187527, "grad_norm": 0.24993400275707245, "learning_rate": 3.12121572053622e-05, "loss": 1.7666, "step": 152800 }, { "epoch": 1.5834222217619585, "grad_norm": 0.27619731426239014, "learning_rate": 3.106331724868375e-05, "loss": 1.7623, "step": 152900 }, { "epoch": 1.5844578151051645, "grad_norm": 0.25150030851364136, "learning_rate": 3.091479202224862e-05, "loss": 1.7683, "step": 153000 }, { "epoch": 1.5854934084483705, "grad_norm": 0.24227070808410645, "learning_rate": 3.076658191908428e-05, "loss": 1.7469, "step": 153100 }, { "epoch": 1.5865290017915765, "grad_norm": 0.25041335821151733, "learning_rate": 3.0618687331384364e-05, "loss": 1.765, "step": 153200 }, { "epoch": 1.5875645951347823, "grad_norm": 0.2585553824901581, "learning_rate": 3.0471108650507435e-05, "loss": 1.7562, "step": 153300 }, { "epoch": 1.5886001884779883, "grad_norm": 0.25099217891693115, "learning_rate": 3.032384626697627e-05, "loss": 1.778, "step": 153400 }, { "epoch": 1.5896357818211944, "grad_norm": 0.24548770487308502, "learning_rate": 3.0176900570476438e-05, "loss": 1.7775, "step": 153500 }, { "epoch": 1.5906713751644004, "grad_norm": 0.242392435669899, "learning_rate": 3.0030271949855822e-05, "loss": 1.7687, "step": 153600 }, { "epoch": 1.5917069685076064, "grad_norm": 0.25239792466163635, "learning_rate": 2.9883960793122937e-05, "loss": 1.7682, "step": 153700 }, { "epoch": 1.5927425618508124, "grad_norm": 0.24055072665214539, "learning_rate": 2.973796748744644e-05, "loss": 1.7767, "step": 153800 }, { "epoch": 1.5937781551940184, "grad_norm": 0.23068878054618835, "learning_rate": 2.9592292419153725e-05, "loss": 1.7626, "step": 153900 }, { "epoch": 1.5948137485372245, "grad_norm": 0.253451406955719, "learning_rate": 2.944693597373018e-05, "loss": 1.7775, "step": 154000 }, { "epoch": 1.5958493418804305, "grad_norm": 0.25171977281570435, "learning_rate": 2.930189853581809e-05, "loss": 1.7882, "step": 154100 }, { "epoch": 1.5968849352236365, "grad_norm": 0.24790294468402863, "learning_rate": 2.915718048921542e-05, "loss": 1.7624, "step": 154200 }, { "epoch": 1.5979205285668425, "grad_norm": 0.24984526634216309, "learning_rate": 2.9012782216875134e-05, "loss": 1.7799, "step": 154300 }, { "epoch": 1.5989561219100483, "grad_norm": 0.254639208316803, "learning_rate": 2.886870410090385e-05, "loss": 1.769, "step": 154400 }, { "epoch": 1.5999917152532543, "grad_norm": 0.2468196153640747, "learning_rate": 2.872494652256112e-05, "loss": 1.7595, "step": 154500 }, { "epoch": 1.6010273085964604, "grad_norm": 0.2362591028213501, "learning_rate": 2.8581509862258234e-05, "loss": 1.7781, "step": 154600 }, { "epoch": 1.6020629019396664, "grad_norm": 0.24382278323173523, "learning_rate": 2.843839449955732e-05, "loss": 1.7606, "step": 154700 }, { "epoch": 1.6030984952828722, "grad_norm": 0.26309606432914734, "learning_rate": 2.829560081317016e-05, "loss": 1.7598, "step": 154800 }, { "epoch": 1.6041340886260782, "grad_norm": 0.24677832424640656, "learning_rate": 2.8153129180957445e-05, "loss": 1.7575, "step": 154900 }, { "epoch": 1.6051696819692842, "grad_norm": 0.2583439350128174, "learning_rate": 2.8010979979927584e-05, "loss": 1.7764, "step": 155000 }, { "epoch": 1.6062052753124902, "grad_norm": 0.26433682441711426, "learning_rate": 2.786915358623583e-05, "loss": 1.7702, "step": 155100 }, { "epoch": 1.6072408686556963, "grad_norm": 0.2388899028301239, "learning_rate": 2.7727650375183146e-05, "loss": 1.7705, "step": 155200 }, { "epoch": 1.6082764619989023, "grad_norm": 0.3385526239871979, "learning_rate": 2.7586470721215285e-05, "loss": 1.7473, "step": 155300 }, { "epoch": 1.6093120553421083, "grad_norm": 0.2706485092639923, "learning_rate": 2.7445614997921877e-05, "loss": 1.7514, "step": 155400 }, { "epoch": 1.6103476486853143, "grad_norm": 0.2527751326560974, "learning_rate": 2.7305083578035315e-05, "loss": 1.7483, "step": 155500 }, { "epoch": 1.6113832420285203, "grad_norm": 0.24500173330307007, "learning_rate": 2.716627729251436e-05, "loss": 1.7602, "step": 155600 }, { "epoch": 1.6124188353717264, "grad_norm": 0.24991624057292938, "learning_rate": 2.7026392341908404e-05, "loss": 1.7596, "step": 155700 }, { "epoch": 1.6134544287149322, "grad_norm": 0.24678780138492584, "learning_rate": 2.6886832804056307e-05, "loss": 1.7681, "step": 155800 }, { "epoch": 1.6144900220581382, "grad_norm": 0.24445167183876038, "learning_rate": 2.6747599048260443e-05, "loss": 1.7676, "step": 155900 }, { "epoch": 1.6155256154013442, "grad_norm": 0.24923637509346008, "learning_rate": 2.660869144296119e-05, "loss": 1.7552, "step": 156000 }, { "epoch": 1.6165612087445502, "grad_norm": 0.2577793598175049, "learning_rate": 2.647011035573588e-05, "loss": 1.7681, "step": 156100 }, { "epoch": 1.617596802087756, "grad_norm": 0.24265843629837036, "learning_rate": 2.6331856153297775e-05, "loss": 1.7605, "step": 156200 }, { "epoch": 1.618632395430962, "grad_norm": 0.22805728018283844, "learning_rate": 2.6193929201495128e-05, "loss": 1.756, "step": 156300 }, { "epoch": 1.619667988774168, "grad_norm": 0.24224853515625, "learning_rate": 2.605632986531015e-05, "loss": 1.7579, "step": 156400 }, { "epoch": 1.620703582117374, "grad_norm": 0.2548943758010864, "learning_rate": 2.5919058508858342e-05, "loss": 1.7657, "step": 156500 }, { "epoch": 1.62173917546058, "grad_norm": 0.25534626841545105, "learning_rate": 2.5782115495387056e-05, "loss": 1.763, "step": 156600 }, { "epoch": 1.6227747688037861, "grad_norm": 0.2515535354614258, "learning_rate": 2.5645501187274936e-05, "loss": 1.7664, "step": 156700 }, { "epoch": 1.6238103621469921, "grad_norm": 0.25652220845222473, "learning_rate": 2.550921594603072e-05, "loss": 1.7503, "step": 156800 }, { "epoch": 1.6248459554901982, "grad_norm": 0.25507649779319763, "learning_rate": 2.537326013229243e-05, "loss": 1.7474, "step": 156900 }, { "epoch": 1.6258815488334042, "grad_norm": 0.22381198406219482, "learning_rate": 2.523763410582631e-05, "loss": 1.7676, "step": 157000 }, { "epoch": 1.6269171421766102, "grad_norm": 0.29624587297439575, "learning_rate": 2.5102338225526015e-05, "loss": 1.7611, "step": 157100 }, { "epoch": 1.6279527355198162, "grad_norm": 0.25737154483795166, "learning_rate": 2.4967372849411386e-05, "loss": 1.7562, "step": 157200 }, { "epoch": 1.628988328863022, "grad_norm": 0.23857314884662628, "learning_rate": 2.4832738334627895e-05, "loss": 1.7574, "step": 157300 }, { "epoch": 1.630023922206228, "grad_norm": 0.25536367297172546, "learning_rate": 2.4698435037445314e-05, "loss": 1.78, "step": 157400 }, { "epoch": 1.631059515549434, "grad_norm": 0.23744933307170868, "learning_rate": 2.4564463313257043e-05, "loss": 1.7749, "step": 157500 }, { "epoch": 1.63209510889264, "grad_norm": 0.2469826191663742, "learning_rate": 2.4430823516579102e-05, "loss": 1.7542, "step": 157600 }, { "epoch": 1.6331307022358459, "grad_norm": 0.26329708099365234, "learning_rate": 2.4297516001049023e-05, "loss": 1.7533, "step": 157700 }, { "epoch": 1.634166295579052, "grad_norm": 0.2397632896900177, "learning_rate": 2.4164541119425196e-05, "loss": 1.7602, "step": 157800 }, { "epoch": 1.635201888922258, "grad_norm": 0.25183120369911194, "learning_rate": 2.403322399311128e-05, "loss": 1.755, "step": 157900 }, { "epoch": 1.636237482265464, "grad_norm": 0.24743783473968506, "learning_rate": 2.3900912098950846e-05, "loss": 1.7663, "step": 158000 }, { "epoch": 1.63727307560867, "grad_norm": 0.2639857530593872, "learning_rate": 2.376893388818986e-05, "loss": 1.7619, "step": 158100 }, { "epoch": 1.638308668951876, "grad_norm": 0.24265305697917938, "learning_rate": 2.3637289710069124e-05, "loss": 1.7705, "step": 158200 }, { "epoch": 1.639344262295082, "grad_norm": 0.24554692208766937, "learning_rate": 2.3505979912945376e-05, "loss": 1.7727, "step": 158300 }, { "epoch": 1.640379855638288, "grad_norm": 0.22941891849040985, "learning_rate": 2.3375004844290624e-05, "loss": 1.7445, "step": 158400 }, { "epoch": 1.641415448981494, "grad_norm": 0.2553333640098572, "learning_rate": 2.3244364850691122e-05, "loss": 1.7661, "step": 158500 }, { "epoch": 1.6424510423247, "grad_norm": 0.25999927520751953, "learning_rate": 2.3114060277846356e-05, "loss": 1.7631, "step": 158600 }, { "epoch": 1.643486635667906, "grad_norm": 0.255823016166687, "learning_rate": 2.2984091470568345e-05, "loss": 1.7557, "step": 158700 }, { "epoch": 1.644522229011112, "grad_norm": 0.252580851316452, "learning_rate": 2.2854458772780438e-05, "loss": 1.762, "step": 158800 }, { "epoch": 1.645557822354318, "grad_norm": 0.23650795221328735, "learning_rate": 2.2725162527516788e-05, "loss": 1.7598, "step": 158900 }, { "epoch": 1.646593415697524, "grad_norm": 0.26243850588798523, "learning_rate": 2.2596203076921044e-05, "loss": 1.7718, "step": 159000 }, { "epoch": 1.6476290090407297, "grad_norm": 0.2543872594833374, "learning_rate": 2.2467580762245746e-05, "loss": 1.7567, "step": 159100 }, { "epoch": 1.6486646023839358, "grad_norm": 0.2603439390659332, "learning_rate": 2.2339295923851186e-05, "loss": 1.7639, "step": 159200 }, { "epoch": 1.6497001957271418, "grad_norm": 0.24593275785446167, "learning_rate": 2.2211348901204733e-05, "loss": 1.7718, "step": 159300 }, { "epoch": 1.6507357890703478, "grad_norm": 0.25501835346221924, "learning_rate": 2.208374003287981e-05, "loss": 1.7642, "step": 159400 }, { "epoch": 1.6517713824135538, "grad_norm": 0.25209614634513855, "learning_rate": 2.1956469656554998e-05, "loss": 1.7515, "step": 159500 }, { "epoch": 1.6528069757567598, "grad_norm": 0.2521548271179199, "learning_rate": 2.182953810901315e-05, "loss": 1.7459, "step": 159600 }, { "epoch": 1.6538425690999659, "grad_norm": 0.23090225458145142, "learning_rate": 2.1702945726140487e-05, "loss": 1.7482, "step": 159700 }, { "epoch": 1.6548781624431719, "grad_norm": 0.24230700731277466, "learning_rate": 2.1576692842925774e-05, "loss": 1.7724, "step": 159800 }, { "epoch": 1.655913755786378, "grad_norm": 0.23758786916732788, "learning_rate": 2.1450779793459405e-05, "loss": 1.7452, "step": 159900 }, { "epoch": 1.656949349129584, "grad_norm": 0.237613245844841, "learning_rate": 2.132520691093252e-05, "loss": 1.7473, "step": 160000 }, { "epoch": 1.65798494247279, "grad_norm": 0.24359528720378876, "learning_rate": 2.119997452763601e-05, "loss": 1.7681, "step": 160100 }, { "epoch": 1.6590205358159957, "grad_norm": 0.23198646306991577, "learning_rate": 2.1075082974959813e-05, "loss": 1.7589, "step": 160200 }, { "epoch": 1.6600561291592018, "grad_norm": 0.28807970881462097, "learning_rate": 2.095053258339196e-05, "loss": 1.765, "step": 160300 }, { "epoch": 1.6610917225024078, "grad_norm": 0.24801871180534363, "learning_rate": 2.0826323682517733e-05, "loss": 1.7487, "step": 160400 }, { "epoch": 1.6621273158456138, "grad_norm": 0.26022928953170776, "learning_rate": 2.0702456601018625e-05, "loss": 1.7493, "step": 160500 }, { "epoch": 1.6631629091888196, "grad_norm": 0.25012028217315674, "learning_rate": 2.057893166667181e-05, "loss": 1.7443, "step": 160600 }, { "epoch": 1.6641985025320256, "grad_norm": 0.24709323048591614, "learning_rate": 2.0455749206348853e-05, "loss": 1.7431, "step": 160700 }, { "epoch": 1.6652340958752316, "grad_norm": 0.24734225869178772, "learning_rate": 2.033290954601524e-05, "loss": 1.7651, "step": 160800 }, { "epoch": 1.6662696892184377, "grad_norm": 0.24593950808048248, "learning_rate": 2.0210413010729286e-05, "loss": 1.7441, "step": 160900 }, { "epoch": 1.6673052825616437, "grad_norm": 0.25336503982543945, "learning_rate": 2.0088259924641247e-05, "loss": 1.7569, "step": 161000 }, { "epoch": 1.6683408759048497, "grad_norm": 0.24552571773529053, "learning_rate": 1.9966450610992708e-05, "loss": 1.7555, "step": 161100 }, { "epoch": 1.6693764692480557, "grad_norm": 0.24618370831012726, "learning_rate": 1.9844985392115325e-05, "loss": 1.7385, "step": 161200 }, { "epoch": 1.6704120625912617, "grad_norm": 0.24182170629501343, "learning_rate": 1.9723864589430527e-05, "loss": 1.738, "step": 161300 }, { "epoch": 1.6714476559344678, "grad_norm": 0.2792712450027466, "learning_rate": 1.9603088523448048e-05, "loss": 1.7593, "step": 161400 }, { "epoch": 1.6724832492776738, "grad_norm": 0.2384859174489975, "learning_rate": 1.9482657513765582e-05, "loss": 1.7578, "step": 161500 }, { "epoch": 1.6735188426208798, "grad_norm": 0.24319148063659668, "learning_rate": 1.9362571879067612e-05, "loss": 1.751, "step": 161600 }, { "epoch": 1.6745544359640856, "grad_norm": 0.24496915936470032, "learning_rate": 1.9242831937124747e-05, "loss": 1.7428, "step": 161700 }, { "epoch": 1.6755900293072916, "grad_norm": 0.24361230432987213, "learning_rate": 1.9123438004792846e-05, "loss": 1.7575, "step": 161800 }, { "epoch": 1.6766256226504976, "grad_norm": 0.25244462490081787, "learning_rate": 1.900439039801208e-05, "loss": 1.7506, "step": 161900 }, { "epoch": 1.6776612159937034, "grad_norm": 0.25072553753852844, "learning_rate": 1.888568943180626e-05, "loss": 1.7426, "step": 162000 }, { "epoch": 1.6786968093369095, "grad_norm": 0.23424604535102844, "learning_rate": 1.8767335420281843e-05, "loss": 1.758, "step": 162100 }, { "epoch": 1.6797324026801155, "grad_norm": 0.2311488538980484, "learning_rate": 1.864932867662722e-05, "loss": 1.7615, "step": 162200 }, { "epoch": 1.6807679960233215, "grad_norm": 0.45073550939559937, "learning_rate": 1.853166951311183e-05, "loss": 1.7346, "step": 162300 }, { "epoch": 1.6818035893665275, "grad_norm": 0.2481175661087036, "learning_rate": 1.8414358241085393e-05, "loss": 1.7478, "step": 162400 }, { "epoch": 1.6828391827097335, "grad_norm": 0.247162327170372, "learning_rate": 1.8297395170976902e-05, "loss": 1.7593, "step": 162500 }, { "epoch": 1.6838747760529396, "grad_norm": 0.23251807689666748, "learning_rate": 1.8180780612294065e-05, "loss": 1.7508, "step": 162600 }, { "epoch": 1.6849103693961456, "grad_norm": 0.2509516775608063, "learning_rate": 1.806451487362234e-05, "loss": 1.7547, "step": 162700 }, { "epoch": 1.6859459627393516, "grad_norm": 0.2523420453071594, "learning_rate": 1.7948598262624114e-05, "loss": 1.751, "step": 162800 }, { "epoch": 1.6869815560825576, "grad_norm": 0.25625839829444885, "learning_rate": 1.783303108603793e-05, "loss": 1.7554, "step": 162900 }, { "epoch": 1.6880171494257636, "grad_norm": 0.23896996676921844, "learning_rate": 1.7717813649677575e-05, "loss": 1.7215, "step": 163000 }, { "epoch": 1.6890527427689694, "grad_norm": 0.2533254027366638, "learning_rate": 1.7602946258431483e-05, "loss": 1.7569, "step": 163100 }, { "epoch": 1.6900883361121755, "grad_norm": 0.25137391686439514, "learning_rate": 1.7488429216261707e-05, "loss": 1.7596, "step": 163200 }, { "epoch": 1.6911239294553815, "grad_norm": 0.25722604990005493, "learning_rate": 1.7374262826203296e-05, "loss": 1.7321, "step": 163300 }, { "epoch": 1.6921595227985875, "grad_norm": 0.24843713641166687, "learning_rate": 1.7260447390363286e-05, "loss": 1.7366, "step": 163400 }, { "epoch": 1.6931951161417933, "grad_norm": Infinity, "learning_rate": 1.7148116112023692e-05, "loss": 1.7574, "step": 163500 }, { "epoch": 1.6942307094849993, "grad_norm": 0.250113844871521, "learning_rate": 1.7034999970186536e-05, "loss": 1.7458, "step": 163600 }, { "epoch": 1.6952663028282053, "grad_norm": 0.2713249623775482, "learning_rate": 1.6922235680325236e-05, "loss": 1.7515, "step": 163700 }, { "epoch": 1.6963018961714114, "grad_norm": 0.2715998888015747, "learning_rate": 1.6809823540836516e-05, "loss": 1.7613, "step": 163800 }, { "epoch": 1.6973374895146174, "grad_norm": 0.24439376592636108, "learning_rate": 1.6697763849185458e-05, "loss": 1.7654, "step": 163900 }, { "epoch": 1.6983730828578234, "grad_norm": 0.3625621497631073, "learning_rate": 1.6586056901904348e-05, "loss": 1.7465, "step": 164000 }, { "epoch": 1.6994086762010294, "grad_norm": 0.2390156388282776, "learning_rate": 1.6474702994592133e-05, "loss": 1.7439, "step": 164100 }, { "epoch": 1.7004442695442354, "grad_norm": 0.23618732392787933, "learning_rate": 1.6363702421913534e-05, "loss": 1.7348, "step": 164200 }, { "epoch": 1.7014798628874415, "grad_norm": 0.24577060341835022, "learning_rate": 1.6253055477598225e-05, "loss": 1.7435, "step": 164300 }, { "epoch": 1.7025154562306475, "grad_norm": 0.2895578444004059, "learning_rate": 1.6142762454440166e-05, "loss": 1.7353, "step": 164400 }, { "epoch": 1.7035510495738535, "grad_norm": 0.24581684172153473, "learning_rate": 1.6032823644296726e-05, "loss": 1.7483, "step": 164500 }, { "epoch": 1.7045866429170593, "grad_norm": 0.23628471791744232, "learning_rate": 1.592323933808801e-05, "loss": 1.7391, "step": 164600 }, { "epoch": 1.7056222362602653, "grad_norm": 0.24244676530361176, "learning_rate": 1.5814009825795977e-05, "loss": 1.7301, "step": 164700 }, { "epoch": 1.7066578296034713, "grad_norm": 0.2440883070230484, "learning_rate": 1.5705135396463792e-05, "loss": 1.7306, "step": 164800 }, { "epoch": 1.7076934229466771, "grad_norm": 0.24032579362392426, "learning_rate": 1.5596616338194902e-05, "loss": 1.7438, "step": 164900 }, { "epoch": 1.7087290162898832, "grad_norm": 0.24512411653995514, "learning_rate": 1.548845293815248e-05, "loss": 1.7493, "step": 165000 }, { "epoch": 1.7097646096330892, "grad_norm": 0.2512228786945343, "learning_rate": 1.5380645482558522e-05, "loss": 1.7487, "step": 165100 }, { "epoch": 1.7108002029762952, "grad_norm": 0.243143230676651, "learning_rate": 1.5273194256693065e-05, "loss": 1.7612, "step": 165200 }, { "epoch": 1.7118357963195012, "grad_norm": 0.24956101179122925, "learning_rate": 1.5166099544893596e-05, "loss": 1.7396, "step": 165300 }, { "epoch": 1.7128713896627072, "grad_norm": 0.26991623640060425, "learning_rate": 1.5059361630554067e-05, "loss": 1.7438, "step": 165400 }, { "epoch": 1.7139069830059133, "grad_norm": 0.23952555656433105, "learning_rate": 1.4952980796124364e-05, "loss": 1.7495, "step": 165500 }, { "epoch": 1.7149425763491193, "grad_norm": 0.24728748202323914, "learning_rate": 1.484695732310947e-05, "loss": 1.7632, "step": 165600 }, { "epoch": 1.7159781696923253, "grad_norm": 0.2485940158367157, "learning_rate": 1.4741291492068696e-05, "loss": 1.7454, "step": 165700 }, { "epoch": 1.7170137630355313, "grad_norm": 0.24040621519088745, "learning_rate": 1.4635983582614925e-05, "loss": 1.7503, "step": 165800 }, { "epoch": 1.7180493563787373, "grad_norm": 0.27126428484916687, "learning_rate": 1.4532081596502066e-05, "loss": 1.7463, "step": 165900 }, { "epoch": 1.7190849497219431, "grad_norm": 0.24948278069496155, "learning_rate": 1.4427486779120467e-05, "loss": 1.7451, "step": 166000 }, { "epoch": 1.7201205430651492, "grad_norm": 0.24100621044635773, "learning_rate": 1.4323250713715977e-05, "loss": 1.73, "step": 166100 }, { "epoch": 1.7211561364083552, "grad_norm": 0.23740434646606445, "learning_rate": 1.421937367611799e-05, "loss": 1.7391, "step": 166200 }, { "epoch": 1.7221917297515612, "grad_norm": 0.24519295990467072, "learning_rate": 1.4115855941205889e-05, "loss": 1.7371, "step": 166300 }, { "epoch": 1.723227323094767, "grad_norm": 0.24833451211452484, "learning_rate": 1.4012697782908322e-05, "loss": 1.7383, "step": 166400 }, { "epoch": 1.724262916437973, "grad_norm": 0.2670252323150635, "learning_rate": 1.3909899474202413e-05, "loss": 1.7622, "step": 166500 }, { "epoch": 1.725298509781179, "grad_norm": 0.23879726231098175, "learning_rate": 1.3807461287113064e-05, "loss": 1.7492, "step": 166600 }, { "epoch": 1.726334103124385, "grad_norm": 0.2573123276233673, "learning_rate": 1.3705383492712141e-05, "loss": 1.7412, "step": 166700 }, { "epoch": 1.727369696467591, "grad_norm": 0.25297072529792786, "learning_rate": 1.3603666361117954e-05, "loss": 1.7424, "step": 166800 }, { "epoch": 1.728405289810797, "grad_norm": 0.23402030766010284, "learning_rate": 1.3502310161494313e-05, "loss": 1.7378, "step": 166900 }, { "epoch": 1.7294408831540031, "grad_norm": 0.2456459105014801, "learning_rate": 1.340131516205002e-05, "loss": 1.7492, "step": 167000 }, { "epoch": 1.7304764764972091, "grad_norm": 0.2443302571773529, "learning_rate": 1.3300681630038045e-05, "loss": 1.7278, "step": 167100 }, { "epoch": 1.7315120698404152, "grad_norm": 0.2518988251686096, "learning_rate": 1.3200409831754844e-05, "loss": 1.7486, "step": 167200 }, { "epoch": 1.7325476631836212, "grad_norm": 0.24611739814281464, "learning_rate": 1.3100500032539629e-05, "loss": 1.7303, "step": 167300 }, { "epoch": 1.7335832565268272, "grad_norm": 0.2465163916349411, "learning_rate": 1.3000952496773653e-05, "loss": 1.7415, "step": 167400 }, { "epoch": 1.734618849870033, "grad_norm": 0.24747945368289948, "learning_rate": 1.2901767487879716e-05, "loss": 1.753, "step": 167500 }, { "epoch": 1.735654443213239, "grad_norm": 0.24191400408744812, "learning_rate": 1.2802945268321068e-05, "loss": 1.7421, "step": 167600 }, { "epoch": 1.736690036556445, "grad_norm": 0.24076144397258759, "learning_rate": 1.2704486099601146e-05, "loss": 1.749, "step": 167700 }, { "epoch": 1.7377256298996508, "grad_norm": 0.25473907589912415, "learning_rate": 1.2606390242262537e-05, "loss": 1.734, "step": 167800 }, { "epoch": 1.7387612232428569, "grad_norm": 0.2578435242176056, "learning_rate": 1.2508657955886513e-05, "loss": 1.7498, "step": 167900 }, { "epoch": 1.7397968165860629, "grad_norm": 0.23923861980438232, "learning_rate": 1.2411289499092252e-05, "loss": 1.7295, "step": 168000 }, { "epoch": 1.740832409929269, "grad_norm": 0.23466624319553375, "learning_rate": 1.2314285129536166e-05, "loss": 1.7515, "step": 168100 }, { "epoch": 1.741868003272475, "grad_norm": 0.2501499056816101, "learning_rate": 1.221764510391119e-05, "loss": 1.7391, "step": 168200 }, { "epoch": 1.742903596615681, "grad_norm": 0.26512864232063293, "learning_rate": 1.2121369677946191e-05, "loss": 1.7455, "step": 168300 }, { "epoch": 1.743939189958887, "grad_norm": 0.25231823325157166, "learning_rate": 1.202545910640515e-05, "loss": 1.7458, "step": 168400 }, { "epoch": 1.744974783302093, "grad_norm": 0.2606634497642517, "learning_rate": 1.192991364308663e-05, "loss": 1.7323, "step": 168500 }, { "epoch": 1.746010376645299, "grad_norm": 0.24372422695159912, "learning_rate": 1.1834733540823072e-05, "loss": 1.7523, "step": 168600 }, { "epoch": 1.747045969988505, "grad_norm": 0.25090333819389343, "learning_rate": 1.173991905148e-05, "loss": 1.7354, "step": 168700 }, { "epoch": 1.748081563331711, "grad_norm": 0.25763875246047974, "learning_rate": 1.1645470425955522e-05, "loss": 1.741, "step": 168800 }, { "epoch": 1.7491171566749169, "grad_norm": 0.23834986984729767, "learning_rate": 1.155138791417961e-05, "loss": 1.7453, "step": 168900 }, { "epoch": 1.7501527500181229, "grad_norm": 0.24636706709861755, "learning_rate": 1.1458607112293577e-05, "loss": 1.7387, "step": 169000 }, { "epoch": 1.751188343361329, "grad_norm": 0.260471373796463, "learning_rate": 1.136525390659731e-05, "loss": 1.7381, "step": 169100 }, { "epoch": 1.752223936704535, "grad_norm": 0.25572189688682556, "learning_rate": 1.1273195602920703e-05, "loss": 1.7218, "step": 169200 }, { "epoch": 1.7532595300477407, "grad_norm": 0.26938962936401367, "learning_rate": 1.1180572681571915e-05, "loss": 1.7348, "step": 169300 }, { "epoch": 1.7542951233909467, "grad_norm": 0.27723193168640137, "learning_rate": 1.1088317104184075e-05, "loss": 1.7288, "step": 169400 }, { "epoch": 1.7553307167341528, "grad_norm": 0.24941597878932953, "learning_rate": 1.0996429114883886e-05, "loss": 1.7351, "step": 169500 }, { "epoch": 1.7563663100773588, "grad_norm": 0.24211610853672028, "learning_rate": 1.090490895682533e-05, "loss": 1.7294, "step": 169600 }, { "epoch": 1.7574019034205648, "grad_norm": 0.25503847002983093, "learning_rate": 1.081375687218905e-05, "loss": 1.7392, "step": 169700 }, { "epoch": 1.7584374967637708, "grad_norm": 0.2460925132036209, "learning_rate": 1.072297310218168e-05, "loss": 1.7286, "step": 169800 }, { "epoch": 1.7594730901069768, "grad_norm": 0.25148114562034607, "learning_rate": 1.0632557887035226e-05, "loss": 1.7425, "step": 169900 }, { "epoch": 1.7605086834501829, "grad_norm": 0.2684800326824188, "learning_rate": 1.054251146600643e-05, "loss": 1.7277, "step": 170000 }, { "epoch": 1.7615442767933889, "grad_norm": 0.25314539670944214, "learning_rate": 1.0452834077376144e-05, "loss": 1.7201, "step": 170100 }, { "epoch": 1.762579870136595, "grad_norm": 0.27334508299827576, "learning_rate": 1.0363525958448637e-05, "loss": 1.734, "step": 170200 }, { "epoch": 1.763615463479801, "grad_norm": 0.2437390834093094, "learning_rate": 1.0275474901851594e-05, "loss": 1.7353, "step": 170300 }, { "epoch": 1.7646510568230067, "grad_norm": 0.2533189356327057, "learning_rate": 1.0186902331757746e-05, "loss": 1.7357, "step": 170400 }, { "epoch": 1.7656866501662127, "grad_norm": 0.23939764499664307, "learning_rate": 1.009869973507529e-05, "loss": 1.7356, "step": 170500 }, { "epoch": 1.7667222435094188, "grad_norm": 0.24917815625667572, "learning_rate": 1.0010867345205903e-05, "loss": 1.7251, "step": 170600 }, { "epoch": 1.7677578368526246, "grad_norm": 0.24000729620456696, "learning_rate": 9.92340539457162e-06, "loss": 1.7361, "step": 170700 }, { "epoch": 1.7687934301958306, "grad_norm": 0.2496235966682434, "learning_rate": 9.836314114614274e-06, "loss": 1.7306, "step": 170800 }, { "epoch": 1.7698290235390366, "grad_norm": 0.2598992884159088, "learning_rate": 9.74959373579483e-06, "loss": 1.7494, "step": 170900 }, { "epoch": 1.7708646168822426, "grad_norm": 0.23713208734989166, "learning_rate": 9.663244487592759e-06, "loss": 1.7323, "step": 171000 }, { "epoch": 1.7719002102254486, "grad_norm": 0.2455018311738968, "learning_rate": 9.577266598505379e-06, "loss": 1.72, "step": 171100 }, { "epoch": 1.7729358035686547, "grad_norm": 0.2538827657699585, "learning_rate": 9.491660296047432e-06, "loss": 1.7368, "step": 171200 }, { "epoch": 1.7739713969118607, "grad_norm": 0.25636816024780273, "learning_rate": 9.406425806750246e-06, "loss": 1.715, "step": 171300 }, { "epoch": 1.7750069902550667, "grad_norm": 0.2576570510864258, "learning_rate": 9.321563356161399e-06, "loss": 1.7396, "step": 171400 }, { "epoch": 1.7760425835982727, "grad_norm": 0.2527580261230469, "learning_rate": 9.237073168843846e-06, "loss": 1.7625, "step": 171500 }, { "epoch": 1.7770781769414787, "grad_norm": 0.26507019996643066, "learning_rate": 9.152955468375556e-06, "loss": 1.7438, "step": 171600 }, { "epoch": 1.7781137702846848, "grad_norm": 0.23915159702301025, "learning_rate": 9.069210477348743e-06, "loss": 1.7399, "step": 171700 }, { "epoch": 1.7791493636278906, "grad_norm": 0.24180366098880768, "learning_rate": 8.985838417369424e-06, "loss": 1.7227, "step": 171800 }, { "epoch": 1.7801849569710966, "grad_norm": 0.25869354605674744, "learning_rate": 8.902839509056753e-06, "loss": 1.7381, "step": 171900 }, { "epoch": 1.7812205503143026, "grad_norm": 0.2603427469730377, "learning_rate": 8.820213972042394e-06, "loss": 1.7373, "step": 172000 }, { "epoch": 1.7822561436575086, "grad_norm": 0.23622602224349976, "learning_rate": 8.737962024970085e-06, "loss": 1.7265, "step": 172100 }, { "epoch": 1.7832917370007144, "grad_norm": 0.2887721359729767, "learning_rate": 8.656083885494886e-06, "loss": 1.7533, "step": 172200 }, { "epoch": 1.7843273303439204, "grad_norm": 0.2555456757545471, "learning_rate": 8.57457977028273e-06, "loss": 1.7379, "step": 172300 }, { "epoch": 1.7853629236871265, "grad_norm": 0.26440414786338806, "learning_rate": 8.493449895009813e-06, "loss": 1.7297, "step": 172400 }, { "epoch": 1.7863985170303325, "grad_norm": 0.2500864863395691, "learning_rate": 8.412694474362035e-06, "loss": 1.7451, "step": 172500 }, { "epoch": 1.7874341103735385, "grad_norm": 0.26459088921546936, "learning_rate": 8.332313722034329e-06, "loss": 1.7306, "step": 172600 }, { "epoch": 1.7884697037167445, "grad_norm": 0.24095329642295837, "learning_rate": 8.252307850730261e-06, "loss": 1.7433, "step": 172700 }, { "epoch": 1.7895052970599505, "grad_norm": 0.24373869597911835, "learning_rate": 8.172677072161399e-06, "loss": 1.7364, "step": 172800 }, { "epoch": 1.7905408904031566, "grad_norm": 0.24337276816368103, "learning_rate": 8.09342159704665e-06, "loss": 1.7331, "step": 172900 }, { "epoch": 1.7915764837463626, "grad_norm": 0.24975727498531342, "learning_rate": 8.014541635111887e-06, "loss": 1.7121, "step": 173000 }, { "epoch": 1.7926120770895686, "grad_norm": 0.2415221929550171, "learning_rate": 7.936037395089222e-06, "loss": 1.7317, "step": 173100 }, { "epoch": 1.7936476704327746, "grad_norm": 0.2467115819454193, "learning_rate": 7.857909084716601e-06, "loss": 1.7365, "step": 173200 }, { "epoch": 1.7946832637759804, "grad_norm": 0.2650807499885559, "learning_rate": 7.780156910737128e-06, "loss": 1.7366, "step": 173300 }, { "epoch": 1.7957188571191864, "grad_norm": 0.26315975189208984, "learning_rate": 7.702781078898657e-06, "loss": 1.7373, "step": 173400 }, { "epoch": 1.7967544504623925, "grad_norm": 0.25150394439697266, "learning_rate": 7.62578179395305e-06, "loss": 1.7352, "step": 173500 }, { "epoch": 1.7977900438055983, "grad_norm": 0.25394633412361145, "learning_rate": 7.549159259655857e-06, "loss": 1.728, "step": 173600 }, { "epoch": 1.7988256371488043, "grad_norm": 0.25245386362075806, "learning_rate": 7.472913678765569e-06, "loss": 1.7653, "step": 173700 }, { "epoch": 1.7998612304920103, "grad_norm": 0.24983125925064087, "learning_rate": 7.397045253043293e-06, "loss": 1.7307, "step": 173800 }, { "epoch": 1.8008968238352163, "grad_norm": 0.25751256942749023, "learning_rate": 7.321554183252038e-06, "loss": 1.736, "step": 173900 }, { "epoch": 1.8019324171784223, "grad_norm": 0.27547821402549744, "learning_rate": 7.246440669156239e-06, "loss": 1.7239, "step": 174000 }, { "epoch": 1.8029680105216284, "grad_norm": 0.25231435894966125, "learning_rate": 7.171704909521286e-06, "loss": 1.7374, "step": 174100 }, { "epoch": 1.8040036038648344, "grad_norm": 0.25071924924850464, "learning_rate": 7.097347102112966e-06, "loss": 1.7282, "step": 174200 }, { "epoch": 1.8050391972080404, "grad_norm": 0.25460386276245117, "learning_rate": 7.023367443696898e-06, "loss": 1.7252, "step": 174300 }, { "epoch": 1.8060747905512464, "grad_norm": 0.26784348487854004, "learning_rate": 6.9497661300380365e-06, "loss": 1.7404, "step": 174400 }, { "epoch": 1.8071103838944524, "grad_norm": 0.26381611824035645, "learning_rate": 6.8765433559001885e-06, "loss": 1.7316, "step": 174500 }, { "epoch": 1.8081459772376585, "grad_norm": 0.2510668635368347, "learning_rate": 6.803699315045425e-06, "loss": 1.7297, "step": 174600 }, { "epoch": 1.8091815705808643, "grad_norm": 0.25210174918174744, "learning_rate": 6.731234200233654e-06, "loss": 1.7193, "step": 174700 }, { "epoch": 1.8102171639240703, "grad_norm": 0.24519681930541992, "learning_rate": 6.659148203222087e-06, "loss": 1.7271, "step": 174800 }, { "epoch": 1.8112527572672763, "grad_norm": 0.24689030647277832, "learning_rate": 6.587441514764669e-06, "loss": 1.7348, "step": 174900 }, { "epoch": 1.8122883506104823, "grad_norm": 0.24037405848503113, "learning_rate": 6.5161143246116485e-06, "loss": 1.7188, "step": 175000 }, { "epoch": 1.8133239439536881, "grad_norm": 0.25457215309143066, "learning_rate": 6.445166821508962e-06, "loss": 1.7213, "step": 175100 }, { "epoch": 1.8143595372968941, "grad_norm": 0.24461062252521515, "learning_rate": 6.374599193197982e-06, "loss": 1.7443, "step": 175200 }, { "epoch": 1.8153951306401002, "grad_norm": 0.24130922555923462, "learning_rate": 6.304411626414701e-06, "loss": 1.7157, "step": 175300 }, { "epoch": 1.8164307239833062, "grad_norm": 0.2508300542831421, "learning_rate": 6.234604306889501e-06, "loss": 1.7263, "step": 175400 }, { "epoch": 1.8174663173265122, "grad_norm": 0.24455414712429047, "learning_rate": 6.165177419346467e-06, "loss": 1.7167, "step": 175500 }, { "epoch": 1.8185019106697182, "grad_norm": 0.24553796648979187, "learning_rate": 6.096819725572987e-06, "loss": 1.7358, "step": 175600 }, { "epoch": 1.8195375040129242, "grad_norm": 0.248016819357872, "learning_rate": 6.028150443254193e-06, "loss": 1.7334, "step": 175700 }, { "epoch": 1.8205730973561303, "grad_norm": 0.257741242647171, "learning_rate": 5.959862139235805e-06, "loss": 1.735, "step": 175800 }, { "epoch": 1.8216086906993363, "grad_norm": 0.2504454553127289, "learning_rate": 5.8919549942222925e-06, "loss": 1.7355, "step": 175900 }, { "epoch": 1.8226442840425423, "grad_norm": 0.24617452919483185, "learning_rate": 5.824429187909563e-06, "loss": 1.7123, "step": 176000 }, { "epoch": 1.8236798773857483, "grad_norm": 0.2585538625717163, "learning_rate": 5.757284898984366e-06, "loss": 1.7259, "step": 176100 }, { "epoch": 1.8247154707289541, "grad_norm": 0.2528209984302521, "learning_rate": 5.690522305123956e-06, "loss": 1.7154, "step": 176200 }, { "epoch": 1.8257510640721601, "grad_norm": 0.2657474875450134, "learning_rate": 5.6241415829955e-06, "loss": 1.7304, "step": 176300 }, { "epoch": 1.8267866574153662, "grad_norm": 0.26939284801483154, "learning_rate": 5.558142908255636e-06, "loss": 1.7401, "step": 176400 }, { "epoch": 1.827822250758572, "grad_norm": 0.24804717302322388, "learning_rate": 5.492526455550078e-06, "loss": 1.7235, "step": 176500 }, { "epoch": 1.828857844101778, "grad_norm": 0.270504355430603, "learning_rate": 5.42729239851305e-06, "loss": 1.7396, "step": 176600 }, { "epoch": 1.829893437444984, "grad_norm": 0.24506253004074097, "learning_rate": 5.362440909766969e-06, "loss": 1.7515, "step": 176700 }, { "epoch": 1.83092903078819, "grad_norm": 0.2618677020072937, "learning_rate": 5.2979721609217766e-06, "loss": 1.7332, "step": 176800 }, { "epoch": 1.831964624131396, "grad_norm": 0.27091890573501587, "learning_rate": 5.2338863225747094e-06, "loss": 1.7301, "step": 176900 }, { "epoch": 1.833000217474602, "grad_norm": 0.24357031285762787, "learning_rate": 5.170183564309682e-06, "loss": 1.7223, "step": 177000 }, { "epoch": 1.834035810817808, "grad_norm": 0.24201589822769165, "learning_rate": 5.106864054696952e-06, "loss": 1.7086, "step": 177100 }, { "epoch": 1.835071404161014, "grad_norm": 0.2429751455783844, "learning_rate": 5.043927961292604e-06, "loss": 1.7255, "step": 177200 }, { "epoch": 1.8361069975042201, "grad_norm": 0.25993308424949646, "learning_rate": 4.98137545063812e-06, "loss": 1.7217, "step": 177300 }, { "epoch": 1.8371425908474261, "grad_norm": 0.233745738863945, "learning_rate": 4.919206688259941e-06, "loss": 1.7082, "step": 177400 }, { "epoch": 1.8381781841906322, "grad_norm": 0.25459179282188416, "learning_rate": 4.857421838669023e-06, "loss": 1.7474, "step": 177500 }, { "epoch": 1.839213777533838, "grad_norm": 0.23842667043209076, "learning_rate": 4.796021065360501e-06, "loss": 1.7195, "step": 177600 }, { "epoch": 1.840249370877044, "grad_norm": 0.25140854716300964, "learning_rate": 4.735004530813019e-06, "loss": 1.7142, "step": 177700 }, { "epoch": 1.84128496422025, "grad_norm": 0.26586881279945374, "learning_rate": 4.6743723964885905e-06, "loss": 1.738, "step": 177800 }, { "epoch": 1.842320557563456, "grad_norm": 0.2546684145927429, "learning_rate": 4.614124822831922e-06, "loss": 1.7202, "step": 177900 }, { "epoch": 1.8433561509066618, "grad_norm": 0.24856257438659668, "learning_rate": 4.554261969270167e-06, "loss": 1.726, "step": 178000 }, { "epoch": 1.8443917442498678, "grad_norm": 0.24501903355121613, "learning_rate": 4.494783994212381e-06, "loss": 1.724, "step": 178100 }, { "epoch": 1.8454273375930739, "grad_norm": 0.2519833445549011, "learning_rate": 4.435691055049212e-06, "loss": 1.7082, "step": 178200 }, { "epoch": 1.84646293093628, "grad_norm": 0.27225735783576965, "learning_rate": 4.37698330815236e-06, "loss": 1.722, "step": 178300 }, { "epoch": 1.847498524279486, "grad_norm": 0.23850208520889282, "learning_rate": 4.319242224888747e-06, "loss": 1.7129, "step": 178400 }, { "epoch": 1.848534117622692, "grad_norm": 0.2479841113090515, "learning_rate": 4.261301471782025e-06, "loss": 1.7263, "step": 178500 }, { "epoch": 1.849569710965898, "grad_norm": 0.2530396282672882, "learning_rate": 4.203746372411348e-06, "loss": 1.7459, "step": 178600 }, { "epoch": 1.850605304309104, "grad_norm": 0.2529071271419525, "learning_rate": 4.146577079079022e-06, "loss": 1.7277, "step": 178700 }, { "epoch": 1.85164089765231, "grad_norm": 0.24860121309757233, "learning_rate": 4.0897937430664e-06, "loss": 1.734, "step": 178800 }, { "epoch": 1.852676490995516, "grad_norm": 0.25037863850593567, "learning_rate": 4.033396514633569e-06, "loss": 1.72, "step": 178900 }, { "epoch": 1.853712084338722, "grad_norm": 0.24811726808547974, "learning_rate": 3.9773855430188355e-06, "loss": 1.7363, "step": 179000 }, { "epoch": 1.8547476776819278, "grad_norm": 0.24256454408168793, "learning_rate": 3.92176097643847e-06, "loss": 1.7251, "step": 179100 }, { "epoch": 1.8557832710251339, "grad_norm": 0.24250365793704987, "learning_rate": 3.866522962086166e-06, "loss": 1.7414, "step": 179200 }, { "epoch": 1.8568188643683399, "grad_norm": 0.24205045402050018, "learning_rate": 3.81167164613278e-06, "loss": 1.7329, "step": 179300 }, { "epoch": 1.8578544577115457, "grad_norm": 0.24427984654903412, "learning_rate": 3.75720717372584e-06, "loss": 1.7292, "step": 179400 }, { "epoch": 1.8588900510547517, "grad_norm": 0.2530161440372467, "learning_rate": 3.7031296889892424e-06, "loss": 1.7345, "step": 179500 }, { "epoch": 1.8599256443979577, "grad_norm": 0.24390803277492523, "learning_rate": 3.649439335022819e-06, "loss": 1.7528, "step": 179600 }, { "epoch": 1.8609612377411637, "grad_norm": 0.265918105840683, "learning_rate": 3.596136253901971e-06, "loss": 1.7135, "step": 179700 }, { "epoch": 1.8619968310843698, "grad_norm": 0.25493451952934265, "learning_rate": 3.5432205866773533e-06, "loss": 1.7282, "step": 179800 }, { "epoch": 1.8630324244275758, "grad_norm": 0.2533169686794281, "learning_rate": 3.4906924733743403e-06, "loss": 1.7344, "step": 179900 }, { "epoch": 1.8640680177707818, "grad_norm": 0.25054511427879333, "learning_rate": 3.4385520529928434e-06, "loss": 1.7029, "step": 180000 }, { "epoch": 1.8651036111139878, "grad_norm": 0.2525639533996582, "learning_rate": 3.3867994635068617e-06, "loss": 1.7245, "step": 180100 }, { "epoch": 1.8661392044571938, "grad_norm": 0.2689187228679657, "learning_rate": 3.3354348418640985e-06, "loss": 1.7143, "step": 180200 }, { "epoch": 1.8671747978003999, "grad_norm": 0.26085108518600464, "learning_rate": 3.284458323985578e-06, "loss": 1.7192, "step": 180300 }, { "epoch": 1.8682103911436059, "grad_norm": 0.258137047290802, "learning_rate": 3.233870044765363e-06, "loss": 1.7256, "step": 180400 }, { "epoch": 1.8692459844868117, "grad_norm": 0.2497711330652237, "learning_rate": 3.1836701380701877e-06, "loss": 1.716, "step": 180500 }, { "epoch": 1.8702815778300177, "grad_norm": 0.25558340549468994, "learning_rate": 3.1338587367390087e-06, "loss": 1.7287, "step": 180600 }, { "epoch": 1.8713171711732237, "grad_norm": 0.24171043932437897, "learning_rate": 3.0844359725827884e-06, "loss": 1.7275, "step": 180700 }, { "epoch": 1.8723527645164297, "grad_norm": 0.24762311577796936, "learning_rate": 3.035401976384011e-06, "loss": 1.7305, "step": 180800 }, { "epoch": 1.8733883578596355, "grad_norm": 0.253679484128952, "learning_rate": 2.9867568778964345e-06, "loss": 1.7123, "step": 180900 }, { "epoch": 1.8744239512028416, "grad_norm": 0.24100255966186523, "learning_rate": 2.93850080584474e-06, "loss": 1.7379, "step": 181000 }, { "epoch": 1.8754595445460476, "grad_norm": 0.24306176602840424, "learning_rate": 2.890633887924132e-06, "loss": 1.7233, "step": 181100 }, { "epoch": 1.8764951378892536, "grad_norm": 0.25511589646339417, "learning_rate": 2.8431562508000383e-06, "loss": 1.7209, "step": 181200 }, { "epoch": 1.8775307312324596, "grad_norm": 0.2663876414299011, "learning_rate": 2.7960680201077945e-06, "loss": 1.7207, "step": 181300 }, { "epoch": 1.8785663245756656, "grad_norm": 0.27625206112861633, "learning_rate": 2.749369320452227e-06, "loss": 1.7259, "step": 181400 }, { "epoch": 1.8796019179188717, "grad_norm": 0.2517870366573334, "learning_rate": 2.7030602754075037e-06, "loss": 1.726, "step": 181500 }, { "epoch": 1.8806375112620777, "grad_norm": 0.2484750598669052, "learning_rate": 2.6571410075166e-06, "loss": 1.7127, "step": 181600 }, { "epoch": 1.8816731046052837, "grad_norm": 0.24114204943180084, "learning_rate": 2.6116116382910335e-06, "loss": 1.7251, "step": 181700 }, { "epoch": 1.8827086979484897, "grad_norm": 0.25338923931121826, "learning_rate": 2.566472288210647e-06, "loss": 1.7255, "step": 181800 }, { "epoch": 1.8837442912916957, "grad_norm": 0.25289592146873474, "learning_rate": 2.5217230767231923e-06, "loss": 1.7302, "step": 181900 }, { "epoch": 1.8847798846349015, "grad_norm": 0.262483686208725, "learning_rate": 2.47736412224403e-06, "loss": 1.7011, "step": 182000 }, { "epoch": 1.8858154779781076, "grad_norm": 0.2437547892332077, "learning_rate": 2.433395542155814e-06, "loss": 1.713, "step": 182100 }, { "epoch": 1.8868510713213136, "grad_norm": 0.2552061378955841, "learning_rate": 2.3898174528082238e-06, "loss": 1.708, "step": 182200 }, { "epoch": 1.8878866646645194, "grad_norm": 0.25636234879493713, "learning_rate": 2.3466299695175493e-06, "loss": 1.7191, "step": 182300 }, { "epoch": 1.8889222580077254, "grad_norm": 0.24827295541763306, "learning_rate": 2.303833206566541e-06, "loss": 1.7262, "step": 182400 }, { "epoch": 1.8899578513509314, "grad_norm": 0.2628612518310547, "learning_rate": 2.2614272772039755e-06, "loss": 1.718, "step": 182500 }, { "epoch": 1.8909934446941374, "grad_norm": 0.25748297572135925, "learning_rate": 2.219412293644457e-06, "loss": 1.7273, "step": 182600 }, { "epoch": 1.8920290380373435, "grad_norm": 0.25174692273139954, "learning_rate": 2.177788367067984e-06, "loss": 1.7275, "step": 182700 }, { "epoch": 1.8930646313805495, "grad_norm": 0.2603454887866974, "learning_rate": 2.1365556076198e-06, "loss": 1.7384, "step": 182800 }, { "epoch": 1.8941002247237555, "grad_norm": 0.2296743392944336, "learning_rate": 2.0957141244100585e-06, "loss": 1.7122, "step": 182900 }, { "epoch": 1.8951358180669615, "grad_norm": 0.2433532178401947, "learning_rate": 2.055264025513459e-06, "loss": 1.7128, "step": 183000 }, { "epoch": 1.8961714114101675, "grad_norm": 0.23824208974838257, "learning_rate": 2.0152054179690623e-06, "loss": 1.7117, "step": 183100 }, { "epoch": 1.8972070047533736, "grad_norm": 0.25119879841804504, "learning_rate": 1.9759331391294318e-06, "loss": 1.7393, "step": 183200 }, { "epoch": 1.8982425980965796, "grad_norm": 0.2623230814933777, "learning_rate": 1.9366539137230464e-06, "loss": 1.7365, "step": 183300 }, { "epoch": 1.8992781914397854, "grad_norm": 0.25543296337127686, "learning_rate": 1.89776649353493e-06, "loss": 1.7162, "step": 183400 }, { "epoch": 1.9003137847829914, "grad_norm": 0.254328191280365, "learning_rate": 1.8592709814690022e-06, "loss": 1.7169, "step": 183500 }, { "epoch": 1.9013493781261974, "grad_norm": 0.25180038809776306, "learning_rate": 1.8211674793920438e-06, "loss": 1.7177, "step": 183600 }, { "epoch": 1.9023849714694034, "grad_norm": 0.24226798117160797, "learning_rate": 1.7834560881335668e-06, "loss": 1.7289, "step": 183700 }, { "epoch": 1.9034205648126092, "grad_norm": 0.2629595994949341, "learning_rate": 1.7465081575242733e-06, "loss": 1.7006, "step": 183800 }, { "epoch": 1.9044561581558153, "grad_norm": 0.2631705701351166, "learning_rate": 1.7095773626614518e-06, "loss": 1.7289, "step": 183900 }, { "epoch": 1.9054917514990213, "grad_norm": 0.37448614835739136, "learning_rate": 1.673038973906854e-06, "loss": 1.719, "step": 184000 }, { "epoch": 1.9065273448422273, "grad_norm": 0.24209128320217133, "learning_rate": 1.636893087948371e-06, "loss": 1.7172, "step": 184100 }, { "epoch": 1.9075629381854333, "grad_norm": 0.24312196671962738, "learning_rate": 1.601139800435225e-06, "loss": 1.721, "step": 184200 }, { "epoch": 1.9085985315286393, "grad_norm": 0.29315465688705444, "learning_rate": 1.5657792059778019e-06, "loss": 1.7195, "step": 184300 }, { "epoch": 1.9096341248718454, "grad_norm": 0.2611297369003296, "learning_rate": 1.5308113981472858e-06, "loss": 1.7295, "step": 184400 }, { "epoch": 1.9106697182150514, "grad_norm": 0.24749819934368134, "learning_rate": 1.4962364694754758e-06, "loss": 1.7426, "step": 184500 }, { "epoch": 1.9117053115582574, "grad_norm": 0.23790644109249115, "learning_rate": 1.462054511454569e-06, "loss": 1.7358, "step": 184600 }, { "epoch": 1.9127409049014634, "grad_norm": 0.2628982365131378, "learning_rate": 1.4282656145368443e-06, "loss": 1.7064, "step": 184700 }, { "epoch": 1.9137764982446694, "grad_norm": 0.2515333294868469, "learning_rate": 1.3948698681344793e-06, "loss": 1.7105, "step": 184800 }, { "epoch": 1.9148120915878752, "grad_norm": 0.2586974799633026, "learning_rate": 1.3618673606193175e-06, "loss": 1.7318, "step": 184900 }, { "epoch": 1.9158476849310813, "grad_norm": 0.29544249176979065, "learning_rate": 1.3292581793225677e-06, "loss": 1.721, "step": 185000 }, { "epoch": 1.9168832782742873, "grad_norm": 0.25116056203842163, "learning_rate": 1.2970424105346543e-06, "loss": 1.7313, "step": 185100 }, { "epoch": 1.917918871617493, "grad_norm": 0.26117655634880066, "learning_rate": 1.2652201395049187e-06, "loss": 1.7036, "step": 185200 }, { "epoch": 1.918954464960699, "grad_norm": 0.2596001923084259, "learning_rate": 1.2337914504415014e-06, "loss": 1.7097, "step": 185300 }, { "epoch": 1.9199900583039051, "grad_norm": 0.24250894784927368, "learning_rate": 1.2027564265109591e-06, "loss": 1.7213, "step": 185400 }, { "epoch": 1.9210256516471111, "grad_norm": 0.24968720972537994, "learning_rate": 1.172115149838182e-06, "loss": 1.7146, "step": 185500 }, { "epoch": 1.9220612449903172, "grad_norm": 0.2559947073459625, "learning_rate": 1.1418677015061105e-06, "loss": 1.7124, "step": 185600 }, { "epoch": 1.9230968383335232, "grad_norm": 0.24344287812709808, "learning_rate": 1.1120141615555523e-06, "loss": 1.7214, "step": 185700 }, { "epoch": 1.9241324316767292, "grad_norm": 0.2743147015571594, "learning_rate": 1.0825546089849313e-06, "loss": 1.7169, "step": 185800 }, { "epoch": 1.9251680250199352, "grad_norm": 0.25657331943511963, "learning_rate": 1.0534891217501228e-06, "loss": 1.715, "step": 185900 }, { "epoch": 1.9262036183631412, "grad_norm": 0.26103025674819946, "learning_rate": 1.0248177767642196e-06, "loss": 1.708, "step": 186000 }, { "epoch": 1.9272392117063473, "grad_norm": 0.2607838809490204, "learning_rate": 9.96540649897315e-07, "loss": 1.7112, "step": 186100 }, { "epoch": 1.9282748050495533, "grad_norm": 0.2540147304534912, "learning_rate": 9.686578159763536e-07, "loss": 1.7229, "step": 186200 }, { "epoch": 1.929310398392759, "grad_norm": 0.2423514723777771, "learning_rate": 9.411693487848982e-07, "loss": 1.6914, "step": 186300 }, { "epoch": 1.930345991735965, "grad_norm": 0.25359782576560974, "learning_rate": 9.140753210629126e-07, "loss": 1.7034, "step": 186400 }, { "epoch": 1.9313815850791711, "grad_norm": 0.2813738286495209, "learning_rate": 8.873758045066126e-07, "loss": 1.7254, "step": 186500 }, { "epoch": 1.9324171784223771, "grad_norm": 0.24106921255588531, "learning_rate": 8.610708697682655e-07, "loss": 1.7203, "step": 186600 }, { "epoch": 1.933452771765583, "grad_norm": 0.2799108326435089, "learning_rate": 8.351605864559741e-07, "loss": 1.73, "step": 186700 }, { "epoch": 1.934488365108789, "grad_norm": 0.23353151977062225, "learning_rate": 8.096450231335594e-07, "loss": 1.7078, "step": 186800 }, { "epoch": 1.935523958451995, "grad_norm": 0.24738764762878418, "learning_rate": 7.845242473202784e-07, "loss": 1.7194, "step": 186900 }, { "epoch": 1.936559551795201, "grad_norm": 0.24557389318943024, "learning_rate": 7.597983254907403e-07, "loss": 1.7238, "step": 187000 }, { "epoch": 1.937595145138407, "grad_norm": 0.25373733043670654, "learning_rate": 7.354673230746566e-07, "loss": 1.7145, "step": 187100 }, { "epoch": 1.938630738481613, "grad_norm": 0.24786674976348877, "learning_rate": 7.11531304456725e-07, "loss": 1.7102, "step": 187200 }, { "epoch": 1.939666331824819, "grad_norm": 0.24621756374835968, "learning_rate": 6.879903329764791e-07, "loss": 1.7347, "step": 187300 }, { "epoch": 1.940701925168025, "grad_norm": 0.24734346568584442, "learning_rate": 6.648444709279888e-07, "loss": 1.7406, "step": 187400 }, { "epoch": 1.941737518511231, "grad_norm": 0.23889899253845215, "learning_rate": 6.420937795598601e-07, "loss": 1.7212, "step": 187500 }, { "epoch": 1.9427731118544371, "grad_norm": 0.25224629044532776, "learning_rate": 6.197383190749527e-07, "loss": 1.7326, "step": 187600 }, { "epoch": 1.9438087051976431, "grad_norm": 0.2478015273809433, "learning_rate": 5.977781486302957e-07, "loss": 1.7116, "step": 187700 }, { "epoch": 1.944844298540849, "grad_norm": 0.2490726262331009, "learning_rate": 5.762133263369051e-07, "loss": 1.7181, "step": 187800 }, { "epoch": 1.945879891884055, "grad_norm": 0.2604832947254181, "learning_rate": 5.550439092595837e-07, "loss": 1.721, "step": 187900 }, { "epoch": 1.946915485227261, "grad_norm": 0.25435760617256165, "learning_rate": 5.342699534168548e-07, "loss": 1.7192, "step": 188000 }, { "epoch": 1.9479510785704668, "grad_norm": 0.2836054563522339, "learning_rate": 5.13891513780762e-07, "loss": 1.719, "step": 188100 }, { "epoch": 1.9489866719136728, "grad_norm": 0.2612624764442444, "learning_rate": 4.939086442767359e-07, "loss": 1.7227, "step": 188200 }, { "epoch": 1.9500222652568788, "grad_norm": 0.25656262040138245, "learning_rate": 4.7432139778342814e-07, "loss": 1.7126, "step": 188300 }, { "epoch": 1.9510578586000848, "grad_norm": 0.24304702877998352, "learning_rate": 4.551298261326108e-07, "loss": 1.7096, "step": 188400 }, { "epoch": 1.9520934519432909, "grad_norm": 0.24936571717262268, "learning_rate": 4.3651997956323236e-07, "loss": 1.7156, "step": 188500 }, { "epoch": 1.953129045286497, "grad_norm": 0.2527887225151062, "learning_rate": 4.1811595090804893e-07, "loss": 1.6902, "step": 188600 }, { "epoch": 1.954164638629703, "grad_norm": 0.24470685422420502, "learning_rate": 4.00107745826217e-07, "loss": 1.72, "step": 188700 }, { "epoch": 1.955200231972909, "grad_norm": 0.25062036514282227, "learning_rate": 3.8249541197105103e-07, "loss": 1.7067, "step": 188800 }, { "epoch": 1.956235825316115, "grad_norm": 0.2412998229265213, "learning_rate": 3.652789959482871e-07, "loss": 1.6993, "step": 188900 }, { "epoch": 1.957271418659321, "grad_norm": 0.25504937767982483, "learning_rate": 3.484585433160325e-07, "loss": 1.7319, "step": 189000 }, { "epoch": 1.958307012002527, "grad_norm": 0.26475054025650024, "learning_rate": 3.3203409858453265e-07, "loss": 1.7102, "step": 189100 }, { "epoch": 1.9593426053457328, "grad_norm": 0.2512475848197937, "learning_rate": 3.160057052162046e-07, "loss": 1.7155, "step": 189200 }, { "epoch": 1.9603781986889388, "grad_norm": 0.2476617395877838, "learning_rate": 3.0037340562533705e-07, "loss": 1.7046, "step": 189300 }, { "epoch": 1.9614137920321448, "grad_norm": 0.23183654248714447, "learning_rate": 2.8513724117812385e-07, "loss": 1.697, "step": 189400 }, { "epoch": 1.9624493853753509, "grad_norm": 0.24840115010738373, "learning_rate": 2.702972521925306e-07, "loss": 1.7115, "step": 189500 }, { "epoch": 1.9634849787185567, "grad_norm": 0.26077279448509216, "learning_rate": 2.558534779381116e-07, "loss": 1.7191, "step": 189600 }, { "epoch": 1.9645205720617627, "grad_norm": 0.24906188249588013, "learning_rate": 2.418059566359931e-07, "loss": 1.7244, "step": 189700 }, { "epoch": 1.9655561654049687, "grad_norm": 0.24613256752490997, "learning_rate": 2.2815472545870683e-07, "loss": 1.7164, "step": 189800 }, { "epoch": 1.9665917587481747, "grad_norm": 0.2525598406791687, "learning_rate": 2.1489982053017328e-07, "loss": 1.7219, "step": 189900 }, { "epoch": 1.9676273520913807, "grad_norm": 0.2424430251121521, "learning_rate": 2.0204127692548532e-07, "loss": 1.7077, "step": 190000 }, { "epoch": 1.9686629454345868, "grad_norm": 0.25314489006996155, "learning_rate": 1.8957912867094138e-07, "loss": 1.7129, "step": 190100 }, { "epoch": 1.9696985387777928, "grad_norm": 0.2495175302028656, "learning_rate": 1.7751340874387898e-07, "loss": 1.716, "step": 190200 }, { "epoch": 1.9707341321209988, "grad_norm": 0.2518186569213867, "learning_rate": 1.6584414907259146e-07, "loss": 1.7056, "step": 190300 }, { "epoch": 1.9717697254642048, "grad_norm": 0.26035502552986145, "learning_rate": 1.5457138053627804e-07, "loss": 1.7015, "step": 190400 }, { "epoch": 1.9728053188074108, "grad_norm": 0.24790236353874207, "learning_rate": 1.4380193256652295e-07, "loss": 1.7065, "step": 190500 }, { "epoch": 1.9738409121506169, "grad_norm": 0.2572283446788788, "learning_rate": 1.333182691044099e-07, "loss": 1.7159, "step": 190600 }, { "epoch": 1.9748765054938227, "grad_norm": 0.2538398504257202, "learning_rate": 1.2323118284727207e-07, "loss": 1.7318, "step": 190700 }, { "epoch": 1.9759120988370287, "grad_norm": 0.24133770167827606, "learning_rate": 1.1354070048752439e-07, "loss": 1.7076, "step": 190800 }, { "epoch": 1.9769476921802347, "grad_norm": 0.24368494749069214, "learning_rate": 1.0433782279848369e-07, "loss": 1.722, "step": 190900 }, { "epoch": 1.9779832855234405, "grad_norm": 0.25969383120536804, "learning_rate": 9.543665745323503e-08, "loss": 1.7379, "step": 191000 }, { "epoch": 1.9790188788666465, "grad_norm": 0.2562360465526581, "learning_rate": 8.693216955524274e-08, "loss": 1.7124, "step": 191100 }, { "epoch": 1.9800544722098525, "grad_norm": 0.27190956473350525, "learning_rate": 7.882438160912164e-08, "loss": 1.7132, "step": 191200 }, { "epoch": 1.9810900655530586, "grad_norm": 0.24458324909210205, "learning_rate": 7.111331506967632e-08, "loss": 1.74, "step": 191300 }, { "epoch": 1.9821256588962646, "grad_norm": 0.24449186027050018, "learning_rate": 6.379899034193447e-08, "loss": 1.7171, "step": 191400 }, { "epoch": 1.9831612522394706, "grad_norm": 0.2626330554485321, "learning_rate": 5.6881426781063555e-08, "loss": 1.7171, "step": 191500 }, { "epoch": 1.9841968455826766, "grad_norm": 0.2562822997570038, "learning_rate": 5.036064269232087e-08, "loss": 1.7183, "step": 191600 }, { "epoch": 1.9852324389258826, "grad_norm": 0.24267880618572235, "learning_rate": 4.423665533098697e-08, "loss": 1.7104, "step": 191700 }, { "epoch": 1.9862680322690887, "grad_norm": 0.25357675552368164, "learning_rate": 3.850948090239892e-08, "loss": 1.7285, "step": 191800 }, { "epoch": 1.9873036256122947, "grad_norm": 0.25857415795326233, "learning_rate": 3.323047367894127e-08, "loss": 1.7217, "step": 191900 }, { "epoch": 1.9883392189555007, "grad_norm": 0.2695390582084656, "learning_rate": 2.8293001043144802e-08, "loss": 1.742, "step": 192000 }, { "epoch": 1.9893748122987065, "grad_norm": 0.24768592417240143, "learning_rate": 2.3752383530145636e-08, "loss": 1.7264, "step": 192100 }, { "epoch": 1.9904104056419125, "grad_norm": 0.25121697783470154, "learning_rate": 1.960863315536576e-08, "loss": 1.7203, "step": 192200 }, { "epoch": 1.9914459989851185, "grad_norm": 0.2912360727787018, "learning_rate": 1.586176088396729e-08, "loss": 1.703, "step": 192300 }, { "epoch": 1.9924815923283246, "grad_norm": 0.24859221279621124, "learning_rate": 1.2511776630935721e-08, "loss": 1.7248, "step": 192400 }, { "epoch": 1.9935171856715304, "grad_norm": 0.24313363432884216, "learning_rate": 9.558689260996677e-09, "loss": 1.7027, "step": 192500 }, { "epoch": 1.9945527790147364, "grad_norm": 0.26493141055107117, "learning_rate": 7.002506588599244e-09, "loss": 1.721, "step": 192600 }, { "epoch": 1.9955883723579424, "grad_norm": 0.26495832204818726, "learning_rate": 4.843235377932631e-09, "loss": 1.723, "step": 192700 }, { "epoch": 1.9966239657011484, "grad_norm": 0.24487806856632233, "learning_rate": 3.0808813428595534e-09, "loss": 1.7172, "step": 192800 }, { "epoch": 1.9976595590443544, "grad_norm": 0.25483769178390503, "learning_rate": 1.7154491468995834e-09, "loss": 1.7069, "step": 192900 }, { "epoch": 1.9986951523875605, "grad_norm": 0.25405511260032654, "learning_rate": 7.469424032791049e-10, "loss": 1.7017, "step": 193000 }, { "epoch": 1.9997307457307665, "grad_norm": 0.26587197184562683, "learning_rate": 1.7536367486470358e-10, "loss": 1.7223, "step": 193100 } ], "logging_steps": 100, "max_steps": 386250, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 48282, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.9375009954373566e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }