{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 868560, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004605323754259924, "grad_norm": 2.7894906997680664, "learning_rate": 1.3746891406465874e-07, "loss": 0.6038, "step": 200 }, { "epoch": 0.009210647508519848, "grad_norm": 3.5113091468811035, "learning_rate": 2.756286266924565e-07, "loss": 0.5979, "step": 400 }, { "epoch": 0.013815971262779773, "grad_norm": 3.262929677963257, "learning_rate": 4.137883393202542e-07, "loss": 0.6018, "step": 600 }, { "epoch": 0.018421295017039697, "grad_norm": 2.736161231994629, "learning_rate": 5.51948051948052e-07, "loss": 0.5985, "step": 800 }, { "epoch": 0.023026618771299624, "grad_norm": 3.1657299995422363, "learning_rate": 6.901077645758497e-07, "loss": 0.6069, "step": 1000 }, { "epoch": 0.027631942525559547, "grad_norm": 2.5964059829711914, "learning_rate": 8.282674772036474e-07, "loss": 0.6026, "step": 1200 }, { "epoch": 0.032237266279819474, "grad_norm": 2.9468190670013428, "learning_rate": 9.664271898314452e-07, "loss": 0.6118, "step": 1400 }, { "epoch": 0.036842590034079394, "grad_norm": 3.6232540607452393, "learning_rate": 1.1045869024592427e-06, "loss": 0.5998, "step": 1600 }, { "epoch": 0.04144791378833932, "grad_norm": 2.8659310340881348, "learning_rate": 1.2427466150870407e-06, "loss": 0.6092, "step": 1800 }, { "epoch": 0.04605323754259925, "grad_norm": 3.2160274982452393, "learning_rate": 1.3809063277148384e-06, "loss": 0.6177, "step": 2000 }, { "epoch": 0.05065856129685917, "grad_norm": 2.949619770050049, "learning_rate": 1.519066040342636e-06, "loss": 0.6005, "step": 2200 }, { "epoch": 0.055263885051119094, "grad_norm": 3.6632375717163086, "learning_rate": 1.6572257529704338e-06, "loss": 0.6053, "step": 2400 }, { "epoch": 0.05986920880537902, "grad_norm": 3.9618542194366455, "learning_rate": 1.7953854655982317e-06, "loss": 0.6257, "step": 2600 }, { "epoch": 0.06447453255963895, "grad_norm": 3.0995821952819824, "learning_rate": 1.9335451782260294e-06, "loss": 0.6203, "step": 2800 }, { "epoch": 0.06907985631389886, "grad_norm": 3.9655020236968994, "learning_rate": 2.071704890853827e-06, "loss": 0.6165, "step": 3000 }, { "epoch": 0.07368518006815879, "grad_norm": 2.8039631843566895, "learning_rate": 2.209864603481625e-06, "loss": 0.6196, "step": 3200 }, { "epoch": 0.07829050382241871, "grad_norm": 3.1764214038848877, "learning_rate": 2.3473335175462836e-06, "loss": 0.61, "step": 3400 }, { "epoch": 0.08289582757667864, "grad_norm": 3.0694644451141357, "learning_rate": 2.4854932301740813e-06, "loss": 0.6113, "step": 3600 }, { "epoch": 0.08750115133093857, "grad_norm": 2.9657208919525146, "learning_rate": 2.623652942801879e-06, "loss": 0.6129, "step": 3800 }, { "epoch": 0.0921064750851985, "grad_norm": 3.0443339347839355, "learning_rate": 2.7618126554296768e-06, "loss": 0.6021, "step": 4000 }, { "epoch": 0.09671179883945841, "grad_norm": 3.029796838760376, "learning_rate": 2.8992815694943355e-06, "loss": 0.6093, "step": 4200 }, { "epoch": 0.10131712259371833, "grad_norm": 3.724391222000122, "learning_rate": 3.0374412821221332e-06, "loss": 0.61, "step": 4400 }, { "epoch": 0.10592244634797826, "grad_norm": 3.0700559616088867, "learning_rate": 3.175600994749931e-06, "loss": 0.612, "step": 4600 }, { "epoch": 0.11052777010223819, "grad_norm": 2.6112992763519287, "learning_rate": 3.313760707377729e-06, "loss": 0.6075, "step": 4800 }, { "epoch": 0.11513309385649811, "grad_norm": 3.0309786796569824, "learning_rate": 3.4519204200055268e-06, "loss": 0.6075, "step": 5000 }, { "epoch": 0.11973841761075804, "grad_norm": 4.39963960647583, "learning_rate": 3.590080132633324e-06, "loss": 0.616, "step": 5200 }, { "epoch": 0.12434374136501795, "grad_norm": 3.1912529468536377, "learning_rate": 3.7282398452611218e-06, "loss": 0.607, "step": 5400 }, { "epoch": 0.1289490651192779, "grad_norm": 3.3386693000793457, "learning_rate": 3.8663995578889195e-06, "loss": 0.6155, "step": 5600 }, { "epoch": 0.13355438887353782, "grad_norm": 3.854520082473755, "learning_rate": 4.004559270516718e-06, "loss": 0.6154, "step": 5800 }, { "epoch": 0.13815971262779772, "grad_norm": 3.730701208114624, "learning_rate": 4.142028184581376e-06, "loss": 0.6088, "step": 6000 }, { "epoch": 0.14276503638205765, "grad_norm": 4.679981231689453, "learning_rate": 4.2801878972091745e-06, "loss": 0.6186, "step": 6200 }, { "epoch": 0.14737036013631757, "grad_norm": 3.9258837699890137, "learning_rate": 4.418347609836971e-06, "loss": 0.6166, "step": 6400 }, { "epoch": 0.1519756838905775, "grad_norm": 3.088724136352539, "learning_rate": 4.556507322464769e-06, "loss": 0.6173, "step": 6600 }, { "epoch": 0.15658100764483743, "grad_norm": 3.408620595932007, "learning_rate": 4.694667035092567e-06, "loss": 0.6005, "step": 6800 }, { "epoch": 0.16118633139909735, "grad_norm": 3.2880802154541016, "learning_rate": 4.8328267477203645e-06, "loss": 0.614, "step": 7000 }, { "epoch": 0.16579165515335728, "grad_norm": 2.6477365493774414, "learning_rate": 4.970986460348163e-06, "loss": 0.6111, "step": 7200 }, { "epoch": 0.1703969789076172, "grad_norm": 3.5173377990722656, "learning_rate": 5.10914617297596e-06, "loss": 0.6173, "step": 7400 }, { "epoch": 0.17500230266187713, "grad_norm": 2.5049519538879395, "learning_rate": 5.247305885603758e-06, "loss": 0.6016, "step": 7600 }, { "epoch": 0.17960762641613706, "grad_norm": 4.8469157218933105, "learning_rate": 5.385465598231556e-06, "loss": 0.6142, "step": 7800 }, { "epoch": 0.184212950170397, "grad_norm": 3.006009101867676, "learning_rate": 5.5236253108593535e-06, "loss": 0.6147, "step": 8000 }, { "epoch": 0.18881827392465692, "grad_norm": 3.286186456680298, "learning_rate": 5.661785023487152e-06, "loss": 0.6037, "step": 8200 }, { "epoch": 0.19342359767891681, "grad_norm": 2.788668394088745, "learning_rate": 5.799944736114949e-06, "loss": 0.6148, "step": 8400 }, { "epoch": 0.19802892143317674, "grad_norm": 3.9425911903381348, "learning_rate": 5.938104448742747e-06, "loss": 0.6246, "step": 8600 }, { "epoch": 0.20263424518743667, "grad_norm": 3.2533152103424072, "learning_rate": 6.076264161370544e-06, "loss": 0.6109, "step": 8800 }, { "epoch": 0.2072395689416966, "grad_norm": 3.5510547161102295, "learning_rate": 6.214423873998342e-06, "loss": 0.6226, "step": 9000 }, { "epoch": 0.21184489269595652, "grad_norm": 2.908961057662964, "learning_rate": 6.35258358662614e-06, "loss": 0.6152, "step": 9200 }, { "epoch": 0.21645021645021645, "grad_norm": 3.2206296920776367, "learning_rate": 6.490743299253938e-06, "loss": 0.6119, "step": 9400 }, { "epoch": 0.22105554020447638, "grad_norm": 2.8051042556762695, "learning_rate": 6.628212213318597e-06, "loss": 0.6112, "step": 9600 }, { "epoch": 0.2256608639587363, "grad_norm": 3.4578733444213867, "learning_rate": 6.766371925946394e-06, "loss": 0.5983, "step": 9800 }, { "epoch": 0.23026618771299623, "grad_norm": 3.29227614402771, "learning_rate": 6.904531638574192e-06, "loss": 0.6135, "step": 10000 }, { "epoch": 0.23487151146725616, "grad_norm": 3.175769567489624, "learning_rate": 7.04269135120199e-06, "loss": 0.604, "step": 10200 }, { "epoch": 0.23947683522151608, "grad_norm": 2.6275556087493896, "learning_rate": 7.1808510638297875e-06, "loss": 0.6145, "step": 10400 }, { "epoch": 0.244082158975776, "grad_norm": 3.551853895187378, "learning_rate": 7.319010776457585e-06, "loss": 0.6302, "step": 10600 }, { "epoch": 0.2486874827300359, "grad_norm": 2.5962212085723877, "learning_rate": 7.457170489085383e-06, "loss": 0.6079, "step": 10800 }, { "epoch": 0.25329280648429586, "grad_norm": 3.54152774810791, "learning_rate": 7.59533020171318e-06, "loss": 0.626, "step": 11000 }, { "epoch": 0.2578981302385558, "grad_norm": 3.380774974822998, "learning_rate": 7.733489914340978e-06, "loss": 0.6205, "step": 11200 }, { "epoch": 0.2625034539928157, "grad_norm": 3.3264219760894775, "learning_rate": 7.871649626968777e-06, "loss": 0.6121, "step": 11400 }, { "epoch": 0.26710877774707564, "grad_norm": 2.831803798675537, "learning_rate": 8.009809339596573e-06, "loss": 0.5994, "step": 11600 }, { "epoch": 0.27171410150133557, "grad_norm": 3.301412582397461, "learning_rate": 8.147969052224371e-06, "loss": 0.6236, "step": 11800 }, { "epoch": 0.27631942525559544, "grad_norm": 3.9409468173980713, "learning_rate": 8.28612876485217e-06, "loss": 0.6208, "step": 12000 }, { "epoch": 0.28092474900985537, "grad_norm": 2.723642349243164, "learning_rate": 8.424288477479967e-06, "loss": 0.6137, "step": 12200 }, { "epoch": 0.2855300727641153, "grad_norm": 3.9278056621551514, "learning_rate": 8.562448190107766e-06, "loss": 0.6156, "step": 12400 }, { "epoch": 0.2901353965183752, "grad_norm": 3.5097107887268066, "learning_rate": 8.700607902735564e-06, "loss": 0.618, "step": 12600 }, { "epoch": 0.29474072027263515, "grad_norm": 3.150387763977051, "learning_rate": 8.83876761536336e-06, "loss": 0.6165, "step": 12800 }, { "epoch": 0.2993460440268951, "grad_norm": 3.5158212184906006, "learning_rate": 8.976927327991158e-06, "loss": 0.6156, "step": 13000 }, { "epoch": 0.303951367781155, "grad_norm": 3.6976168155670166, "learning_rate": 9.115087040618956e-06, "loss": 0.611, "step": 13200 }, { "epoch": 0.30855669153541493, "grad_norm": 2.7496871948242188, "learning_rate": 9.253246753246755e-06, "loss": 0.6202, "step": 13400 }, { "epoch": 0.31316201528967486, "grad_norm": 2.973073959350586, "learning_rate": 9.391406465874553e-06, "loss": 0.6151, "step": 13600 }, { "epoch": 0.3177673390439348, "grad_norm": 4.366604804992676, "learning_rate": 9.529566178502349e-06, "loss": 0.6264, "step": 13800 }, { "epoch": 0.3223726627981947, "grad_norm": 2.6890974044799805, "learning_rate": 9.667725891130147e-06, "loss": 0.6149, "step": 14000 }, { "epoch": 0.32697798655245464, "grad_norm": 2.9724693298339844, "learning_rate": 9.805885603757944e-06, "loss": 0.6301, "step": 14200 }, { "epoch": 0.33158331030671456, "grad_norm": 2.7939202785491943, "learning_rate": 9.944045316385742e-06, "loss": 0.6275, "step": 14400 }, { "epoch": 0.3361886340609745, "grad_norm": 3.3362252712249756, "learning_rate": 1.008220502901354e-05, "loss": 0.6207, "step": 14600 }, { "epoch": 0.3407939578152344, "grad_norm": 3.060162305831909, "learning_rate": 1.0220364741641336e-05, "loss": 0.6169, "step": 14800 }, { "epoch": 0.34539928156949434, "grad_norm": 2.850121021270752, "learning_rate": 1.0358524454269135e-05, "loss": 0.6269, "step": 15000 }, { "epoch": 0.35000460532375427, "grad_norm": 3.7358944416046143, "learning_rate": 1.0496684166896933e-05, "loss": 0.6223, "step": 15200 }, { "epoch": 0.3546099290780142, "grad_norm": 3.362393379211426, "learning_rate": 1.063484387952473e-05, "loss": 0.6271, "step": 15400 }, { "epoch": 0.3592152528322741, "grad_norm": 2.7046902179718018, "learning_rate": 1.0773003592152529e-05, "loss": 0.6194, "step": 15600 }, { "epoch": 0.36382057658653405, "grad_norm": 3.0208888053894043, "learning_rate": 1.0911163304780325e-05, "loss": 0.6249, "step": 15800 }, { "epoch": 0.368425900340794, "grad_norm": 3.113605260848999, "learning_rate": 1.1049323017408124e-05, "loss": 0.6144, "step": 16000 }, { "epoch": 0.3730312240950539, "grad_norm": 3.3108925819396973, "learning_rate": 1.1186791931472782e-05, "loss": 0.6209, "step": 16200 }, { "epoch": 0.37763654784931383, "grad_norm": 2.8616628646850586, "learning_rate": 1.132495164410058e-05, "loss": 0.6106, "step": 16400 }, { "epoch": 0.38224187160357376, "grad_norm": 3.016742706298828, "learning_rate": 1.1463111356728379e-05, "loss": 0.6179, "step": 16600 }, { "epoch": 0.38684719535783363, "grad_norm": 3.3050427436828613, "learning_rate": 1.1601271069356177e-05, "loss": 0.6225, "step": 16800 }, { "epoch": 0.39145251911209356, "grad_norm": 2.2596371173858643, "learning_rate": 1.1739430781983975e-05, "loss": 0.6282, "step": 17000 }, { "epoch": 0.3960578428663535, "grad_norm": 3.506793260574341, "learning_rate": 1.1877590494611771e-05, "loss": 0.6213, "step": 17200 }, { "epoch": 0.4006631666206134, "grad_norm": 3.41317081451416, "learning_rate": 1.2015059408676432e-05, "loss": 0.6143, "step": 17400 }, { "epoch": 0.40526849037487334, "grad_norm": 2.875344753265381, "learning_rate": 1.2153219121304228e-05, "loss": 0.6113, "step": 17600 }, { "epoch": 0.40987381412913326, "grad_norm": 2.982757568359375, "learning_rate": 1.2291378833932026e-05, "loss": 0.6151, "step": 17800 }, { "epoch": 0.4144791378833932, "grad_norm": 3.633155107498169, "learning_rate": 1.2429538546559823e-05, "loss": 0.6338, "step": 18000 }, { "epoch": 0.4190844616376531, "grad_norm": 2.6880545616149902, "learning_rate": 1.2567698259187621e-05, "loss": 0.6159, "step": 18200 }, { "epoch": 0.42368978539191304, "grad_norm": 2.981095790863037, "learning_rate": 1.2705857971815419e-05, "loss": 0.6312, "step": 18400 }, { "epoch": 0.42829510914617297, "grad_norm": 2.658784866333008, "learning_rate": 1.2844017684443215e-05, "loss": 0.6183, "step": 18600 }, { "epoch": 0.4329004329004329, "grad_norm": 2.975275754928589, "learning_rate": 1.2982177397071014e-05, "loss": 0.6246, "step": 18800 }, { "epoch": 0.4375057566546928, "grad_norm": 2.6521835327148438, "learning_rate": 1.3120337109698812e-05, "loss": 0.6213, "step": 19000 }, { "epoch": 0.44211108040895275, "grad_norm": 3.5868492126464844, "learning_rate": 1.325849682232661e-05, "loss": 0.6221, "step": 19200 }, { "epoch": 0.4467164041632127, "grad_norm": 2.928968667984009, "learning_rate": 1.3396656534954408e-05, "loss": 0.6291, "step": 19400 }, { "epoch": 0.4513217279174726, "grad_norm": 2.7386858463287354, "learning_rate": 1.3534816247582204e-05, "loss": 0.6154, "step": 19600 }, { "epoch": 0.45592705167173253, "grad_norm": 2.974900245666504, "learning_rate": 1.3672975960210003e-05, "loss": 0.6147, "step": 19800 }, { "epoch": 0.46053237542599246, "grad_norm": 3.2995572090148926, "learning_rate": 1.38111356728378e-05, "loss": 0.6335, "step": 20000 }, { "epoch": 0.4651376991802524, "grad_norm": 2.7065649032592773, "learning_rate": 1.3949295385465599e-05, "loss": 0.6236, "step": 20200 }, { "epoch": 0.4697430229345123, "grad_norm": 3.2409005165100098, "learning_rate": 1.4086764299530258e-05, "loss": 0.6245, "step": 20400 }, { "epoch": 0.47434834668877224, "grad_norm": 3.296063184738159, "learning_rate": 1.4224924012158056e-05, "loss": 0.6198, "step": 20600 }, { "epoch": 0.47895367044303216, "grad_norm": 3.3391969203948975, "learning_rate": 1.4363083724785854e-05, "loss": 0.6339, "step": 20800 }, { "epoch": 0.4835589941972921, "grad_norm": 3.536306858062744, "learning_rate": 1.450124343741365e-05, "loss": 0.6264, "step": 21000 }, { "epoch": 0.488164317951552, "grad_norm": 2.2294647693634033, "learning_rate": 1.4639403150041448e-05, "loss": 0.6182, "step": 21200 }, { "epoch": 0.49276964170581194, "grad_norm": 2.4579017162323, "learning_rate": 1.4777562862669247e-05, "loss": 0.6287, "step": 21400 }, { "epoch": 0.4973749654600718, "grad_norm": 3.0496151447296143, "learning_rate": 1.4915722575297045e-05, "loss": 0.6272, "step": 21600 }, { "epoch": 0.5019802892143318, "grad_norm": 2.970477819442749, "learning_rate": 1.5053882287924841e-05, "loss": 0.6364, "step": 21800 }, { "epoch": 0.5065856129685917, "grad_norm": 3.1261672973632812, "learning_rate": 1.5192042000552638e-05, "loss": 0.6387, "step": 22000 }, { "epoch": 0.5111909367228517, "grad_norm": 3.239424228668213, "learning_rate": 1.5330201713180437e-05, "loss": 0.6235, "step": 22200 }, { "epoch": 0.5157962604771116, "grad_norm": 3.2637672424316406, "learning_rate": 1.5468361425808236e-05, "loss": 0.6302, "step": 22400 }, { "epoch": 0.5204015842313715, "grad_norm": 3.604191541671753, "learning_rate": 1.560652113843603e-05, "loss": 0.6193, "step": 22600 }, { "epoch": 0.5250069079856314, "grad_norm": 3.1682140827178955, "learning_rate": 1.574468085106383e-05, "loss": 0.6231, "step": 22800 }, { "epoch": 0.5296122317398914, "grad_norm": 2.781855583190918, "learning_rate": 1.5882840563691627e-05, "loss": 0.6283, "step": 23000 }, { "epoch": 0.5342175554941513, "grad_norm": 2.9197840690612793, "learning_rate": 1.6021000276319425e-05, "loss": 0.6377, "step": 23200 }, { "epoch": 0.5388228792484112, "grad_norm": 3.9863662719726562, "learning_rate": 1.6159159988947223e-05, "loss": 0.6231, "step": 23400 }, { "epoch": 0.5434282030026711, "grad_norm": 2.8455443382263184, "learning_rate": 1.629731970157502e-05, "loss": 0.6227, "step": 23600 }, { "epoch": 0.5480335267569311, "grad_norm": 2.6754982471466064, "learning_rate": 1.643547941420282e-05, "loss": 0.6314, "step": 23800 }, { "epoch": 0.5526388505111909, "grad_norm": 3.931835651397705, "learning_rate": 1.6573639126830617e-05, "loss": 0.6298, "step": 24000 }, { "epoch": 0.5572441742654508, "grad_norm": 3.3282952308654785, "learning_rate": 1.6711798839458415e-05, "loss": 0.6149, "step": 24200 }, { "epoch": 0.5618494980197107, "grad_norm": 3.1311309337615967, "learning_rate": 1.6849958552086214e-05, "loss": 0.6408, "step": 24400 }, { "epoch": 0.5664548217739707, "grad_norm": 4.448089122772217, "learning_rate": 1.698811826471401e-05, "loss": 0.6316, "step": 24600 }, { "epoch": 0.5710601455282306, "grad_norm": 3.4819765090942383, "learning_rate": 1.7126277977341806e-05, "loss": 0.6361, "step": 24800 }, { "epoch": 0.5756654692824905, "grad_norm": 3.762315034866333, "learning_rate": 1.7264437689969605e-05, "loss": 0.6267, "step": 25000 }, { "epoch": 0.5802707930367504, "grad_norm": 3.5128722190856934, "learning_rate": 1.7402597402597403e-05, "loss": 0.6363, "step": 25200 }, { "epoch": 0.5848761167910104, "grad_norm": 2.8912715911865234, "learning_rate": 1.75407571152252e-05, "loss": 0.6381, "step": 25400 }, { "epoch": 0.5894814405452703, "grad_norm": 3.6335270404815674, "learning_rate": 1.767753523072672e-05, "loss": 0.6278, "step": 25600 }, { "epoch": 0.5940867642995302, "grad_norm": 3.1860055923461914, "learning_rate": 1.781569494335452e-05, "loss": 0.6321, "step": 25800 }, { "epoch": 0.5986920880537902, "grad_norm": 2.7040374279022217, "learning_rate": 1.7953854655982317e-05, "loss": 0.636, "step": 26000 }, { "epoch": 0.6032974118080501, "grad_norm": 3.322178840637207, "learning_rate": 1.8092014368610115e-05, "loss": 0.636, "step": 26200 }, { "epoch": 0.60790273556231, "grad_norm": 3.2014033794403076, "learning_rate": 1.8230174081237913e-05, "loss": 0.6369, "step": 26400 }, { "epoch": 0.6125080593165699, "grad_norm": 3.6202449798583984, "learning_rate": 1.836833379386571e-05, "loss": 0.6269, "step": 26600 }, { "epoch": 0.6171133830708299, "grad_norm": 3.62076735496521, "learning_rate": 1.850649350649351e-05, "loss": 0.6307, "step": 26800 }, { "epoch": 0.6217187068250898, "grad_norm": 2.5848195552825928, "learning_rate": 1.8644653219121307e-05, "loss": 0.6311, "step": 27000 }, { "epoch": 0.6263240305793497, "grad_norm": 3.199153184890747, "learning_rate": 1.8782812931749105e-05, "loss": 0.639, "step": 27200 }, { "epoch": 0.6309293543336096, "grad_norm": 2.6851189136505127, "learning_rate": 1.89209726443769e-05, "loss": 0.6422, "step": 27400 }, { "epoch": 0.6355346780878696, "grad_norm": 3.650444507598877, "learning_rate": 1.9059132357004698e-05, "loss": 0.6412, "step": 27600 }, { "epoch": 0.6401400018421295, "grad_norm": 2.841759204864502, "learning_rate": 1.9197292069632496e-05, "loss": 0.6419, "step": 27800 }, { "epoch": 0.6447453255963894, "grad_norm": 3.3673157691955566, "learning_rate": 1.9335451782260294e-05, "loss": 0.6237, "step": 28000 }, { "epoch": 0.6493506493506493, "grad_norm": 3.8618879318237305, "learning_rate": 1.9473611494888093e-05, "loss": 0.6369, "step": 28200 }, { "epoch": 0.6539559731049093, "grad_norm": 2.8501274585723877, "learning_rate": 1.9611771207515887e-05, "loss": 0.6509, "step": 28400 }, { "epoch": 0.6585612968591692, "grad_norm": 2.839573621749878, "learning_rate": 1.9749930920143686e-05, "loss": 0.6326, "step": 28600 }, { "epoch": 0.6631666206134291, "grad_norm": 2.609498977661133, "learning_rate": 1.9888090632771484e-05, "loss": 0.6436, "step": 28800 }, { "epoch": 0.667771944367689, "grad_norm": 3.8914794921875, "learning_rate": 2.0026250345399282e-05, "loss": 0.6317, "step": 29000 }, { "epoch": 0.672377268121949, "grad_norm": 3.074779987335205, "learning_rate": 2.016441005802708e-05, "loss": 0.6473, "step": 29200 }, { "epoch": 0.6769825918762089, "grad_norm": 3.1600515842437744, "learning_rate": 2.0302569770654875e-05, "loss": 0.6418, "step": 29400 }, { "epoch": 0.6815879156304688, "grad_norm": 3.190645694732666, "learning_rate": 2.0440729483282673e-05, "loss": 0.6289, "step": 29600 }, { "epoch": 0.6861932393847288, "grad_norm": 3.1118228435516357, "learning_rate": 2.057888919591047e-05, "loss": 0.6495, "step": 29800 }, { "epoch": 0.6907985631389887, "grad_norm": 3.027270555496216, "learning_rate": 2.071635810997513e-05, "loss": 0.6393, "step": 30000 }, { "epoch": 0.6954038868932486, "grad_norm": 2.9312691688537598, "learning_rate": 2.085451782260293e-05, "loss": 0.645, "step": 30200 }, { "epoch": 0.7000092106475085, "grad_norm": 3.4496731758117676, "learning_rate": 2.0992677535230728e-05, "loss": 0.6522, "step": 30400 }, { "epoch": 0.7046145344017685, "grad_norm": 3.7810165882110596, "learning_rate": 2.1130837247858526e-05, "loss": 0.653, "step": 30600 }, { "epoch": 0.7092198581560284, "grad_norm": 2.8673808574676514, "learning_rate": 2.126899696048632e-05, "loss": 0.6249, "step": 30800 }, { "epoch": 0.7138251819102883, "grad_norm": 2.933802843093872, "learning_rate": 2.140715667311412e-05, "loss": 0.6381, "step": 31000 }, { "epoch": 0.7184305056645482, "grad_norm": 2.904672861099243, "learning_rate": 2.1545316385741917e-05, "loss": 0.643, "step": 31200 }, { "epoch": 0.7230358294188082, "grad_norm": 3.2614898681640625, "learning_rate": 2.1683476098369715e-05, "loss": 0.6401, "step": 31400 }, { "epoch": 0.7276411531730681, "grad_norm": 4.081573963165283, "learning_rate": 2.1821635810997513e-05, "loss": 0.6401, "step": 31600 }, { "epoch": 0.732246476927328, "grad_norm": 2.736985206604004, "learning_rate": 2.195979552362531e-05, "loss": 0.6373, "step": 31800 }, { "epoch": 0.736851800681588, "grad_norm": 3.4728550910949707, "learning_rate": 2.209795523625311e-05, "loss": 0.6488, "step": 32000 }, { "epoch": 0.7414571244358479, "grad_norm": 2.758512496948242, "learning_rate": 2.2236114948880908e-05, "loss": 0.6492, "step": 32200 }, { "epoch": 0.7460624481901078, "grad_norm": 3.4446616172790527, "learning_rate": 2.2374274661508706e-05, "loss": 0.6468, "step": 32400 }, { "epoch": 0.7506677719443677, "grad_norm": 3.721409320831299, "learning_rate": 2.2512434374136504e-05, "loss": 0.6364, "step": 32600 }, { "epoch": 0.7552730956986277, "grad_norm": 2.9497594833374023, "learning_rate": 2.26505940867643e-05, "loss": 0.6337, "step": 32800 }, { "epoch": 0.7598784194528876, "grad_norm": 4.462852954864502, "learning_rate": 2.2788753799392097e-05, "loss": 0.6592, "step": 33000 }, { "epoch": 0.7644837432071475, "grad_norm": 2.867154836654663, "learning_rate": 2.2926913512019895e-05, "loss": 0.6472, "step": 33200 }, { "epoch": 0.7690890669614074, "grad_norm": 3.1840884685516357, "learning_rate": 2.3065073224647693e-05, "loss": 0.6454, "step": 33400 }, { "epoch": 0.7736943907156673, "grad_norm": 2.719569206237793, "learning_rate": 2.320323293727549e-05, "loss": 0.6565, "step": 33600 }, { "epoch": 0.7782997144699272, "grad_norm": 3.1424355506896973, "learning_rate": 2.334139264990329e-05, "loss": 0.6333, "step": 33800 }, { "epoch": 0.7829050382241871, "grad_norm": 3.147237539291382, "learning_rate": 2.347886156396795e-05, "loss": 0.6462, "step": 34000 }, { "epoch": 0.787510361978447, "grad_norm": 3.6933629512786865, "learning_rate": 2.3617021276595744e-05, "loss": 0.642, "step": 34200 }, { "epoch": 0.792115685732707, "grad_norm": 3.1719932556152344, "learning_rate": 2.3755180989223543e-05, "loss": 0.6591, "step": 34400 }, { "epoch": 0.7967210094869669, "grad_norm": 4.069468021392822, "learning_rate": 2.389334070185134e-05, "loss": 0.6558, "step": 34600 }, { "epoch": 0.8013263332412268, "grad_norm": 3.7244644165039062, "learning_rate": 2.403150041447914e-05, "loss": 0.6586, "step": 34800 }, { "epoch": 0.8059316569954867, "grad_norm": 2.9359726905822754, "learning_rate": 2.41689693285438e-05, "loss": 0.6425, "step": 35000 }, { "epoch": 0.8105369807497467, "grad_norm": 3.2560675144195557, "learning_rate": 2.4307129041171597e-05, "loss": 0.6535, "step": 35200 }, { "epoch": 0.8151423045040066, "grad_norm": 3.1228187084198, "learning_rate": 2.4445288753799396e-05, "loss": 0.6346, "step": 35400 }, { "epoch": 0.8197476282582665, "grad_norm": 2.620872974395752, "learning_rate": 2.458344846642719e-05, "loss": 0.6412, "step": 35600 }, { "epoch": 0.8243529520125265, "grad_norm": 3.165461540222168, "learning_rate": 2.472160817905499e-05, "loss": 0.643, "step": 35800 }, { "epoch": 0.8289582757667864, "grad_norm": 3.4835033416748047, "learning_rate": 2.4859767891682787e-05, "loss": 0.6503, "step": 36000 }, { "epoch": 0.8335635995210463, "grad_norm": 3.1745879650115967, "learning_rate": 2.4997927604310585e-05, "loss": 0.6572, "step": 36200 }, { "epoch": 0.8381689232753062, "grad_norm": 2.9764230251312256, "learning_rate": 2.513608731693838e-05, "loss": 0.6426, "step": 36400 }, { "epoch": 0.8427742470295662, "grad_norm": 2.6247684955596924, "learning_rate": 2.5274247029566178e-05, "loss": 0.6483, "step": 36600 }, { "epoch": 0.8473795707838261, "grad_norm": 2.999863386154175, "learning_rate": 2.5412406742193976e-05, "loss": 0.6487, "step": 36800 }, { "epoch": 0.851984894538086, "grad_norm": 6.160555839538574, "learning_rate": 2.5550566454821774e-05, "loss": 0.6596, "step": 37000 }, { "epoch": 0.8565902182923459, "grad_norm": 3.588473320007324, "learning_rate": 2.5688726167449572e-05, "loss": 0.6562, "step": 37200 }, { "epoch": 0.8611955420466059, "grad_norm": 2.841158628463745, "learning_rate": 2.582688588007737e-05, "loss": 0.6634, "step": 37400 }, { "epoch": 0.8658008658008658, "grad_norm": 3.2244651317596436, "learning_rate": 2.5965045592705168e-05, "loss": 0.6436, "step": 37600 }, { "epoch": 0.8704061895551257, "grad_norm": 2.9466798305511475, "learning_rate": 2.6103205305332963e-05, "loss": 0.6551, "step": 37800 }, { "epoch": 0.8750115133093856, "grad_norm": 3.141784191131592, "learning_rate": 2.624136501796076e-05, "loss": 0.6607, "step": 38000 }, { "epoch": 0.8796168370636456, "grad_norm": 3.590473175048828, "learning_rate": 2.637952473058856e-05, "loss": 0.6604, "step": 38200 }, { "epoch": 0.8842221608179055, "grad_norm": 3.251824378967285, "learning_rate": 2.6517684443216357e-05, "loss": 0.6656, "step": 38400 }, { "epoch": 0.8888274845721654, "grad_norm": 2.8062353134155273, "learning_rate": 2.6655844155844156e-05, "loss": 0.6494, "step": 38600 }, { "epoch": 0.8934328083264254, "grad_norm": 2.897364616394043, "learning_rate": 2.6794003868471954e-05, "loss": 0.6474, "step": 38800 }, { "epoch": 0.8980381320806853, "grad_norm": 3.172089099884033, "learning_rate": 2.6932163581099752e-05, "loss": 0.6601, "step": 39000 }, { "epoch": 0.9026434558349452, "grad_norm": 3.3217780590057373, "learning_rate": 2.707032329372755e-05, "loss": 0.6426, "step": 39200 }, { "epoch": 0.9072487795892051, "grad_norm": 3.3799307346343994, "learning_rate": 2.7208483006355348e-05, "loss": 0.6614, "step": 39400 }, { "epoch": 0.9118541033434651, "grad_norm": 3.2434334754943848, "learning_rate": 2.7346642718983146e-05, "loss": 0.6546, "step": 39600 }, { "epoch": 0.916459427097725, "grad_norm": 3.017298936843872, "learning_rate": 2.7483420834484666e-05, "loss": 0.6403, "step": 39800 }, { "epoch": 0.9210647508519849, "grad_norm": 2.9930150508880615, "learning_rate": 2.7621580547112464e-05, "loss": 0.6527, "step": 40000 }, { "epoch": 0.9256700746062448, "grad_norm": 2.474975347518921, "learning_rate": 2.7759740259740262e-05, "loss": 0.6559, "step": 40200 }, { "epoch": 0.9302753983605048, "grad_norm": 3.3437626361846924, "learning_rate": 2.789789997236806e-05, "loss": 0.6479, "step": 40400 }, { "epoch": 0.9348807221147647, "grad_norm": 3.64098858833313, "learning_rate": 2.8036059684995855e-05, "loss": 0.6671, "step": 40600 }, { "epoch": 0.9394860458690246, "grad_norm": 3.154405355453491, "learning_rate": 2.8174219397623653e-05, "loss": 0.6498, "step": 40800 }, { "epoch": 0.9440913696232845, "grad_norm": 3.1763336658477783, "learning_rate": 2.831237911025145e-05, "loss": 0.6581, "step": 41000 }, { "epoch": 0.9486966933775445, "grad_norm": 3.8115978240966797, "learning_rate": 2.845053882287925e-05, "loss": 0.6544, "step": 41200 }, { "epoch": 0.9533020171318044, "grad_norm": 3.476557731628418, "learning_rate": 2.8588698535507047e-05, "loss": 0.6607, "step": 41400 }, { "epoch": 0.9579073408860643, "grad_norm": 3.1274263858795166, "learning_rate": 2.8726858248134845e-05, "loss": 0.6557, "step": 41600 }, { "epoch": 0.9625126646403243, "grad_norm": 3.3021888732910156, "learning_rate": 2.8865017960762644e-05, "loss": 0.6529, "step": 41800 }, { "epoch": 0.9671179883945842, "grad_norm": 2.258256435394287, "learning_rate": 2.9003177673390442e-05, "loss": 0.6537, "step": 42000 }, { "epoch": 0.9717233121488441, "grad_norm": 3.9193155765533447, "learning_rate": 2.914133738601824e-05, "loss": 0.6521, "step": 42200 }, { "epoch": 0.976328635903104, "grad_norm": 2.55462384223938, "learning_rate": 2.9279497098646038e-05, "loss": 0.661, "step": 42400 }, { "epoch": 0.980933959657364, "grad_norm": 2.79758358001709, "learning_rate": 2.9417656811273833e-05, "loss": 0.6601, "step": 42600 }, { "epoch": 0.9855392834116239, "grad_norm": 3.391768217086792, "learning_rate": 2.955581652390163e-05, "loss": 0.6502, "step": 42800 }, { "epoch": 0.9901446071658838, "grad_norm": 2.6248672008514404, "learning_rate": 2.969397623652943e-05, "loss": 0.6678, "step": 43000 }, { "epoch": 0.9947499309201436, "grad_norm": 2.8915598392486572, "learning_rate": 2.9832135949157227e-05, "loss": 0.6473, "step": 43200 }, { "epoch": 0.9993552546744036, "grad_norm": 4.923104286193848, "learning_rate": 2.9970295661785025e-05, "loss": 0.6451, "step": 43400 }, { "epoch": 1.0, "eval_loss": 0.6161314845085144, "eval_runtime": 146.2574, "eval_samples_per_second": 193.911, "eval_steps_per_second": 12.122, "step": 43428 }, { "epoch": 1.0039605784286636, "grad_norm": 3.111292600631714, "learning_rate": 2.9994291822399326e-05, "loss": 0.6622, "step": 43600 }, { "epoch": 1.0085659021829234, "grad_norm": 3.6699461936950684, "learning_rate": 2.998702025857681e-05, "loss": 0.6542, "step": 43800 }, { "epoch": 1.0131712259371835, "grad_norm": 3.563108205795288, "learning_rate": 2.997978505257341e-05, "loss": 0.6444, "step": 44000 }, { "epoch": 1.0177765496914433, "grad_norm": 3.8684511184692383, "learning_rate": 2.997251348875089e-05, "loss": 0.6534, "step": 44200 }, { "epoch": 1.0223818734457033, "grad_norm": 2.8675427436828613, "learning_rate": 2.9965241924928375e-05, "loss": 0.65, "step": 44400 }, { "epoch": 1.0269871971999631, "grad_norm": 3.130798101425171, "learning_rate": 2.995797036110586e-05, "loss": 0.6566, "step": 44600 }, { "epoch": 1.0315925209542232, "grad_norm": 3.477407932281494, "learning_rate": 2.9950698797283344e-05, "loss": 0.6701, "step": 44800 }, { "epoch": 1.036197844708483, "grad_norm": 3.613179922103882, "learning_rate": 2.994342723346083e-05, "loss": 0.6636, "step": 45000 }, { "epoch": 1.040803168462743, "grad_norm": 4.168652057647705, "learning_rate": 2.9936155669638315e-05, "loss": 0.656, "step": 45200 }, { "epoch": 1.0454084922170028, "grad_norm": 3.4714102745056152, "learning_rate": 2.992892046363491e-05, "loss": 0.666, "step": 45400 }, { "epoch": 1.0500138159712629, "grad_norm": 3.0046637058258057, "learning_rate": 2.9921648899812396e-05, "loss": 0.6547, "step": 45600 }, { "epoch": 1.0546191397255227, "grad_norm": 2.514378070831299, "learning_rate": 2.991437733598988e-05, "loss": 0.6592, "step": 45800 }, { "epoch": 1.0592244634797827, "grad_norm": 3.263707160949707, "learning_rate": 2.990710577216736e-05, "loss": 0.6544, "step": 46000 }, { "epoch": 1.0638297872340425, "grad_norm": 2.668816566467285, "learning_rate": 2.9899834208344847e-05, "loss": 0.6631, "step": 46200 }, { "epoch": 1.0684351109883026, "grad_norm": 3.3855276107788086, "learning_rate": 2.9892562644522333e-05, "loss": 0.6581, "step": 46400 }, { "epoch": 1.0730404347425624, "grad_norm": 3.417602062225342, "learning_rate": 2.9885291080699816e-05, "loss": 0.6686, "step": 46600 }, { "epoch": 1.0776457584968224, "grad_norm": 3.3175718784332275, "learning_rate": 2.9878019516877302e-05, "loss": 0.6322, "step": 46800 }, { "epoch": 1.0822510822510822, "grad_norm": 3.263648748397827, "learning_rate": 2.9870747953054784e-05, "loss": 0.6482, "step": 47000 }, { "epoch": 1.0868564060053423, "grad_norm": 3.151909589767456, "learning_rate": 2.9863476389232267e-05, "loss": 0.6526, "step": 47200 }, { "epoch": 1.091461729759602, "grad_norm": 3.5228278636932373, "learning_rate": 2.9856241183228865e-05, "loss": 0.6459, "step": 47400 }, { "epoch": 1.096067053513862, "grad_norm": 2.767185926437378, "learning_rate": 2.984896961940635e-05, "loss": 0.6595, "step": 47600 }, { "epoch": 1.100672377268122, "grad_norm": 3.287398099899292, "learning_rate": 2.9841698055583834e-05, "loss": 0.6625, "step": 47800 }, { "epoch": 1.105277701022382, "grad_norm": 3.132371425628662, "learning_rate": 2.983442649176132e-05, "loss": 0.6505, "step": 48000 }, { "epoch": 1.1098830247766418, "grad_norm": 2.7118616104125977, "learning_rate": 2.9827154927938806e-05, "loss": 0.6747, "step": 48200 }, { "epoch": 1.1144883485309016, "grad_norm": 2.996696949005127, "learning_rate": 2.9819883364116285e-05, "loss": 0.6633, "step": 48400 }, { "epoch": 1.1190936722851617, "grad_norm": 3.6714489459991455, "learning_rate": 2.981261180029377e-05, "loss": 0.6554, "step": 48600 }, { "epoch": 1.1236989960394215, "grad_norm": 2.80454421043396, "learning_rate": 2.9805340236471257e-05, "loss": 0.6641, "step": 48800 }, { "epoch": 1.1283043197936815, "grad_norm": 3.167177200317383, "learning_rate": 2.979806867264874e-05, "loss": 0.6508, "step": 49000 }, { "epoch": 1.1329096435479413, "grad_norm": 3.2425365447998047, "learning_rate": 2.9790797108826225e-05, "loss": 0.6508, "step": 49200 }, { "epoch": 1.1375149673022014, "grad_norm": 2.834266424179077, "learning_rate": 2.978352554500371e-05, "loss": 0.6558, "step": 49400 }, { "epoch": 1.1421202910564612, "grad_norm": 2.795851707458496, "learning_rate": 2.9776253981181194e-05, "loss": 0.6567, "step": 49600 }, { "epoch": 1.1467256148107212, "grad_norm": 2.940406084060669, "learning_rate": 2.9768982417358676e-05, "loss": 0.6605, "step": 49800 }, { "epoch": 1.151330938564981, "grad_norm": 3.899711847305298, "learning_rate": 2.9761710853536162e-05, "loss": 0.6639, "step": 50000 }, { "epoch": 1.155936262319241, "grad_norm": 2.7121920585632324, "learning_rate": 2.9754475647532757e-05, "loss": 0.6759, "step": 50200 }, { "epoch": 1.1605415860735009, "grad_norm": 3.220299005508423, "learning_rate": 2.9747204083710243e-05, "loss": 0.6514, "step": 50400 }, { "epoch": 1.165146909827761, "grad_norm": 3.94079852104187, "learning_rate": 2.973993251988773e-05, "loss": 0.661, "step": 50600 }, { "epoch": 1.1697522335820207, "grad_norm": 3.2445309162139893, "learning_rate": 2.973266095606521e-05, "loss": 0.6682, "step": 50800 }, { "epoch": 1.1743575573362808, "grad_norm": 3.0553812980651855, "learning_rate": 2.9725389392242697e-05, "loss": 0.6687, "step": 51000 }, { "epoch": 1.1789628810905406, "grad_norm": 4.24676513671875, "learning_rate": 2.971811782842018e-05, "loss": 0.656, "step": 51200 }, { "epoch": 1.1835682048448006, "grad_norm": 3.447143316268921, "learning_rate": 2.9710882622416775e-05, "loss": 0.6562, "step": 51400 }, { "epoch": 1.1881735285990604, "grad_norm": 2.6866791248321533, "learning_rate": 2.970361105859426e-05, "loss": 0.6684, "step": 51600 }, { "epoch": 1.1927788523533205, "grad_norm": 3.1234331130981445, "learning_rate": 2.9696339494771747e-05, "loss": 0.658, "step": 51800 }, { "epoch": 1.1973841761075803, "grad_norm": 3.407177686691284, "learning_rate": 2.968906793094923e-05, "loss": 0.6628, "step": 52000 }, { "epoch": 1.2019894998618403, "grad_norm": 3.587676763534546, "learning_rate": 2.9681796367126715e-05, "loss": 0.6493, "step": 52200 }, { "epoch": 1.2065948236161002, "grad_norm": 2.697775363922119, "learning_rate": 2.96745248033042e-05, "loss": 0.6637, "step": 52400 }, { "epoch": 1.2112001473703602, "grad_norm": 3.507350206375122, "learning_rate": 2.9667253239481684e-05, "loss": 0.6528, "step": 52600 }, { "epoch": 1.21580547112462, "grad_norm": 3.143338203430176, "learning_rate": 2.9659981675659166e-05, "loss": 0.6521, "step": 52800 }, { "epoch": 1.22041079487888, "grad_norm": 2.8852453231811523, "learning_rate": 2.9652710111836652e-05, "loss": 0.6595, "step": 53000 }, { "epoch": 1.2250161186331399, "grad_norm": 3.039896249771118, "learning_rate": 2.9645438548014135e-05, "loss": 0.6542, "step": 53200 }, { "epoch": 1.2296214423874, "grad_norm": 2.8866536617279053, "learning_rate": 2.963816698419162e-05, "loss": 0.6535, "step": 53400 }, { "epoch": 1.2342267661416597, "grad_norm": 2.5456418991088867, "learning_rate": 2.9630895420369107e-05, "loss": 0.664, "step": 53600 }, { "epoch": 1.2388320898959198, "grad_norm": 2.5279481410980225, "learning_rate": 2.9623623856546593e-05, "loss": 0.6603, "step": 53800 }, { "epoch": 1.2434374136501796, "grad_norm": 3.0540003776550293, "learning_rate": 2.9616352292724072e-05, "loss": 0.6626, "step": 54000 }, { "epoch": 1.2480427374044396, "grad_norm": 2.9297878742218018, "learning_rate": 2.9609080728901558e-05, "loss": 0.6612, "step": 54200 }, { "epoch": 1.2526480611586994, "grad_norm": 3.4049458503723145, "learning_rate": 2.9601809165079044e-05, "loss": 0.6509, "step": 54400 }, { "epoch": 1.2572533849129595, "grad_norm": 4.160104274749756, "learning_rate": 2.9594537601256526e-05, "loss": 0.6533, "step": 54600 }, { "epoch": 1.2618587086672193, "grad_norm": 3.6435911655426025, "learning_rate": 2.9587266037434012e-05, "loss": 0.6703, "step": 54800 }, { "epoch": 1.2664640324214793, "grad_norm": 4.371586322784424, "learning_rate": 2.9579994473611498e-05, "loss": 0.6507, "step": 55000 }, { "epoch": 1.2710693561757391, "grad_norm": 2.840867757797241, "learning_rate": 2.9572722909788977e-05, "loss": 0.6644, "step": 55200 }, { "epoch": 1.275674679929999, "grad_norm": 4.047842502593994, "learning_rate": 2.9565451345966463e-05, "loss": 0.6752, "step": 55400 }, { "epoch": 1.280280003684259, "grad_norm": 4.21028470993042, "learning_rate": 2.955817978214395e-05, "loss": 0.6582, "step": 55600 }, { "epoch": 1.284885327438519, "grad_norm": 2.9886422157287598, "learning_rate": 2.955090821832143e-05, "loss": 0.6634, "step": 55800 }, { "epoch": 1.2894906511927788, "grad_norm": 3.3771917819976807, "learning_rate": 2.9543636654498917e-05, "loss": 0.6485, "step": 56000 }, { "epoch": 1.2940959749470387, "grad_norm": 3.0930802822113037, "learning_rate": 2.9536365090676403e-05, "loss": 0.6665, "step": 56200 }, { "epoch": 1.2987012987012987, "grad_norm": 3.460557222366333, "learning_rate": 2.952909352685389e-05, "loss": 0.6703, "step": 56400 }, { "epoch": 1.3033066224555587, "grad_norm": 3.846440076828003, "learning_rate": 2.9521858320850484e-05, "loss": 0.6626, "step": 56600 }, { "epoch": 1.3079119462098185, "grad_norm": 3.103189468383789, "learning_rate": 2.9514586757027967e-05, "loss": 0.6732, "step": 56800 }, { "epoch": 1.3125172699640784, "grad_norm": 3.5162618160247803, "learning_rate": 2.950731519320545e-05, "loss": 0.6659, "step": 57000 }, { "epoch": 1.3171225937183384, "grad_norm": 3.511310338973999, "learning_rate": 2.9500079987202048e-05, "loss": 0.668, "step": 57200 }, { "epoch": 1.3217279174725984, "grad_norm": 3.5012547969818115, "learning_rate": 2.9492808423379534e-05, "loss": 0.6693, "step": 57400 }, { "epoch": 1.3263332412268583, "grad_norm": 3.130657434463501, "learning_rate": 2.9485536859557016e-05, "loss": 0.6678, "step": 57600 }, { "epoch": 1.330938564981118, "grad_norm": 3.0127837657928467, "learning_rate": 2.9478265295734502e-05, "loss": 0.6606, "step": 57800 }, { "epoch": 1.335543888735378, "grad_norm": 2.949445962905884, "learning_rate": 2.947099373191199e-05, "loss": 0.6663, "step": 58000 }, { "epoch": 1.3401492124896381, "grad_norm": 3.239060640335083, "learning_rate": 2.9463722168089467e-05, "loss": 0.659, "step": 58200 }, { "epoch": 1.344754536243898, "grad_norm": 3.4041852951049805, "learning_rate": 2.9456450604266953e-05, "loss": 0.6652, "step": 58400 }, { "epoch": 1.3493598599981578, "grad_norm": 2.7172391414642334, "learning_rate": 2.944917904044444e-05, "loss": 0.643, "step": 58600 }, { "epoch": 1.3539651837524178, "grad_norm": 3.234712600708008, "learning_rate": 2.9441907476621922e-05, "loss": 0.6552, "step": 58800 }, { "epoch": 1.3585705075066778, "grad_norm": 2.3830983638763428, "learning_rate": 2.9434635912799408e-05, "loss": 0.6704, "step": 59000 }, { "epoch": 1.3631758312609377, "grad_norm": 3.203972578048706, "learning_rate": 2.9427364348976894e-05, "loss": 0.6609, "step": 59200 }, { "epoch": 1.3677811550151975, "grad_norm": 3.8763749599456787, "learning_rate": 2.9420092785154373e-05, "loss": 0.6641, "step": 59400 }, { "epoch": 1.3723864787694575, "grad_norm": 3.5821003913879395, "learning_rate": 2.941282122133186e-05, "loss": 0.6577, "step": 59600 }, { "epoch": 1.3769918025237173, "grad_norm": 3.558241844177246, "learning_rate": 2.9405549657509345e-05, "loss": 0.6564, "step": 59800 }, { "epoch": 1.3815971262779774, "grad_norm": 2.6265041828155518, "learning_rate": 2.939827809368683e-05, "loss": 0.6497, "step": 60000 }, { "epoch": 1.3862024500322372, "grad_norm": 3.8873655796051025, "learning_rate": 2.9391006529864313e-05, "loss": 0.6618, "step": 60200 }, { "epoch": 1.3908077737864972, "grad_norm": 3.367459774017334, "learning_rate": 2.93837349660418e-05, "loss": 0.6427, "step": 60400 }, { "epoch": 1.395413097540757, "grad_norm": 3.5943410396575928, "learning_rate": 2.9376463402219285e-05, "loss": 0.6767, "step": 60600 }, { "epoch": 1.400018421295017, "grad_norm": 3.536818742752075, "learning_rate": 2.9369191838396764e-05, "loss": 0.6474, "step": 60800 }, { "epoch": 1.404623745049277, "grad_norm": 3.1541073322296143, "learning_rate": 2.9361956632393363e-05, "loss": 0.657, "step": 61000 }, { "epoch": 1.409229068803537, "grad_norm": 3.130194902420044, "learning_rate": 2.9354685068570845e-05, "loss": 0.6623, "step": 61200 }, { "epoch": 1.4138343925577967, "grad_norm": 3.6014018058776855, "learning_rate": 2.934741350474833e-05, "loss": 0.6544, "step": 61400 }, { "epoch": 1.4184397163120568, "grad_norm": 3.4205713272094727, "learning_rate": 2.9340141940925817e-05, "loss": 0.663, "step": 61600 }, { "epoch": 1.4230450400663166, "grad_norm": 2.7789885997772217, "learning_rate": 2.9332870377103303e-05, "loss": 0.6607, "step": 61800 }, { "epoch": 1.4276503638205766, "grad_norm": 2.774611234664917, "learning_rate": 2.9325598813280785e-05, "loss": 0.6559, "step": 62000 }, { "epoch": 1.4322556875748365, "grad_norm": 3.5750908851623535, "learning_rate": 2.9318327249458268e-05, "loss": 0.6546, "step": 62200 }, { "epoch": 1.4368610113290965, "grad_norm": 3.758307456970215, "learning_rate": 2.9311055685635754e-05, "loss": 0.6681, "step": 62400 }, { "epoch": 1.4414663350833563, "grad_norm": 2.9125349521636963, "learning_rate": 2.9303784121813236e-05, "loss": 0.6504, "step": 62600 }, { "epoch": 1.4460716588376163, "grad_norm": 3.2125773429870605, "learning_rate": 2.9296512557990722e-05, "loss": 0.6535, "step": 62800 }, { "epoch": 1.4506769825918762, "grad_norm": 3.11696457862854, "learning_rate": 2.9289240994168208e-05, "loss": 0.6425, "step": 63000 }, { "epoch": 1.4552823063461362, "grad_norm": 4.079625606536865, "learning_rate": 2.928196943034569e-05, "loss": 0.6641, "step": 63200 }, { "epoch": 1.459887630100396, "grad_norm": 2.697766065597534, "learning_rate": 2.9274697866523177e-05, "loss": 0.6568, "step": 63400 }, { "epoch": 1.464492953854656, "grad_norm": 2.6162514686584473, "learning_rate": 2.926742630270066e-05, "loss": 0.659, "step": 63600 }, { "epoch": 1.4690982776089159, "grad_norm": 3.08160138130188, "learning_rate": 2.926015473887814e-05, "loss": 0.6527, "step": 63800 }, { "epoch": 1.473703601363176, "grad_norm": 3.38775897026062, "learning_rate": 2.9252883175055628e-05, "loss": 0.6614, "step": 64000 }, { "epoch": 1.4783089251174357, "grad_norm": 3.066603183746338, "learning_rate": 2.9245611611233113e-05, "loss": 0.6607, "step": 64200 }, { "epoch": 1.4829142488716958, "grad_norm": 2.781545400619507, "learning_rate": 2.92383400474106e-05, "loss": 0.6529, "step": 64400 }, { "epoch": 1.4875195726259556, "grad_norm": 3.7093193531036377, "learning_rate": 2.9231068483588082e-05, "loss": 0.6507, "step": 64600 }, { "epoch": 1.4921248963802154, "grad_norm": 3.037850856781006, "learning_rate": 2.9223796919765564e-05, "loss": 0.6593, "step": 64800 }, { "epoch": 1.4967302201344754, "grad_norm": 3.192322254180908, "learning_rate": 2.921652535594305e-05, "loss": 0.6561, "step": 65000 }, { "epoch": 1.5013355438887355, "grad_norm": 3.157817840576172, "learning_rate": 2.9209253792120533e-05, "loss": 0.6488, "step": 65200 }, { "epoch": 1.5059408676429953, "grad_norm": 3.132276773452759, "learning_rate": 2.920198222829802e-05, "loss": 0.6549, "step": 65400 }, { "epoch": 1.510546191397255, "grad_norm": 3.7865426540374756, "learning_rate": 2.9194710664475505e-05, "loss": 0.6534, "step": 65600 }, { "epoch": 1.5151515151515151, "grad_norm": 3.1854352951049805, "learning_rate": 2.9187439100652987e-05, "loss": 0.6592, "step": 65800 }, { "epoch": 1.5197568389057752, "grad_norm": 2.9768927097320557, "learning_rate": 2.918016753683047e-05, "loss": 0.6581, "step": 66000 }, { "epoch": 1.524362162660035, "grad_norm": 3.467859983444214, "learning_rate": 2.9172895973007956e-05, "loss": 0.6599, "step": 66200 }, { "epoch": 1.5289674864142948, "grad_norm": 3.409672975540161, "learning_rate": 2.9165624409185438e-05, "loss": 0.6589, "step": 66400 }, { "epoch": 1.5335728101685548, "grad_norm": 3.304276943206787, "learning_rate": 2.9158352845362924e-05, "loss": 0.6619, "step": 66600 }, { "epoch": 1.5381781339228149, "grad_norm": 2.9806504249572754, "learning_rate": 2.915108128154041e-05, "loss": 0.6686, "step": 66800 }, { "epoch": 1.5427834576770747, "grad_norm": 3.9188714027404785, "learning_rate": 2.9143846075537005e-05, "loss": 0.6599, "step": 67000 }, { "epoch": 1.5473887814313345, "grad_norm": 3.01717209815979, "learning_rate": 2.913657451171449e-05, "loss": 0.6603, "step": 67200 }, { "epoch": 1.5519941051855946, "grad_norm": 3.1575047969818115, "learning_rate": 2.9129302947891977e-05, "loss": 0.6532, "step": 67400 }, { "epoch": 1.5565994289398546, "grad_norm": 2.840865135192871, "learning_rate": 2.9122031384069456e-05, "loss": 0.6537, "step": 67600 }, { "epoch": 1.5612047526941144, "grad_norm": 3.181452989578247, "learning_rate": 2.9114759820246942e-05, "loss": 0.6605, "step": 67800 }, { "epoch": 1.5658100764483742, "grad_norm": 3.727302074432373, "learning_rate": 2.9107488256424428e-05, "loss": 0.6553, "step": 68000 }, { "epoch": 1.5704154002026343, "grad_norm": 3.4746217727661133, "learning_rate": 2.910021669260191e-05, "loss": 0.6603, "step": 68200 }, { "epoch": 1.5750207239568943, "grad_norm": 3.0356459617614746, "learning_rate": 2.9092945128779396e-05, "loss": 0.6705, "step": 68400 }, { "epoch": 1.5796260477111541, "grad_norm": 4.231008529663086, "learning_rate": 2.9085673564956882e-05, "loss": 0.6509, "step": 68600 }, { "epoch": 1.584231371465414, "grad_norm": 2.5000898838043213, "learning_rate": 2.907840200113436e-05, "loss": 0.6577, "step": 68800 }, { "epoch": 1.588836695219674, "grad_norm": 3.202843427658081, "learning_rate": 2.9071130437311847e-05, "loss": 0.6515, "step": 69000 }, { "epoch": 1.593442018973934, "grad_norm": 3.5553150177001953, "learning_rate": 2.9063858873489333e-05, "loss": 0.6569, "step": 69200 }, { "epoch": 1.5980473427281938, "grad_norm": 3.006439208984375, "learning_rate": 2.905658730966682e-05, "loss": 0.6556, "step": 69400 }, { "epoch": 1.6026526664824536, "grad_norm": 2.8993167877197266, "learning_rate": 2.9049352103663414e-05, "loss": 0.6602, "step": 69600 }, { "epoch": 1.6072579902367137, "grad_norm": 3.5758256912231445, "learning_rate": 2.90420805398409e-05, "loss": 0.6512, "step": 69800 }, { "epoch": 1.6118633139909737, "grad_norm": 2.4730918407440186, "learning_rate": 2.9034808976018383e-05, "loss": 0.6479, "step": 70000 }, { "epoch": 1.6164686377452335, "grad_norm": 3.3662991523742676, "learning_rate": 2.902757377001498e-05, "loss": 0.6519, "step": 70200 }, { "epoch": 1.6210739614994933, "grad_norm": 2.8330750465393066, "learning_rate": 2.9020302206192467e-05, "loss": 0.6492, "step": 70400 }, { "epoch": 1.6256792852537534, "grad_norm": 3.55230975151062, "learning_rate": 2.9013030642369947e-05, "loss": 0.654, "step": 70600 }, { "epoch": 1.6302846090080134, "grad_norm": 3.134399890899658, "learning_rate": 2.9005795436366545e-05, "loss": 0.657, "step": 70800 }, { "epoch": 1.6348899327622732, "grad_norm": 3.899355888366699, "learning_rate": 2.8998523872544028e-05, "loss": 0.6565, "step": 71000 }, { "epoch": 1.639495256516533, "grad_norm": 3.356893301010132, "learning_rate": 2.8991252308721514e-05, "loss": 0.6465, "step": 71200 }, { "epoch": 1.644100580270793, "grad_norm": 3.021803855895996, "learning_rate": 2.8983980744899e-05, "loss": 0.6409, "step": 71400 }, { "epoch": 1.6487059040250531, "grad_norm": 3.1984188556671143, "learning_rate": 2.8976709181076485e-05, "loss": 0.6457, "step": 71600 }, { "epoch": 1.6533112277793127, "grad_norm": 2.6747777462005615, "learning_rate": 2.8969437617253968e-05, "loss": 0.6497, "step": 71800 }, { "epoch": 1.6579165515335728, "grad_norm": 3.9369540214538574, "learning_rate": 2.896216605343145e-05, "loss": 0.6556, "step": 72000 }, { "epoch": 1.6625218752878328, "grad_norm": 3.03513503074646, "learning_rate": 2.8954894489608936e-05, "loss": 0.652, "step": 72200 }, { "epoch": 1.6671271990420926, "grad_norm": 2.8434908390045166, "learning_rate": 2.894762292578642e-05, "loss": 0.6575, "step": 72400 }, { "epoch": 1.6717325227963524, "grad_norm": 3.372441291809082, "learning_rate": 2.8940351361963905e-05, "loss": 0.6576, "step": 72600 }, { "epoch": 1.6763378465506125, "grad_norm": 2.8065624237060547, "learning_rate": 2.893307979814139e-05, "loss": 0.6379, "step": 72800 }, { "epoch": 1.6809431703048725, "grad_norm": 4.157696723937988, "learning_rate": 2.8925808234318873e-05, "loss": 0.6482, "step": 73000 }, { "epoch": 1.6855484940591323, "grad_norm": 2.8976545333862305, "learning_rate": 2.8918536670496356e-05, "loss": 0.6596, "step": 73200 }, { "epoch": 1.6901538178133921, "grad_norm": 3.67669415473938, "learning_rate": 2.8911265106673842e-05, "loss": 0.6556, "step": 73400 }, { "epoch": 1.6947591415676522, "grad_norm": 3.2437984943389893, "learning_rate": 2.8903993542851324e-05, "loss": 0.6519, "step": 73600 }, { "epoch": 1.6993644653219122, "grad_norm": 2.789106607437134, "learning_rate": 2.889672197902881e-05, "loss": 0.673, "step": 73800 }, { "epoch": 1.703969789076172, "grad_norm": 3.525022268295288, "learning_rate": 2.8889450415206296e-05, "loss": 0.6559, "step": 74000 }, { "epoch": 1.7085751128304318, "grad_norm": 3.419024705886841, "learning_rate": 2.8882178851383782e-05, "loss": 0.6539, "step": 74200 }, { "epoch": 1.7131804365846919, "grad_norm": 3.462752103805542, "learning_rate": 2.8874907287561265e-05, "loss": 0.6531, "step": 74400 }, { "epoch": 1.717785760338952, "grad_norm": 3.5559234619140625, "learning_rate": 2.8867635723738747e-05, "loss": 0.6499, "step": 74600 }, { "epoch": 1.7223910840932117, "grad_norm": 2.969327211380005, "learning_rate": 2.8860364159916233e-05, "loss": 0.6348, "step": 74800 }, { "epoch": 1.7269964078474715, "grad_norm": 2.5927562713623047, "learning_rate": 2.8853128953912828e-05, "loss": 0.6491, "step": 75000 }, { "epoch": 1.7316017316017316, "grad_norm": 3.60489821434021, "learning_rate": 2.8845857390090314e-05, "loss": 0.6627, "step": 75200 }, { "epoch": 1.7362070553559916, "grad_norm": 3.0263278484344482, "learning_rate": 2.8838585826267797e-05, "loss": 0.6649, "step": 75400 }, { "epoch": 1.7408123791102514, "grad_norm": 3.406975507736206, "learning_rate": 2.8831314262445283e-05, "loss": 0.6602, "step": 75600 }, { "epoch": 1.7454177028645113, "grad_norm": 3.032499313354492, "learning_rate": 2.882404269862277e-05, "loss": 0.6548, "step": 75800 }, { "epoch": 1.7500230266187713, "grad_norm": 3.3158748149871826, "learning_rate": 2.881677113480025e-05, "loss": 0.6661, "step": 76000 }, { "epoch": 1.7546283503730313, "grad_norm": 2.534254312515259, "learning_rate": 2.8809499570977733e-05, "loss": 0.645, "step": 76200 }, { "epoch": 1.7592336741272911, "grad_norm": 2.951493978500366, "learning_rate": 2.880222800715522e-05, "loss": 0.6594, "step": 76400 }, { "epoch": 1.763838997881551, "grad_norm": 2.681861162185669, "learning_rate": 2.8794956443332705e-05, "loss": 0.652, "step": 76600 }, { "epoch": 1.768444321635811, "grad_norm": 3.9448912143707275, "learning_rate": 2.8787684879510188e-05, "loss": 0.659, "step": 76800 }, { "epoch": 1.773049645390071, "grad_norm": 3.7418482303619385, "learning_rate": 2.8780413315687674e-05, "loss": 0.655, "step": 77000 }, { "epoch": 1.7776549691443309, "grad_norm": 3.9162464141845703, "learning_rate": 2.877314175186516e-05, "loss": 0.6591, "step": 77200 }, { "epoch": 1.7822602928985907, "grad_norm": 3.8449208736419678, "learning_rate": 2.876587018804264e-05, "loss": 0.6717, "step": 77400 }, { "epoch": 1.7868656166528507, "grad_norm": 2.90519118309021, "learning_rate": 2.8758598624220125e-05, "loss": 0.6559, "step": 77600 }, { "epoch": 1.7914709404071107, "grad_norm": 3.301666259765625, "learning_rate": 2.875132706039761e-05, "loss": 0.6533, "step": 77800 }, { "epoch": 1.7960762641613706, "grad_norm": 3.6207275390625, "learning_rate": 2.8744091854394206e-05, "loss": 0.6582, "step": 78000 }, { "epoch": 1.8006815879156304, "grad_norm": 3.3020622730255127, "learning_rate": 2.8736856648390804e-05, "loss": 0.6606, "step": 78200 }, { "epoch": 1.8052869116698904, "grad_norm": 3.674409866333008, "learning_rate": 2.8729585084568287e-05, "loss": 0.6581, "step": 78400 }, { "epoch": 1.8098922354241505, "grad_norm": 2.18864369392395, "learning_rate": 2.8722313520745773e-05, "loss": 0.6641, "step": 78600 }, { "epoch": 1.8144975591784103, "grad_norm": 3.165595769882202, "learning_rate": 2.871504195692326e-05, "loss": 0.6568, "step": 78800 }, { "epoch": 1.81910288293267, "grad_norm": 2.2597508430480957, "learning_rate": 2.8707770393100738e-05, "loss": 0.651, "step": 79000 }, { "epoch": 1.8237082066869301, "grad_norm": 3.7777042388916016, "learning_rate": 2.8700498829278224e-05, "loss": 0.6773, "step": 79200 }, { "epoch": 1.8283135304411902, "grad_norm": 3.0962727069854736, "learning_rate": 2.869322726545571e-05, "loss": 0.6643, "step": 79400 }, { "epoch": 1.83291885419545, "grad_norm": 3.4903509616851807, "learning_rate": 2.8685955701633196e-05, "loss": 0.6636, "step": 79600 }, { "epoch": 1.8375241779497098, "grad_norm": 3.2742772102355957, "learning_rate": 2.8678684137810678e-05, "loss": 0.6497, "step": 79800 }, { "epoch": 1.8421295017039698, "grad_norm": 3.5722882747650146, "learning_rate": 2.8671412573988164e-05, "loss": 0.6536, "step": 80000 }, { "epoch": 1.8467348254582299, "grad_norm": 4.832566738128662, "learning_rate": 2.8664141010165647e-05, "loss": 0.6584, "step": 80200 }, { "epoch": 1.8513401492124897, "grad_norm": 3.033071756362915, "learning_rate": 2.865686944634313e-05, "loss": 0.6542, "step": 80400 }, { "epoch": 1.8559454729667495, "grad_norm": 3.029519557952881, "learning_rate": 2.8649597882520615e-05, "loss": 0.6627, "step": 80600 }, { "epoch": 1.8605507967210095, "grad_norm": 3.6355273723602295, "learning_rate": 2.86423263186981e-05, "loss": 0.6568, "step": 80800 }, { "epoch": 1.8651561204752696, "grad_norm": 2.547201633453369, "learning_rate": 2.8635054754875584e-05, "loss": 0.6582, "step": 81000 }, { "epoch": 1.8697614442295294, "grad_norm": 3.6909263134002686, "learning_rate": 2.862778319105307e-05, "loss": 0.6528, "step": 81200 }, { "epoch": 1.8743667679837892, "grad_norm": 3.107825994491577, "learning_rate": 2.8620511627230555e-05, "loss": 0.6452, "step": 81400 }, { "epoch": 1.8789720917380492, "grad_norm": 2.624311685562134, "learning_rate": 2.8613240063408034e-05, "loss": 0.6588, "step": 81600 }, { "epoch": 1.883577415492309, "grad_norm": 3.7440786361694336, "learning_rate": 2.8606004857404633e-05, "loss": 0.6489, "step": 81800 }, { "epoch": 1.8881827392465689, "grad_norm": 3.2564656734466553, "learning_rate": 2.859873329358212e-05, "loss": 0.6599, "step": 82000 }, { "epoch": 1.892788063000829, "grad_norm": 3.0685300827026367, "learning_rate": 2.85914617297596e-05, "loss": 0.6504, "step": 82200 }, { "epoch": 1.897393386755089, "grad_norm": 3.587435245513916, "learning_rate": 2.8584190165937087e-05, "loss": 0.6499, "step": 82400 }, { "epoch": 1.9019987105093488, "grad_norm": 2.8492074012756348, "learning_rate": 2.8576918602114573e-05, "loss": 0.6353, "step": 82600 }, { "epoch": 1.9066040342636086, "grad_norm": 3.6821560859680176, "learning_rate": 2.8569647038292056e-05, "loss": 0.6529, "step": 82800 }, { "epoch": 1.9112093580178686, "grad_norm": 4.206520080566406, "learning_rate": 2.856237547446954e-05, "loss": 0.6524, "step": 83000 }, { "epoch": 1.9158146817721287, "grad_norm": 3.277606725692749, "learning_rate": 2.8555103910647024e-05, "loss": 0.6579, "step": 83200 }, { "epoch": 1.9204200055263885, "grad_norm": 3.900179624557495, "learning_rate": 2.8547832346824507e-05, "loss": 0.6627, "step": 83400 }, { "epoch": 1.9250253292806483, "grad_norm": 2.804596185684204, "learning_rate": 2.8540560783001993e-05, "loss": 0.6655, "step": 83600 }, { "epoch": 1.9296306530349083, "grad_norm": 3.212975263595581, "learning_rate": 2.853328921917948e-05, "loss": 0.6614, "step": 83800 }, { "epoch": 1.9342359767891684, "grad_norm": 4.128197193145752, "learning_rate": 2.8526017655356965e-05, "loss": 0.6546, "step": 84000 }, { "epoch": 1.9388413005434282, "grad_norm": 3.0182225704193115, "learning_rate": 2.8518746091534447e-05, "loss": 0.659, "step": 84200 }, { "epoch": 1.943446624297688, "grad_norm": 2.6237552165985107, "learning_rate": 2.851147452771193e-05, "loss": 0.6467, "step": 84400 }, { "epoch": 1.948051948051948, "grad_norm": 2.700956106185913, "learning_rate": 2.8504202963889416e-05, "loss": 0.6597, "step": 84600 }, { "epoch": 1.952657271806208, "grad_norm": 3.318554162979126, "learning_rate": 2.8496931400066898e-05, "loss": 0.649, "step": 84800 }, { "epoch": 1.9572625955604679, "grad_norm": 2.75907301902771, "learning_rate": 2.8489659836244384e-05, "loss": 0.6713, "step": 85000 }, { "epoch": 1.9618679193147277, "grad_norm": 3.922351837158203, "learning_rate": 2.848238827242187e-05, "loss": 0.6495, "step": 85200 }, { "epoch": 1.9664732430689877, "grad_norm": 2.883132219314575, "learning_rate": 2.8475116708599352e-05, "loss": 0.6609, "step": 85400 }, { "epoch": 1.9710785668232478, "grad_norm": 2.974393844604492, "learning_rate": 2.8467845144776835e-05, "loss": 0.6435, "step": 85600 }, { "epoch": 1.9756838905775076, "grad_norm": 3.174490451812744, "learning_rate": 2.846057358095432e-05, "loss": 0.6514, "step": 85800 }, { "epoch": 1.9802892143317674, "grad_norm": 3.1419320106506348, "learning_rate": 2.8453302017131803e-05, "loss": 0.6652, "step": 86000 }, { "epoch": 1.9848945380860274, "grad_norm": 3.508948802947998, "learning_rate": 2.844603045330929e-05, "loss": 0.6562, "step": 86200 }, { "epoch": 1.9894998618402875, "grad_norm": 3.572232484817505, "learning_rate": 2.8438758889486775e-05, "loss": 0.6547, "step": 86400 }, { "epoch": 1.9941051855945473, "grad_norm": 2.923038959503174, "learning_rate": 2.8431487325664258e-05, "loss": 0.6555, "step": 86600 }, { "epoch": 1.9987105093488071, "grad_norm": 2.6537113189697266, "learning_rate": 2.8424252119660856e-05, "loss": 0.6481, "step": 86800 }, { "epoch": 2.0, "eval_loss": 0.6096945405006409, "eval_runtime": 145.8772, "eval_samples_per_second": 194.417, "eval_steps_per_second": 12.154, "step": 86856 }, { "epoch": 2.003315833103067, "grad_norm": 2.6319403648376465, "learning_rate": 2.841698055583834e-05, "loss": 0.6447, "step": 87000 }, { "epoch": 2.007921156857327, "grad_norm": 3.3144571781158447, "learning_rate": 2.840970899201582e-05, "loss": 0.6489, "step": 87200 }, { "epoch": 2.012526480611587, "grad_norm": 3.0572547912597656, "learning_rate": 2.8402437428193307e-05, "loss": 0.6458, "step": 87400 }, { "epoch": 2.017131804365847, "grad_norm": 3.151933431625366, "learning_rate": 2.8395165864370793e-05, "loss": 0.6484, "step": 87600 }, { "epoch": 2.021737128120107, "grad_norm": 3.794235944747925, "learning_rate": 2.8387894300548276e-05, "loss": 0.6493, "step": 87800 }, { "epoch": 2.026342451874367, "grad_norm": 2.9801411628723145, "learning_rate": 2.838062273672576e-05, "loss": 0.6399, "step": 88000 }, { "epoch": 2.0309477756286265, "grad_norm": 3.094648599624634, "learning_rate": 2.8373351172903248e-05, "loss": 0.6488, "step": 88200 }, { "epoch": 2.0355530993828865, "grad_norm": 2.9568779468536377, "learning_rate": 2.8366079609080727e-05, "loss": 0.6476, "step": 88400 }, { "epoch": 2.0401584231371466, "grad_norm": 3.226010799407959, "learning_rate": 2.8358808045258213e-05, "loss": 0.6439, "step": 88600 }, { "epoch": 2.0447637468914066, "grad_norm": 3.10718035697937, "learning_rate": 2.83515364814357e-05, "loss": 0.6421, "step": 88800 }, { "epoch": 2.049369070645666, "grad_norm": 3.8462493419647217, "learning_rate": 2.8344264917613184e-05, "loss": 0.6497, "step": 89000 }, { "epoch": 2.0539743943999262, "grad_norm": 2.297658920288086, "learning_rate": 2.8336993353790667e-05, "loss": 0.6425, "step": 89200 }, { "epoch": 2.0585797181541863, "grad_norm": 3.4975883960723877, "learning_rate": 2.8329721789968153e-05, "loss": 0.6387, "step": 89400 }, { "epoch": 2.0631850419084463, "grad_norm": 2.6451213359832764, "learning_rate": 2.8322450226145635e-05, "loss": 0.6507, "step": 89600 }, { "epoch": 2.067790365662706, "grad_norm": 4.4015679359436035, "learning_rate": 2.8315178662323118e-05, "loss": 0.6549, "step": 89800 }, { "epoch": 2.072395689416966, "grad_norm": 3.2496938705444336, "learning_rate": 2.8307907098500604e-05, "loss": 0.6443, "step": 90000 }, { "epoch": 2.077001013171226, "grad_norm": 3.8921310901641846, "learning_rate": 2.830063553467809e-05, "loss": 0.6563, "step": 90200 }, { "epoch": 2.081606336925486, "grad_norm": 2.6001060009002686, "learning_rate": 2.8293363970855572e-05, "loss": 0.6561, "step": 90400 }, { "epoch": 2.0862116606797456, "grad_norm": 2.925668716430664, "learning_rate": 2.8286092407033058e-05, "loss": 0.6395, "step": 90600 }, { "epoch": 2.0908169844340057, "grad_norm": 2.9956302642822266, "learning_rate": 2.8278857201029657e-05, "loss": 0.6584, "step": 90800 }, { "epoch": 2.0954223081882657, "grad_norm": 3.5268077850341797, "learning_rate": 2.8271585637207136e-05, "loss": 0.648, "step": 91000 }, { "epoch": 2.1000276319425257, "grad_norm": 4.238087177276611, "learning_rate": 2.8264314073384622e-05, "loss": 0.6387, "step": 91200 }, { "epoch": 2.1046329556967853, "grad_norm": 2.673576593399048, "learning_rate": 2.8257042509562108e-05, "loss": 0.6379, "step": 91400 }, { "epoch": 2.1092382794510454, "grad_norm": 2.9278106689453125, "learning_rate": 2.824977094573959e-05, "loss": 0.6527, "step": 91600 }, { "epoch": 2.1138436032053054, "grad_norm": 3.109639883041382, "learning_rate": 2.824253573973619e-05, "loss": 0.6437, "step": 91800 }, { "epoch": 2.1184489269595654, "grad_norm": 3.2861876487731934, "learning_rate": 2.8235264175913675e-05, "loss": 0.6269, "step": 92000 }, { "epoch": 2.123054250713825, "grad_norm": 3.4922659397125244, "learning_rate": 2.8227992612091157e-05, "loss": 0.647, "step": 92200 }, { "epoch": 2.127659574468085, "grad_norm": 3.1858551502227783, "learning_rate": 2.8220721048268643e-05, "loss": 0.6438, "step": 92400 }, { "epoch": 2.132264898222345, "grad_norm": 3.2486298084259033, "learning_rate": 2.8213449484446126e-05, "loss": 0.6403, "step": 92600 }, { "epoch": 2.136870221976605, "grad_norm": 2.3722951412200928, "learning_rate": 2.8206177920623608e-05, "loss": 0.6346, "step": 92800 }, { "epoch": 2.1414755457308647, "grad_norm": 3.3336052894592285, "learning_rate": 2.8198906356801094e-05, "loss": 0.6514, "step": 93000 }, { "epoch": 2.1460808694851248, "grad_norm": 2.755908727645874, "learning_rate": 2.819163479297858e-05, "loss": 0.6402, "step": 93200 }, { "epoch": 2.150686193239385, "grad_norm": 3.960623264312744, "learning_rate": 2.8184363229156063e-05, "loss": 0.6398, "step": 93400 }, { "epoch": 2.155291516993645, "grad_norm": 3.4139673709869385, "learning_rate": 2.817709166533355e-05, "loss": 0.6282, "step": 93600 }, { "epoch": 2.1598968407479044, "grad_norm": 3.090846300125122, "learning_rate": 2.816982010151103e-05, "loss": 0.6634, "step": 93800 }, { "epoch": 2.1645021645021645, "grad_norm": 2.835188627243042, "learning_rate": 2.8162584895507626e-05, "loss": 0.6342, "step": 94000 }, { "epoch": 2.1691074882564245, "grad_norm": 2.9350528717041016, "learning_rate": 2.8155313331685112e-05, "loss": 0.6412, "step": 94200 }, { "epoch": 2.1737128120106846, "grad_norm": 3.2040562629699707, "learning_rate": 2.8148041767862598e-05, "loss": 0.6519, "step": 94400 }, { "epoch": 2.178318135764944, "grad_norm": 2.805971384048462, "learning_rate": 2.814077020404008e-05, "loss": 0.6438, "step": 94600 }, { "epoch": 2.182923459519204, "grad_norm": 2.857898712158203, "learning_rate": 2.8133498640217567e-05, "loss": 0.6487, "step": 94800 }, { "epoch": 2.1875287832734642, "grad_norm": 2.6172449588775635, "learning_rate": 2.8126227076395052e-05, "loss": 0.6422, "step": 95000 }, { "epoch": 2.192134107027724, "grad_norm": 4.178059101104736, "learning_rate": 2.8118955512572535e-05, "loss": 0.6477, "step": 95200 }, { "epoch": 2.196739430781984, "grad_norm": 3.011091947555542, "learning_rate": 2.8111683948750018e-05, "loss": 0.6463, "step": 95400 }, { "epoch": 2.201344754536244, "grad_norm": 3.2300426959991455, "learning_rate": 2.8104412384927503e-05, "loss": 0.6409, "step": 95600 }, { "epoch": 2.205950078290504, "grad_norm": 3.7736730575561523, "learning_rate": 2.8097140821104986e-05, "loss": 0.6504, "step": 95800 }, { "epoch": 2.210555402044764, "grad_norm": 3.110412120819092, "learning_rate": 2.8089869257282472e-05, "loss": 0.6378, "step": 96000 }, { "epoch": 2.2151607257990236, "grad_norm": 4.121431350708008, "learning_rate": 2.8082597693459958e-05, "loss": 0.653, "step": 96200 }, { "epoch": 2.2197660495532836, "grad_norm": 3.8190181255340576, "learning_rate": 2.807532612963744e-05, "loss": 0.6441, "step": 96400 }, { "epoch": 2.2243713733075436, "grad_norm": 2.9528844356536865, "learning_rate": 2.8068054565814923e-05, "loss": 0.6276, "step": 96600 }, { "epoch": 2.2289766970618032, "grad_norm": 3.1562676429748535, "learning_rate": 2.806078300199241e-05, "loss": 0.6367, "step": 96800 }, { "epoch": 2.2335820208160633, "grad_norm": 3.0100133419036865, "learning_rate": 2.8053511438169895e-05, "loss": 0.6336, "step": 97000 }, { "epoch": 2.2381873445703233, "grad_norm": 2.758850336074829, "learning_rate": 2.8046239874347377e-05, "loss": 0.6509, "step": 97200 }, { "epoch": 2.2427926683245833, "grad_norm": 3.6952168941497803, "learning_rate": 2.8038968310524863e-05, "loss": 0.6601, "step": 97400 }, { "epoch": 2.247397992078843, "grad_norm": 3.7031092643737793, "learning_rate": 2.803173310452146e-05, "loss": 0.6531, "step": 97600 }, { "epoch": 2.252003315833103, "grad_norm": 3.341907024383545, "learning_rate": 2.8024461540698944e-05, "loss": 0.6614, "step": 97800 }, { "epoch": 2.256608639587363, "grad_norm": 3.172600746154785, "learning_rate": 2.8017189976876427e-05, "loss": 0.6426, "step": 98000 }, { "epoch": 2.261213963341623, "grad_norm": 3.3580822944641113, "learning_rate": 2.800991841305391e-05, "loss": 0.6453, "step": 98200 }, { "epoch": 2.2658192870958827, "grad_norm": 4.135528087615967, "learning_rate": 2.8002646849231395e-05, "loss": 0.6386, "step": 98400 }, { "epoch": 2.2704246108501427, "grad_norm": 3.5923011302948, "learning_rate": 2.799537528540888e-05, "loss": 0.6651, "step": 98600 }, { "epoch": 2.2750299346044027, "grad_norm": 3.149178981781006, "learning_rate": 2.7988103721586367e-05, "loss": 0.6435, "step": 98800 }, { "epoch": 2.2796352583586628, "grad_norm": 3.2348053455352783, "learning_rate": 2.798083215776385e-05, "loss": 0.6424, "step": 99000 }, { "epoch": 2.2842405821129224, "grad_norm": 3.580576181411743, "learning_rate": 2.7973560593941335e-05, "loss": 0.6572, "step": 99200 }, { "epoch": 2.2888459058671824, "grad_norm": 3.2298011779785156, "learning_rate": 2.7966289030118818e-05, "loss": 0.666, "step": 99400 }, { "epoch": 2.2934512296214424, "grad_norm": 3.1280014514923096, "learning_rate": 2.79590174662963e-05, "loss": 0.6499, "step": 99600 }, { "epoch": 2.2980565533757025, "grad_norm": 3.244581937789917, "learning_rate": 2.79517822602929e-05, "loss": 0.6428, "step": 99800 }, { "epoch": 2.302661877129962, "grad_norm": 3.160811424255371, "learning_rate": 2.7944510696470385e-05, "loss": 0.639, "step": 100000 }, { "epoch": 2.307267200884222, "grad_norm": 3.2800822257995605, "learning_rate": 2.793727549046698e-05, "loss": 0.6504, "step": 100200 }, { "epoch": 2.311872524638482, "grad_norm": 2.948641300201416, "learning_rate": 2.7930003926644466e-05, "loss": 0.6455, "step": 100400 }, { "epoch": 2.316477848392742, "grad_norm": 3.270315408706665, "learning_rate": 2.792273236282195e-05, "loss": 0.6498, "step": 100600 }, { "epoch": 2.3210831721470018, "grad_norm": 3.389112710952759, "learning_rate": 2.7915460798999435e-05, "loss": 0.6392, "step": 100800 }, { "epoch": 2.325688495901262, "grad_norm": 3.453878164291382, "learning_rate": 2.7908189235176917e-05, "loss": 0.6538, "step": 101000 }, { "epoch": 2.330293819655522, "grad_norm": 2.7830543518066406, "learning_rate": 2.79009176713544e-05, "loss": 0.6402, "step": 101200 }, { "epoch": 2.334899143409782, "grad_norm": 3.6506121158599854, "learning_rate": 2.7893646107531886e-05, "loss": 0.6448, "step": 101400 }, { "epoch": 2.3395044671640415, "grad_norm": 2.926161050796509, "learning_rate": 2.788637454370937e-05, "loss": 0.6506, "step": 101600 }, { "epoch": 2.3441097909183015, "grad_norm": 2.95794677734375, "learning_rate": 2.7879102979886857e-05, "loss": 0.6472, "step": 101800 }, { "epoch": 2.3487151146725616, "grad_norm": 3.9702541828155518, "learning_rate": 2.787183141606434e-05, "loss": 0.645, "step": 102000 }, { "epoch": 2.3533204384268216, "grad_norm": 2.927553176879883, "learning_rate": 2.7864559852241826e-05, "loss": 0.6354, "step": 102200 }, { "epoch": 2.357925762181081, "grad_norm": 3.047414779663086, "learning_rate": 2.785728828841931e-05, "loss": 0.6475, "step": 102400 }, { "epoch": 2.3625310859353412, "grad_norm": 2.788905382156372, "learning_rate": 2.785001672459679e-05, "loss": 0.6542, "step": 102600 }, { "epoch": 2.3671364096896013, "grad_norm": 2.729799509048462, "learning_rate": 2.7842745160774277e-05, "loss": 0.6384, "step": 102800 }, { "epoch": 2.3717417334438613, "grad_norm": 3.3632562160491943, "learning_rate": 2.7835473596951763e-05, "loss": 0.6314, "step": 103000 }, { "epoch": 2.376347057198121, "grad_norm": 3.1274969577789307, "learning_rate": 2.7828202033129245e-05, "loss": 0.6456, "step": 103200 }, { "epoch": 2.380952380952381, "grad_norm": 3.8563055992126465, "learning_rate": 2.782093046930673e-05, "loss": 0.6393, "step": 103400 }, { "epoch": 2.385557704706641, "grad_norm": 3.3758862018585205, "learning_rate": 2.7813658905484214e-05, "loss": 0.6386, "step": 103600 }, { "epoch": 2.3901630284609006, "grad_norm": 3.2293407917022705, "learning_rate": 2.7806387341661696e-05, "loss": 0.6505, "step": 103800 }, { "epoch": 2.3947683522151606, "grad_norm": 3.1914443969726562, "learning_rate": 2.7799115777839182e-05, "loss": 0.643, "step": 104000 }, { "epoch": 2.3993736759694206, "grad_norm": 2.7224652767181396, "learning_rate": 2.7791844214016668e-05, "loss": 0.6374, "step": 104200 }, { "epoch": 2.4039789997236807, "grad_norm": 3.057840347290039, "learning_rate": 2.778457265019415e-05, "loss": 0.6343, "step": 104400 }, { "epoch": 2.4085843234779407, "grad_norm": 2.8306033611297607, "learning_rate": 2.7777301086371636e-05, "loss": 0.645, "step": 104600 }, { "epoch": 2.4131896472322003, "grad_norm": 3.3138833045959473, "learning_rate": 2.777002952254912e-05, "loss": 0.6502, "step": 104800 }, { "epoch": 2.4177949709864603, "grad_norm": 2.700263738632202, "learning_rate": 2.7762757958726605e-05, "loss": 0.6443, "step": 105000 }, { "epoch": 2.4224002947407204, "grad_norm": 4.0957932472229, "learning_rate": 2.7755486394904087e-05, "loss": 0.6615, "step": 105200 }, { "epoch": 2.42700561849498, "grad_norm": 2.5739316940307617, "learning_rate": 2.7748214831081573e-05, "loss": 0.6456, "step": 105400 }, { "epoch": 2.43161094224924, "grad_norm": 3.833193302154541, "learning_rate": 2.774094326725906e-05, "loss": 0.664, "step": 105600 }, { "epoch": 2.4362162660035, "grad_norm": 2.990009307861328, "learning_rate": 2.7733671703436542e-05, "loss": 0.6308, "step": 105800 }, { "epoch": 2.44082158975776, "grad_norm": 3.57989501953125, "learning_rate": 2.7726400139614028e-05, "loss": 0.6434, "step": 106000 }, { "epoch": 2.44542691351202, "grad_norm": 2.9661567211151123, "learning_rate": 2.771912857579151e-05, "loss": 0.6467, "step": 106200 }, { "epoch": 2.4500322372662797, "grad_norm": 2.5463151931762695, "learning_rate": 2.7711857011968993e-05, "loss": 0.6579, "step": 106400 }, { "epoch": 2.4546375610205398, "grad_norm": 3.589879274368286, "learning_rate": 2.770462180596559e-05, "loss": 0.6557, "step": 106600 }, { "epoch": 2.4592428847748, "grad_norm": 2.7275781631469727, "learning_rate": 2.7697386599962187e-05, "loss": 0.6576, "step": 106800 }, { "epoch": 2.4638482085290594, "grad_norm": 3.2831528186798096, "learning_rate": 2.7690115036139672e-05, "loss": 0.645, "step": 107000 }, { "epoch": 2.4684535322833194, "grad_norm": 2.328716993331909, "learning_rate": 2.768284347231716e-05, "loss": 0.6388, "step": 107200 }, { "epoch": 2.4730588560375795, "grad_norm": 3.319284439086914, "learning_rate": 2.767557190849464e-05, "loss": 0.6465, "step": 107400 }, { "epoch": 2.4776641797918395, "grad_norm": 3.1933634281158447, "learning_rate": 2.766833670249124e-05, "loss": 0.6414, "step": 107600 }, { "epoch": 2.482269503546099, "grad_norm": 3.2471187114715576, "learning_rate": 2.7661065138668725e-05, "loss": 0.6406, "step": 107800 }, { "epoch": 2.486874827300359, "grad_norm": 3.0858373641967773, "learning_rate": 2.7653793574846205e-05, "loss": 0.6537, "step": 108000 }, { "epoch": 2.491480151054619, "grad_norm": 2.9696667194366455, "learning_rate": 2.7646558368842803e-05, "loss": 0.6347, "step": 108200 }, { "epoch": 2.496085474808879, "grad_norm": 3.244499683380127, "learning_rate": 2.7639286805020286e-05, "loss": 0.6431, "step": 108400 }, { "epoch": 2.500690798563139, "grad_norm": 3.8631808757781982, "learning_rate": 2.763201524119777e-05, "loss": 0.6439, "step": 108600 }, { "epoch": 2.505296122317399, "grad_norm": 3.0365819931030273, "learning_rate": 2.7624743677375258e-05, "loss": 0.6466, "step": 108800 }, { "epoch": 2.509901446071659, "grad_norm": 3.1504974365234375, "learning_rate": 2.7617472113552743e-05, "loss": 0.6623, "step": 109000 }, { "epoch": 2.514506769825919, "grad_norm": 2.797166109085083, "learning_rate": 2.7610200549730226e-05, "loss": 0.6378, "step": 109200 }, { "epoch": 2.519112093580179, "grad_norm": 6.695431232452393, "learning_rate": 2.7602928985907712e-05, "loss": 0.6388, "step": 109400 }, { "epoch": 2.5237174173344386, "grad_norm": 2.692798614501953, "learning_rate": 2.7595657422085194e-05, "loss": 0.6556, "step": 109600 }, { "epoch": 2.5283227410886986, "grad_norm": 4.388221740722656, "learning_rate": 2.7588385858262677e-05, "loss": 0.6547, "step": 109800 }, { "epoch": 2.5329280648429586, "grad_norm": 6.0723676681518555, "learning_rate": 2.7581114294440163e-05, "loss": 0.642, "step": 110000 }, { "epoch": 2.537533388597218, "grad_norm": 3.7598865032196045, "learning_rate": 2.757384273061765e-05, "loss": 0.637, "step": 110200 }, { "epoch": 2.5421387123514783, "grad_norm": 4.075303077697754, "learning_rate": 2.756657116679513e-05, "loss": 0.6458, "step": 110400 }, { "epoch": 2.5467440361057383, "grad_norm": 2.6612720489501953, "learning_rate": 2.7559299602972617e-05, "loss": 0.6478, "step": 110600 }, { "epoch": 2.551349359859998, "grad_norm": 3.044689416885376, "learning_rate": 2.75520280391501e-05, "loss": 0.6586, "step": 110800 }, { "epoch": 2.555954683614258, "grad_norm": 3.982969284057617, "learning_rate": 2.7544756475327582e-05, "loss": 0.6457, "step": 111000 }, { "epoch": 2.560560007368518, "grad_norm": 3.1461875438690186, "learning_rate": 2.7537484911505068e-05, "loss": 0.649, "step": 111200 }, { "epoch": 2.565165331122778, "grad_norm": 3.544240951538086, "learning_rate": 2.7530213347682554e-05, "loss": 0.6479, "step": 111400 }, { "epoch": 2.569770654877038, "grad_norm": 3.3684897422790527, "learning_rate": 2.752294178386004e-05, "loss": 0.6437, "step": 111600 }, { "epoch": 2.5743759786312976, "grad_norm": 3.4466545581817627, "learning_rate": 2.7515670220037523e-05, "loss": 0.6518, "step": 111800 }, { "epoch": 2.5789813023855577, "grad_norm": 3.449298858642578, "learning_rate": 2.7508398656215005e-05, "loss": 0.6504, "step": 112000 }, { "epoch": 2.5835866261398177, "grad_norm": 2.8273544311523438, "learning_rate": 2.750112709239249e-05, "loss": 0.6395, "step": 112200 }, { "epoch": 2.5881919498940773, "grad_norm": 2.8043863773345947, "learning_rate": 2.7493855528569973e-05, "loss": 0.6562, "step": 112400 }, { "epoch": 2.5927972736483373, "grad_norm": 3.3060178756713867, "learning_rate": 2.748658396474746e-05, "loss": 0.637, "step": 112600 }, { "epoch": 2.5974025974025974, "grad_norm": 3.410897970199585, "learning_rate": 2.7479312400924945e-05, "loss": 0.6376, "step": 112800 }, { "epoch": 2.6020079211568574, "grad_norm": 3.279798984527588, "learning_rate": 2.7472040837102428e-05, "loss": 0.6449, "step": 113000 }, { "epoch": 2.6066132449111175, "grad_norm": 3.1562256813049316, "learning_rate": 2.7464769273279914e-05, "loss": 0.6447, "step": 113200 }, { "epoch": 2.611218568665377, "grad_norm": 2.973257303237915, "learning_rate": 2.7457534067276512e-05, "loss": 0.6406, "step": 113400 }, { "epoch": 2.615823892419637, "grad_norm": 3.439020872116089, "learning_rate": 2.745026250345399e-05, "loss": 0.6391, "step": 113600 }, { "epoch": 2.620429216173897, "grad_norm": 2.99761700630188, "learning_rate": 2.7442990939631477e-05, "loss": 0.6486, "step": 113800 }, { "epoch": 2.6250345399281567, "grad_norm": 3.7341833114624023, "learning_rate": 2.7435719375808963e-05, "loss": 0.6371, "step": 114000 }, { "epoch": 2.6296398636824168, "grad_norm": 3.088653326034546, "learning_rate": 2.7428447811986446e-05, "loss": 0.6513, "step": 114200 }, { "epoch": 2.634245187436677, "grad_norm": 3.4004712104797363, "learning_rate": 2.7421176248163932e-05, "loss": 0.6321, "step": 114400 }, { "epoch": 2.638850511190937, "grad_norm": 2.652799129486084, "learning_rate": 2.7413904684341418e-05, "loss": 0.6364, "step": 114600 }, { "epoch": 2.643455834945197, "grad_norm": 3.6918208599090576, "learning_rate": 2.7406633120518897e-05, "loss": 0.6341, "step": 114800 }, { "epoch": 2.6480611586994565, "grad_norm": 3.6925747394561768, "learning_rate": 2.7399361556696383e-05, "loss": 0.6422, "step": 115000 }, { "epoch": 2.6526664824537165, "grad_norm": 3.1433355808258057, "learning_rate": 2.739208999287387e-05, "loss": 0.6372, "step": 115200 }, { "epoch": 2.6572718062079765, "grad_norm": 2.9344465732574463, "learning_rate": 2.738481842905135e-05, "loss": 0.6478, "step": 115400 }, { "epoch": 2.661877129962236, "grad_norm": 3.018880605697632, "learning_rate": 2.7377546865228837e-05, "loss": 0.6395, "step": 115600 }, { "epoch": 2.666482453716496, "grad_norm": 2.8171603679656982, "learning_rate": 2.7370275301406323e-05, "loss": 0.6492, "step": 115800 }, { "epoch": 2.671087777470756, "grad_norm": 2.8728506565093994, "learning_rate": 2.7363003737583802e-05, "loss": 0.6405, "step": 116000 }, { "epoch": 2.6756931012250162, "grad_norm": 3.09126877784729, "learning_rate": 2.7355732173761288e-05, "loss": 0.6398, "step": 116200 }, { "epoch": 2.6802984249792763, "grad_norm": 3.089311122894287, "learning_rate": 2.7348460609938774e-05, "loss": 0.6494, "step": 116400 }, { "epoch": 2.684903748733536, "grad_norm": 2.3543293476104736, "learning_rate": 2.734118904611626e-05, "loss": 0.6376, "step": 116600 }, { "epoch": 2.689509072487796, "grad_norm": 2.459444046020508, "learning_rate": 2.7333917482293742e-05, "loss": 0.6337, "step": 116800 }, { "epoch": 2.694114396242056, "grad_norm": 3.3463871479034424, "learning_rate": 2.7326645918471228e-05, "loss": 0.6359, "step": 117000 }, { "epoch": 2.6987197199963155, "grad_norm": 2.9327232837677, "learning_rate": 2.7319374354648714e-05, "loss": 0.6264, "step": 117200 }, { "epoch": 2.7033250437505756, "grad_norm": 3.052452802658081, "learning_rate": 2.7312102790826193e-05, "loss": 0.6466, "step": 117400 }, { "epoch": 2.7079303675048356, "grad_norm": 3.294747829437256, "learning_rate": 2.730483122700368e-05, "loss": 0.6525, "step": 117600 }, { "epoch": 2.7125356912590957, "grad_norm": 3.948997735977173, "learning_rate": 2.7297596021000278e-05, "loss": 0.6457, "step": 117800 }, { "epoch": 2.7171410150133557, "grad_norm": 2.7049851417541504, "learning_rate": 2.729032445717776e-05, "loss": 0.6435, "step": 118000 }, { "epoch": 2.7217463387676153, "grad_norm": 3.9702322483062744, "learning_rate": 2.7283052893355246e-05, "loss": 0.6516, "step": 118200 }, { "epoch": 2.7263516625218753, "grad_norm": 2.606865882873535, "learning_rate": 2.7275781329532732e-05, "loss": 0.6551, "step": 118400 }, { "epoch": 2.7309569862761354, "grad_norm": 3.387070894241333, "learning_rate": 2.7268509765710215e-05, "loss": 0.6403, "step": 118600 }, { "epoch": 2.735562310030395, "grad_norm": 3.625943899154663, "learning_rate": 2.7261238201887697e-05, "loss": 0.6269, "step": 118800 }, { "epoch": 2.740167633784655, "grad_norm": 2.9141345024108887, "learning_rate": 2.7254002995884292e-05, "loss": 0.6501, "step": 119000 }, { "epoch": 2.744772957538915, "grad_norm": 3.324512004852295, "learning_rate": 2.724673143206178e-05, "loss": 0.6393, "step": 119200 }, { "epoch": 2.749378281293175, "grad_norm": 3.5145280361175537, "learning_rate": 2.7239459868239264e-05, "loss": 0.6319, "step": 119400 }, { "epoch": 2.7539836050474347, "grad_norm": 2.751305103302002, "learning_rate": 2.723218830441675e-05, "loss": 0.6362, "step": 119600 }, { "epoch": 2.7585889288016947, "grad_norm": 3.314880132675171, "learning_rate": 2.7224916740594233e-05, "loss": 0.635, "step": 119800 }, { "epoch": 2.7631942525559547, "grad_norm": 3.0874931812286377, "learning_rate": 2.721764517677172e-05, "loss": 0.6568, "step": 120000 }, { "epoch": 2.767799576310215, "grad_norm": 3.5240249633789062, "learning_rate": 2.7210373612949205e-05, "loss": 0.6494, "step": 120200 }, { "epoch": 2.7724049000644744, "grad_norm": 2.875325918197632, "learning_rate": 2.7203102049126684e-05, "loss": 0.6355, "step": 120400 }, { "epoch": 2.7770102238187344, "grad_norm": 3.846749782562256, "learning_rate": 2.719583048530417e-05, "loss": 0.6338, "step": 120600 }, { "epoch": 2.7816155475729945, "grad_norm": 3.331397533416748, "learning_rate": 2.7188558921481656e-05, "loss": 0.6511, "step": 120800 }, { "epoch": 2.786220871327254, "grad_norm": 3.6676218509674072, "learning_rate": 2.7181287357659138e-05, "loss": 0.6326, "step": 121000 }, { "epoch": 2.790826195081514, "grad_norm": 3.7938387393951416, "learning_rate": 2.7174015793836624e-05, "loss": 0.6354, "step": 121200 }, { "epoch": 2.795431518835774, "grad_norm": 3.1966028213500977, "learning_rate": 2.716674423001411e-05, "loss": 0.6457, "step": 121400 }, { "epoch": 2.800036842590034, "grad_norm": 2.660951852798462, "learning_rate": 2.715947266619159e-05, "loss": 0.6536, "step": 121600 }, { "epoch": 2.804642166344294, "grad_norm": 3.02982234954834, "learning_rate": 2.7152201102369075e-05, "loss": 0.6507, "step": 121800 }, { "epoch": 2.809247490098554, "grad_norm": 3.0377113819122314, "learning_rate": 2.714492953854656e-05, "loss": 0.645, "step": 122000 }, { "epoch": 2.813852813852814, "grad_norm": 3.3548905849456787, "learning_rate": 2.7137657974724043e-05, "loss": 0.636, "step": 122200 }, { "epoch": 2.818458137607074, "grad_norm": 4.323446750640869, "learning_rate": 2.713038641090153e-05, "loss": 0.6529, "step": 122400 }, { "epoch": 2.8230634613613335, "grad_norm": 3.245694637298584, "learning_rate": 2.7123114847079015e-05, "loss": 0.6364, "step": 122600 }, { "epoch": 2.8276687851155935, "grad_norm": 2.8766958713531494, "learning_rate": 2.7115843283256498e-05, "loss": 0.6388, "step": 122800 }, { "epoch": 2.8322741088698535, "grad_norm": 2.433682441711426, "learning_rate": 2.7108608077253093e-05, "loss": 0.6383, "step": 123000 }, { "epoch": 2.8368794326241136, "grad_norm": 3.1630749702453613, "learning_rate": 2.710137287124969e-05, "loss": 0.6312, "step": 123200 }, { "epoch": 2.8414847563783736, "grad_norm": 3.1013870239257812, "learning_rate": 2.7094101307427174e-05, "loss": 0.6377, "step": 123400 }, { "epoch": 2.846090080132633, "grad_norm": 3.5772645473480225, "learning_rate": 2.708682974360466e-05, "loss": 0.6404, "step": 123600 }, { "epoch": 2.8506954038868932, "grad_norm": 2.970681667327881, "learning_rate": 2.7079558179782146e-05, "loss": 0.6472, "step": 123800 }, { "epoch": 2.8553007276411533, "grad_norm": 2.7734873294830322, "learning_rate": 2.707228661595963e-05, "loss": 0.6329, "step": 124000 }, { "epoch": 2.859906051395413, "grad_norm": 3.264719247817993, "learning_rate": 2.7065015052137114e-05, "loss": 0.6518, "step": 124200 }, { "epoch": 2.864511375149673, "grad_norm": 3.1306731700897217, "learning_rate": 2.70577434883146e-05, "loss": 0.652, "step": 124400 }, { "epoch": 2.869116698903933, "grad_norm": 3.149149179458618, "learning_rate": 2.705047192449208e-05, "loss": 0.6429, "step": 124600 }, { "epoch": 2.873722022658193, "grad_norm": 3.6126046180725098, "learning_rate": 2.7043200360669565e-05, "loss": 0.6355, "step": 124800 }, { "epoch": 2.878327346412453, "grad_norm": 2.957371234893799, "learning_rate": 2.703592879684705e-05, "loss": 0.6425, "step": 125000 }, { "epoch": 2.8829326701667126, "grad_norm": 2.859186887741089, "learning_rate": 2.7028693590843646e-05, "loss": 0.6445, "step": 125200 }, { "epoch": 2.8875379939209727, "grad_norm": 3.0626325607299805, "learning_rate": 2.7021422027021132e-05, "loss": 0.6572, "step": 125400 }, { "epoch": 2.8921433176752327, "grad_norm": 2.7346231937408447, "learning_rate": 2.7014150463198618e-05, "loss": 0.6421, "step": 125600 }, { "epoch": 2.8967486414294923, "grad_norm": 2.984370470046997, "learning_rate": 2.70068788993761e-05, "loss": 0.6436, "step": 125800 }, { "epoch": 2.9013539651837523, "grad_norm": 3.117213726043701, "learning_rate": 2.6999607335553583e-05, "loss": 0.6493, "step": 126000 }, { "epoch": 2.9059592889380124, "grad_norm": 3.779122829437256, "learning_rate": 2.699233577173107e-05, "loss": 0.6394, "step": 126200 }, { "epoch": 2.9105646126922724, "grad_norm": 3.12115740776062, "learning_rate": 2.6985064207908552e-05, "loss": 0.6344, "step": 126400 }, { "epoch": 2.9151699364465324, "grad_norm": 3.0919904708862305, "learning_rate": 2.6977792644086038e-05, "loss": 0.6405, "step": 126600 }, { "epoch": 2.919775260200792, "grad_norm": 2.8004214763641357, "learning_rate": 2.6970521080263524e-05, "loss": 0.6489, "step": 126800 }, { "epoch": 2.924380583955052, "grad_norm": 2.920830726623535, "learning_rate": 2.6963249516441006e-05, "loss": 0.6394, "step": 127000 }, { "epoch": 2.928985907709312, "grad_norm": 3.278106451034546, "learning_rate": 2.6955977952618492e-05, "loss": 0.6434, "step": 127200 }, { "epoch": 2.9335912314635717, "grad_norm": 3.6461238861083984, "learning_rate": 2.6948706388795975e-05, "loss": 0.646, "step": 127400 }, { "epoch": 2.9381965552178317, "grad_norm": 3.31779146194458, "learning_rate": 2.694143482497346e-05, "loss": 0.6361, "step": 127600 }, { "epoch": 2.942801878972092, "grad_norm": 2.7438087463378906, "learning_rate": 2.6934199618970056e-05, "loss": 0.6437, "step": 127800 }, { "epoch": 2.947407202726352, "grad_norm": 2.8473029136657715, "learning_rate": 2.692692805514754e-05, "loss": 0.6338, "step": 128000 }, { "epoch": 2.952012526480612, "grad_norm": 2.5815622806549072, "learning_rate": 2.6919656491325024e-05, "loss": 0.6371, "step": 128200 }, { "epoch": 2.9566178502348714, "grad_norm": 3.202380657196045, "learning_rate": 2.691238492750251e-05, "loss": 0.6469, "step": 128400 }, { "epoch": 2.9612231739891315, "grad_norm": 3.2944741249084473, "learning_rate": 2.6905113363679996e-05, "loss": 0.6515, "step": 128600 }, { "epoch": 2.9658284977433915, "grad_norm": 2.853166103363037, "learning_rate": 2.6897841799857475e-05, "loss": 0.6392, "step": 128800 }, { "epoch": 2.970433821497651, "grad_norm": 3.1663033962249756, "learning_rate": 2.689057023603496e-05, "loss": 0.6454, "step": 129000 }, { "epoch": 2.975039145251911, "grad_norm": 2.5806081295013428, "learning_rate": 2.6883298672212447e-05, "loss": 0.6411, "step": 129200 }, { "epoch": 2.979644469006171, "grad_norm": 2.8585219383239746, "learning_rate": 2.6876027108389933e-05, "loss": 0.6475, "step": 129400 }, { "epoch": 2.984249792760431, "grad_norm": 2.955064058303833, "learning_rate": 2.6868755544567415e-05, "loss": 0.6548, "step": 129600 }, { "epoch": 2.988855116514691, "grad_norm": 4.072856426239014, "learning_rate": 2.68614839807449e-05, "loss": 0.6538, "step": 129800 }, { "epoch": 2.993460440268951, "grad_norm": 3.6334667205810547, "learning_rate": 2.6854212416922384e-05, "loss": 0.6406, "step": 130000 }, { "epoch": 2.998065764023211, "grad_norm": 2.8243021965026855, "learning_rate": 2.6846940853099866e-05, "loss": 0.6339, "step": 130200 }, { "epoch": 3.0, "eval_loss": 0.5988962650299072, "eval_runtime": 145.7025, "eval_samples_per_second": 194.65, "eval_steps_per_second": 12.169, "step": 130284 }, { "epoch": 3.002671087777471, "grad_norm": 2.527531385421753, "learning_rate": 2.6839669289277352e-05, "loss": 0.6279, "step": 130400 }, { "epoch": 3.0072764115317305, "grad_norm": 3.933335304260254, "learning_rate": 2.6832397725454838e-05, "loss": 0.6282, "step": 130600 }, { "epoch": 3.0118817352859906, "grad_norm": 3.1796278953552246, "learning_rate": 2.682512616163232e-05, "loss": 0.6296, "step": 130800 }, { "epoch": 3.0164870590402506, "grad_norm": 2.6977293491363525, "learning_rate": 2.681789095562892e-05, "loss": 0.6249, "step": 131000 }, { "epoch": 3.0210923827945106, "grad_norm": 3.7220981121063232, "learning_rate": 2.6810619391806405e-05, "loss": 0.6298, "step": 131200 }, { "epoch": 3.0256977065487702, "grad_norm": 3.126789093017578, "learning_rate": 2.6803347827983888e-05, "loss": 0.6473, "step": 131400 }, { "epoch": 3.0303030303030303, "grad_norm": 2.9881398677825928, "learning_rate": 2.679607626416137e-05, "loss": 0.6445, "step": 131600 }, { "epoch": 3.0349083540572903, "grad_norm": 4.632811069488525, "learning_rate": 2.6788804700338856e-05, "loss": 0.6415, "step": 131800 }, { "epoch": 3.0395136778115504, "grad_norm": 3.6200356483459473, "learning_rate": 2.678153313651634e-05, "loss": 0.6242, "step": 132000 }, { "epoch": 3.04411900156581, "grad_norm": 3.1809945106506348, "learning_rate": 2.6774261572693825e-05, "loss": 0.638, "step": 132200 }, { "epoch": 3.04872432532007, "grad_norm": 4.0169806480407715, "learning_rate": 2.676699000887131e-05, "loss": 0.6335, "step": 132400 }, { "epoch": 3.05332964907433, "grad_norm": 3.9464833736419678, "learning_rate": 2.6759718445048793e-05, "loss": 0.6376, "step": 132600 }, { "epoch": 3.05793497282859, "grad_norm": 2.9559237957000732, "learning_rate": 2.6752446881226276e-05, "loss": 0.6343, "step": 132800 }, { "epoch": 3.0625402965828497, "grad_norm": 3.0488362312316895, "learning_rate": 2.674517531740376e-05, "loss": 0.6426, "step": 133000 }, { "epoch": 3.0671456203371097, "grad_norm": 3.531463861465454, "learning_rate": 2.6737903753581244e-05, "loss": 0.6438, "step": 133200 }, { "epoch": 3.0717509440913697, "grad_norm": 3.7633442878723145, "learning_rate": 2.673063218975873e-05, "loss": 0.6383, "step": 133400 }, { "epoch": 3.0763562678456298, "grad_norm": 2.790278196334839, "learning_rate": 2.6723360625936216e-05, "loss": 0.6376, "step": 133600 }, { "epoch": 3.0809615915998894, "grad_norm": 3.4008514881134033, "learning_rate": 2.67160890621137e-05, "loss": 0.6293, "step": 133800 }, { "epoch": 3.0855669153541494, "grad_norm": 2.998616933822632, "learning_rate": 2.6708817498291184e-05, "loss": 0.6387, "step": 134000 }, { "epoch": 3.0901722391084094, "grad_norm": 3.212735891342163, "learning_rate": 2.6701545934468667e-05, "loss": 0.6208, "step": 134200 }, { "epoch": 3.094777562862669, "grad_norm": 3.259173631668091, "learning_rate": 2.6694310728465262e-05, "loss": 0.6398, "step": 134400 }, { "epoch": 3.099382886616929, "grad_norm": 3.346163511276245, "learning_rate": 2.6687039164642748e-05, "loss": 0.6488, "step": 134600 }, { "epoch": 3.103988210371189, "grad_norm": 3.005929708480835, "learning_rate": 2.6679767600820234e-05, "loss": 0.6339, "step": 134800 }, { "epoch": 3.108593534125449, "grad_norm": 3.2009568214416504, "learning_rate": 2.6672496036997716e-05, "loss": 0.63, "step": 135000 }, { "epoch": 3.1131988578797087, "grad_norm": 2.799675226211548, "learning_rate": 2.6665224473175202e-05, "loss": 0.6275, "step": 135200 }, { "epoch": 3.1178041816339688, "grad_norm": 3.211223840713501, "learning_rate": 2.6657952909352688e-05, "loss": 0.6436, "step": 135400 }, { "epoch": 3.122409505388229, "grad_norm": 2.964587926864624, "learning_rate": 2.665068134553017e-05, "loss": 0.6512, "step": 135600 }, { "epoch": 3.127014829142489, "grad_norm": 3.677194595336914, "learning_rate": 2.6643409781707653e-05, "loss": 0.6396, "step": 135800 }, { "epoch": 3.1316201528967484, "grad_norm": 3.1306989192962646, "learning_rate": 2.663613821788514e-05, "loss": 0.6295, "step": 136000 }, { "epoch": 3.1362254766510085, "grad_norm": 2.744112968444824, "learning_rate": 2.6628866654062625e-05, "loss": 0.6328, "step": 136200 }, { "epoch": 3.1408308004052685, "grad_norm": 3.3055408000946045, "learning_rate": 2.6621595090240108e-05, "loss": 0.649, "step": 136400 }, { "epoch": 3.1454361241595286, "grad_norm": 2.8039541244506836, "learning_rate": 2.6614323526417593e-05, "loss": 0.6382, "step": 136600 }, { "epoch": 3.150041447913788, "grad_norm": 4.089864730834961, "learning_rate": 2.6607051962595076e-05, "loss": 0.642, "step": 136800 }, { "epoch": 3.154646771668048, "grad_norm": 4.057522296905518, "learning_rate": 2.659978039877256e-05, "loss": 0.6349, "step": 137000 }, { "epoch": 3.1592520954223082, "grad_norm": 3.3105990886688232, "learning_rate": 2.6592508834950044e-05, "loss": 0.6389, "step": 137200 }, { "epoch": 3.1638574191765683, "grad_norm": 3.4372615814208984, "learning_rate": 2.6585273628946643e-05, "loss": 0.6244, "step": 137400 }, { "epoch": 3.168462742930828, "grad_norm": 2.707204818725586, "learning_rate": 2.6578002065124126e-05, "loss": 0.6341, "step": 137600 }, { "epoch": 3.173068066685088, "grad_norm": 2.621185064315796, "learning_rate": 2.657073050130161e-05, "loss": 0.6464, "step": 137800 }, { "epoch": 3.177673390439348, "grad_norm": 3.090057611465454, "learning_rate": 2.6563458937479097e-05, "loss": 0.6257, "step": 138000 }, { "epoch": 3.182278714193608, "grad_norm": 2.8256921768188477, "learning_rate": 2.655618737365658e-05, "loss": 0.6349, "step": 138200 }, { "epoch": 3.1868840379478676, "grad_norm": 4.003274917602539, "learning_rate": 2.6548915809834062e-05, "loss": 0.6321, "step": 138400 }, { "epoch": 3.1914893617021276, "grad_norm": 3.346566677093506, "learning_rate": 2.654164424601155e-05, "loss": 0.636, "step": 138600 }, { "epoch": 3.1960946854563876, "grad_norm": 3.386808156967163, "learning_rate": 2.653437268218903e-05, "loss": 0.6287, "step": 138800 }, { "epoch": 3.2007000092106477, "grad_norm": 2.970651149749756, "learning_rate": 2.6527101118366517e-05, "loss": 0.6299, "step": 139000 }, { "epoch": 3.2053053329649073, "grad_norm": 3.1046876907348633, "learning_rate": 2.6519865912363115e-05, "loss": 0.6245, "step": 139200 }, { "epoch": 3.2099106567191673, "grad_norm": 3.6521196365356445, "learning_rate": 2.6512594348540598e-05, "loss": 0.6315, "step": 139400 }, { "epoch": 3.2145159804734273, "grad_norm": 2.994075298309326, "learning_rate": 2.6505322784718084e-05, "loss": 0.6459, "step": 139600 }, { "epoch": 3.2191213042276874, "grad_norm": 3.732825517654419, "learning_rate": 2.6498051220895566e-05, "loss": 0.6445, "step": 139800 }, { "epoch": 3.223726627981947, "grad_norm": 3.046517848968506, "learning_rate": 2.649077965707305e-05, "loss": 0.6352, "step": 140000 }, { "epoch": 3.228331951736207, "grad_norm": 3.441150426864624, "learning_rate": 2.6483508093250535e-05, "loss": 0.6337, "step": 140200 }, { "epoch": 3.232937275490467, "grad_norm": 4.240359306335449, "learning_rate": 2.647623652942802e-05, "loss": 0.6317, "step": 140400 }, { "epoch": 3.237542599244727, "grad_norm": 3.5375964641571045, "learning_rate": 2.6468964965605503e-05, "loss": 0.6309, "step": 140600 }, { "epoch": 3.2421479229989867, "grad_norm": 3.648287773132324, "learning_rate": 2.646169340178299e-05, "loss": 0.6336, "step": 140800 }, { "epoch": 3.2467532467532467, "grad_norm": 3.4129178524017334, "learning_rate": 2.6454421837960475e-05, "loss": 0.6226, "step": 141000 }, { "epoch": 3.2513585705075068, "grad_norm": 3.6263647079467773, "learning_rate": 2.6447150274137954e-05, "loss": 0.6289, "step": 141200 }, { "epoch": 3.2559638942617664, "grad_norm": 2.798513889312744, "learning_rate": 2.643987871031544e-05, "loss": 0.631, "step": 141400 }, { "epoch": 3.2605692180160264, "grad_norm": 3.113395929336548, "learning_rate": 2.6432607146492926e-05, "loss": 0.6316, "step": 141600 }, { "epoch": 3.2651745417702864, "grad_norm": 2.998884916305542, "learning_rate": 2.642533558267041e-05, "loss": 0.6218, "step": 141800 }, { "epoch": 3.2697798655245465, "grad_norm": 3.264549493789673, "learning_rate": 2.6418064018847894e-05, "loss": 0.6306, "step": 142000 }, { "epoch": 3.2743851892788065, "grad_norm": 2.9390909671783447, "learning_rate": 2.641079245502538e-05, "loss": 0.6357, "step": 142200 }, { "epoch": 3.278990513033066, "grad_norm": 2.7646069526672363, "learning_rate": 2.6403520891202863e-05, "loss": 0.6232, "step": 142400 }, { "epoch": 3.283595836787326, "grad_norm": 3.1988773345947266, "learning_rate": 2.6396249327380345e-05, "loss": 0.6246, "step": 142600 }, { "epoch": 3.288201160541586, "grad_norm": 2.912640333175659, "learning_rate": 2.638897776355783e-05, "loss": 0.6361, "step": 142800 }, { "epoch": 3.2928064842958458, "grad_norm": 3.98209810256958, "learning_rate": 2.6381706199735317e-05, "loss": 0.6234, "step": 143000 }, { "epoch": 3.297411808050106, "grad_norm": 3.3420772552490234, "learning_rate": 2.63744346359128e-05, "loss": 0.6344, "step": 143200 }, { "epoch": 3.302017131804366, "grad_norm": 4.18234395980835, "learning_rate": 2.6367163072090286e-05, "loss": 0.6178, "step": 143400 }, { "epoch": 3.306622455558626, "grad_norm": 2.9908995628356934, "learning_rate": 2.6359891508267768e-05, "loss": 0.6181, "step": 143600 }, { "epoch": 3.311227779312886, "grad_norm": 3.504472255706787, "learning_rate": 2.635261994444525e-05, "loss": 0.6403, "step": 143800 }, { "epoch": 3.3158331030671455, "grad_norm": 3.0107550621032715, "learning_rate": 2.6345348380622737e-05, "loss": 0.6338, "step": 144000 }, { "epoch": 3.3204384268214056, "grad_norm": 3.5131990909576416, "learning_rate": 2.6338076816800223e-05, "loss": 0.6412, "step": 144200 }, { "epoch": 3.3250437505756656, "grad_norm": 3.315256357192993, "learning_rate": 2.6330805252977705e-05, "loss": 0.6329, "step": 144400 }, { "epoch": 3.329649074329925, "grad_norm": 3.0303072929382324, "learning_rate": 2.632353368915519e-05, "loss": 0.6312, "step": 144600 }, { "epoch": 3.3342543980841852, "grad_norm": 3.3184735774993896, "learning_rate": 2.6316262125332677e-05, "loss": 0.647, "step": 144800 }, { "epoch": 3.3388597218384453, "grad_norm": 2.842087745666504, "learning_rate": 2.630899056151016e-05, "loss": 0.622, "step": 145000 }, { "epoch": 3.3434650455927053, "grad_norm": 2.741712808609009, "learning_rate": 2.6301718997687642e-05, "loss": 0.635, "step": 145200 }, { "epoch": 3.3480703693469653, "grad_norm": 2.9912500381469727, "learning_rate": 2.6294447433865128e-05, "loss": 0.6426, "step": 145400 }, { "epoch": 3.352675693101225, "grad_norm": 3.2431087493896484, "learning_rate": 2.6287175870042614e-05, "loss": 0.6314, "step": 145600 }, { "epoch": 3.357281016855485, "grad_norm": 2.5913467407226562, "learning_rate": 2.6279904306220096e-05, "loss": 0.6282, "step": 145800 }, { "epoch": 3.361886340609745, "grad_norm": 2.8884477615356445, "learning_rate": 2.6272632742397582e-05, "loss": 0.6446, "step": 146000 }, { "epoch": 3.3664916643640046, "grad_norm": 3.9637069702148438, "learning_rate": 2.6265361178575065e-05, "loss": 0.6327, "step": 146200 }, { "epoch": 3.3710969881182646, "grad_norm": 3.9412338733673096, "learning_rate": 2.6258089614752547e-05, "loss": 0.6294, "step": 146400 }, { "epoch": 3.3757023118725247, "grad_norm": 3.3022706508636475, "learning_rate": 2.6250818050930033e-05, "loss": 0.6326, "step": 146600 }, { "epoch": 3.3803076356267847, "grad_norm": 2.921814441680908, "learning_rate": 2.624354648710752e-05, "loss": 0.6312, "step": 146800 }, { "epoch": 3.3849129593810443, "grad_norm": 3.0949013233184814, "learning_rate": 2.6236274923285e-05, "loss": 0.6192, "step": 147000 }, { "epoch": 3.3895182831353043, "grad_norm": 4.004492282867432, "learning_rate": 2.6229003359462488e-05, "loss": 0.6298, "step": 147200 }, { "epoch": 3.3941236068895644, "grad_norm": 3.4825210571289062, "learning_rate": 2.622173179563997e-05, "loss": 0.6333, "step": 147400 }, { "epoch": 3.3987289306438244, "grad_norm": 3.0541634559631348, "learning_rate": 2.6214460231817453e-05, "loss": 0.6347, "step": 147600 }, { "epoch": 3.403334254398084, "grad_norm": 4.5528693199157715, "learning_rate": 2.6207261383633164e-05, "loss": 0.6381, "step": 147800 }, { "epoch": 3.407939578152344, "grad_norm": 3.09186053276062, "learning_rate": 2.6199989819810646e-05, "loss": 0.6424, "step": 148000 }, { "epoch": 3.412544901906604, "grad_norm": 4.061122894287109, "learning_rate": 2.6192718255988132e-05, "loss": 0.6292, "step": 148200 }, { "epoch": 3.417150225660864, "grad_norm": 3.7221198081970215, "learning_rate": 2.6185446692165618e-05, "loss": 0.6446, "step": 148400 }, { "epoch": 3.4217555494151237, "grad_norm": 2.8360812664031982, "learning_rate": 2.6178175128343104e-05, "loss": 0.6492, "step": 148600 }, { "epoch": 3.4263608731693838, "grad_norm": 3.033463954925537, "learning_rate": 2.6170903564520587e-05, "loss": 0.6273, "step": 148800 }, { "epoch": 3.430966196923644, "grad_norm": 5.270578384399414, "learning_rate": 2.6163632000698073e-05, "loss": 0.6235, "step": 149000 }, { "epoch": 3.435571520677904, "grad_norm": 2.8622264862060547, "learning_rate": 2.6156360436875555e-05, "loss": 0.6325, "step": 149200 }, { "epoch": 3.4401768444321634, "grad_norm": 3.450784206390381, "learning_rate": 2.6149088873053038e-05, "loss": 0.6373, "step": 149400 }, { "epoch": 3.4447821681864235, "grad_norm": 3.269690990447998, "learning_rate": 2.6141817309230524e-05, "loss": 0.6274, "step": 149600 }, { "epoch": 3.4493874919406835, "grad_norm": 3.7646055221557617, "learning_rate": 2.613454574540801e-05, "loss": 0.6287, "step": 149800 }, { "epoch": 3.4539928156949435, "grad_norm": 3.5802409648895264, "learning_rate": 2.6127274181585492e-05, "loss": 0.6322, "step": 150000 }, { "epoch": 3.458598139449203, "grad_norm": 3.1698813438415527, "learning_rate": 2.6120002617762978e-05, "loss": 0.6339, "step": 150200 }, { "epoch": 3.463203463203463, "grad_norm": 2.7178969383239746, "learning_rate": 2.611273105394046e-05, "loss": 0.6382, "step": 150400 }, { "epoch": 3.467808786957723, "grad_norm": 3.088505506515503, "learning_rate": 2.6105459490117943e-05, "loss": 0.6256, "step": 150600 }, { "epoch": 3.4724141107119832, "grad_norm": 2.7076618671417236, "learning_rate": 2.609818792629543e-05, "loss": 0.6312, "step": 150800 }, { "epoch": 3.477019434466243, "grad_norm": 2.320732831954956, "learning_rate": 2.6090916362472915e-05, "loss": 0.6289, "step": 151000 }, { "epoch": 3.481624758220503, "grad_norm": 3.147030830383301, "learning_rate": 2.6083644798650397e-05, "loss": 0.641, "step": 151200 }, { "epoch": 3.486230081974763, "grad_norm": 3.3665804862976074, "learning_rate": 2.6076373234827883e-05, "loss": 0.633, "step": 151400 }, { "epoch": 3.4908354057290225, "grad_norm": 3.705167531967163, "learning_rate": 2.606910167100537e-05, "loss": 0.6372, "step": 151600 }, { "epoch": 3.4954407294832825, "grad_norm": 2.7983779907226562, "learning_rate": 2.606183010718285e-05, "loss": 0.6277, "step": 151800 }, { "epoch": 3.5000460532375426, "grad_norm": 3.0177760124206543, "learning_rate": 2.6054594901179447e-05, "loss": 0.6277, "step": 152000 }, { "epoch": 3.5046513769918026, "grad_norm": 3.7722854614257812, "learning_rate": 2.6047323337356933e-05, "loss": 0.6278, "step": 152200 }, { "epoch": 3.5092567007460627, "grad_norm": 2.6710495948791504, "learning_rate": 2.6040051773534415e-05, "loss": 0.6202, "step": 152400 }, { "epoch": 3.5138620245003223, "grad_norm": 3.0754504203796387, "learning_rate": 2.60327802097119e-05, "loss": 0.6317, "step": 152600 }, { "epoch": 3.5184673482545823, "grad_norm": 3.376631498336792, "learning_rate": 2.60255450037085e-05, "loss": 0.6422, "step": 152800 }, { "epoch": 3.5230726720088423, "grad_norm": 2.970846652984619, "learning_rate": 2.6018273439885982e-05, "loss": 0.6315, "step": 153000 }, { "epoch": 3.527677995763102, "grad_norm": 2.7988736629486084, "learning_rate": 2.6011001876063468e-05, "loss": 0.6321, "step": 153200 }, { "epoch": 3.532283319517362, "grad_norm": 2.94100022315979, "learning_rate": 2.600373031224095e-05, "loss": 0.6253, "step": 153400 }, { "epoch": 3.536888643271622, "grad_norm": 3.08990216255188, "learning_rate": 2.5996458748418433e-05, "loss": 0.6337, "step": 153600 }, { "epoch": 3.541493967025882, "grad_norm": 2.9995460510253906, "learning_rate": 2.598918718459592e-05, "loss": 0.634, "step": 153800 }, { "epoch": 3.546099290780142, "grad_norm": 3.2981879711151123, "learning_rate": 2.5981915620773405e-05, "loss": 0.6263, "step": 154000 }, { "epoch": 3.5507046145344017, "grad_norm": 3.200422525405884, "learning_rate": 2.5974644056950888e-05, "loss": 0.6185, "step": 154200 }, { "epoch": 3.5553099382886617, "grad_norm": 2.9688191413879395, "learning_rate": 2.5967372493128374e-05, "loss": 0.6213, "step": 154400 }, { "epoch": 3.5599152620429217, "grad_norm": 2.5572378635406494, "learning_rate": 2.5960100929305856e-05, "loss": 0.6365, "step": 154600 }, { "epoch": 3.5645205857971813, "grad_norm": 2.690509557723999, "learning_rate": 2.5952829365483342e-05, "loss": 0.6428, "step": 154800 }, { "epoch": 3.5691259095514414, "grad_norm": 5.786709785461426, "learning_rate": 2.5945594159479937e-05, "loss": 0.6448, "step": 155000 }, { "epoch": 3.5737312333057014, "grad_norm": 2.30430269241333, "learning_rate": 2.5938322595657423e-05, "loss": 0.6192, "step": 155200 }, { "epoch": 3.5783365570599615, "grad_norm": 3.188671588897705, "learning_rate": 2.5931051031834906e-05, "loss": 0.6335, "step": 155400 }, { "epoch": 3.5829418808142215, "grad_norm": 2.945617198944092, "learning_rate": 2.592377946801239e-05, "loss": 0.6265, "step": 155600 }, { "epoch": 3.587547204568481, "grad_norm": 3.1751980781555176, "learning_rate": 2.5916507904189878e-05, "loss": 0.6199, "step": 155800 }, { "epoch": 3.592152528322741, "grad_norm": 2.6950361728668213, "learning_rate": 2.590923634036736e-05, "loss": 0.6283, "step": 156000 }, { "epoch": 3.596757852077001, "grad_norm": 3.544735908508301, "learning_rate": 2.5901964776544843e-05, "loss": 0.6281, "step": 156200 }, { "epoch": 3.6013631758312608, "grad_norm": 2.9812283515930176, "learning_rate": 2.589469321272233e-05, "loss": 0.6275, "step": 156400 }, { "epoch": 3.605968499585521, "grad_norm": 3.3013553619384766, "learning_rate": 2.5887421648899814e-05, "loss": 0.6292, "step": 156600 }, { "epoch": 3.610573823339781, "grad_norm": 3.0218076705932617, "learning_rate": 2.5880150085077297e-05, "loss": 0.6279, "step": 156800 }, { "epoch": 3.615179147094041, "grad_norm": 2.9628562927246094, "learning_rate": 2.5872878521254783e-05, "loss": 0.6444, "step": 157000 }, { "epoch": 3.619784470848301, "grad_norm": 3.080231189727783, "learning_rate": 2.586560695743227e-05, "loss": 0.6305, "step": 157200 }, { "epoch": 3.6243897946025605, "grad_norm": 3.3439502716064453, "learning_rate": 2.5858335393609748e-05, "loss": 0.6354, "step": 157400 }, { "epoch": 3.6289951183568205, "grad_norm": 3.6231765747070312, "learning_rate": 2.5851063829787234e-05, "loss": 0.622, "step": 157600 }, { "epoch": 3.6336004421110806, "grad_norm": 4.266796588897705, "learning_rate": 2.584379226596472e-05, "loss": 0.6385, "step": 157800 }, { "epoch": 3.63820576586534, "grad_norm": 2.9517550468444824, "learning_rate": 2.5836520702142202e-05, "loss": 0.6319, "step": 158000 }, { "epoch": 3.6428110896196, "grad_norm": 3.549234390258789, "learning_rate": 2.5829249138319688e-05, "loss": 0.6233, "step": 158200 }, { "epoch": 3.6474164133738602, "grad_norm": 2.957629442214966, "learning_rate": 2.5821977574497174e-05, "loss": 0.628, "step": 158400 }, { "epoch": 3.65202173712812, "grad_norm": 2.4832262992858887, "learning_rate": 2.581474236849377e-05, "loss": 0.6252, "step": 158600 }, { "epoch": 3.65662706088238, "grad_norm": 3.6070032119750977, "learning_rate": 2.5807470804671255e-05, "loss": 0.6317, "step": 158800 }, { "epoch": 3.66123238463664, "grad_norm": 3.4381484985351562, "learning_rate": 2.5800199240848738e-05, "loss": 0.6174, "step": 159000 }, { "epoch": 3.6658377083909, "grad_norm": 2.8952486515045166, "learning_rate": 2.579292767702622e-05, "loss": 0.6334, "step": 159200 }, { "epoch": 3.67044303214516, "grad_norm": 3.704712390899658, "learning_rate": 2.5785656113203706e-05, "loss": 0.6371, "step": 159400 }, { "epoch": 3.6750483558994196, "grad_norm": 3.430786609649658, "learning_rate": 2.5778384549381192e-05, "loss": 0.6369, "step": 159600 }, { "epoch": 3.6796536796536796, "grad_norm": 3.201442003250122, "learning_rate": 2.5771112985558675e-05, "loss": 0.6274, "step": 159800 }, { "epoch": 3.6842590034079397, "grad_norm": 3.125777244567871, "learning_rate": 2.576384142173616e-05, "loss": 0.6318, "step": 160000 }, { "epoch": 3.6888643271621993, "grad_norm": 2.6248064041137695, "learning_rate": 2.5756569857913643e-05, "loss": 0.6344, "step": 160200 }, { "epoch": 3.6934696509164593, "grad_norm": 3.5570108890533447, "learning_rate": 2.5749298294091126e-05, "loss": 0.6293, "step": 160400 }, { "epoch": 3.6980749746707193, "grad_norm": 3.13261342048645, "learning_rate": 2.5742063088087724e-05, "loss": 0.634, "step": 160600 }, { "epoch": 3.7026802984249794, "grad_norm": 3.070614814758301, "learning_rate": 2.573479152426521e-05, "loss": 0.6255, "step": 160800 }, { "epoch": 3.7072856221792394, "grad_norm": 2.835911750793457, "learning_rate": 2.5727519960442693e-05, "loss": 0.6224, "step": 161000 }, { "epoch": 3.711890945933499, "grad_norm": 2.9913852214813232, "learning_rate": 2.572024839662018e-05, "loss": 0.6335, "step": 161200 }, { "epoch": 3.716496269687759, "grad_norm": 3.0731775760650635, "learning_rate": 2.5712976832797664e-05, "loss": 0.6479, "step": 161400 }, { "epoch": 3.721101593442019, "grad_norm": 3.5844905376434326, "learning_rate": 2.5705705268975144e-05, "loss": 0.6201, "step": 161600 }, { "epoch": 3.7257069171962787, "grad_norm": 3.9543886184692383, "learning_rate": 2.569843370515263e-05, "loss": 0.6318, "step": 161800 }, { "epoch": 3.7303122409505387, "grad_norm": 2.6492578983306885, "learning_rate": 2.5691162141330115e-05, "loss": 0.6319, "step": 162000 }, { "epoch": 3.7349175647047987, "grad_norm": 3.418893814086914, "learning_rate": 2.5683890577507598e-05, "loss": 0.637, "step": 162200 }, { "epoch": 3.739522888459059, "grad_norm": 2.8401198387145996, "learning_rate": 2.5676619013685084e-05, "loss": 0.6223, "step": 162400 }, { "epoch": 3.744128212213319, "grad_norm": 3.19801664352417, "learning_rate": 2.566934744986257e-05, "loss": 0.6277, "step": 162600 }, { "epoch": 3.7487335359675784, "grad_norm": 2.9500930309295654, "learning_rate": 2.5662075886040052e-05, "loss": 0.6275, "step": 162800 }, { "epoch": 3.7533388597218384, "grad_norm": 2.7662055492401123, "learning_rate": 2.5654804322217535e-05, "loss": 0.619, "step": 163000 }, { "epoch": 3.7579441834760985, "grad_norm": 2.61580753326416, "learning_rate": 2.564753275839502e-05, "loss": 0.6305, "step": 163200 }, { "epoch": 3.762549507230358, "grad_norm": 3.7837417125701904, "learning_rate": 2.5640261194572507e-05, "loss": 0.6273, "step": 163400 }, { "epoch": 3.767154830984618, "grad_norm": 3.516418695449829, "learning_rate": 2.563298963074999e-05, "loss": 0.638, "step": 163600 }, { "epoch": 3.771760154738878, "grad_norm": 2.6570441722869873, "learning_rate": 2.5625718066927475e-05, "loss": 0.6319, "step": 163800 }, { "epoch": 3.776365478493138, "grad_norm": 2.759058952331543, "learning_rate": 2.561844650310496e-05, "loss": 0.6406, "step": 164000 }, { "epoch": 3.7809708022473982, "grad_norm": 2.742692232131958, "learning_rate": 2.561117493928244e-05, "loss": 0.6278, "step": 164200 }, { "epoch": 3.785576126001658, "grad_norm": 3.729440212249756, "learning_rate": 2.5603903375459926e-05, "loss": 0.6289, "step": 164400 }, { "epoch": 3.790181449755918, "grad_norm": 3.219895601272583, "learning_rate": 2.5596631811637412e-05, "loss": 0.6275, "step": 164600 }, { "epoch": 3.794786773510178, "grad_norm": 3.222836971282959, "learning_rate": 2.5589360247814894e-05, "loss": 0.6331, "step": 164800 }, { "epoch": 3.7993920972644375, "grad_norm": 2.4223623275756836, "learning_rate": 2.558208868399238e-05, "loss": 0.623, "step": 165000 }, { "epoch": 3.8039974210186975, "grad_norm": 3.0702037811279297, "learning_rate": 2.5574817120169866e-05, "loss": 0.6285, "step": 165200 }, { "epoch": 3.8086027447729576, "grad_norm": 2.8104872703552246, "learning_rate": 2.556758191416646e-05, "loss": 0.6353, "step": 165400 }, { "epoch": 3.8132080685272176, "grad_norm": 3.1389353275299072, "learning_rate": 2.556034670816306e-05, "loss": 0.6363, "step": 165600 }, { "epoch": 3.8178133922814776, "grad_norm": 3.318134307861328, "learning_rate": 2.5553111502159655e-05, "loss": 0.6259, "step": 165800 }, { "epoch": 3.8224187160357372, "grad_norm": 3.075765609741211, "learning_rate": 2.554583993833714e-05, "loss": 0.6325, "step": 166000 }, { "epoch": 3.8270240397899973, "grad_norm": 3.2406067848205566, "learning_rate": 2.5538568374514624e-05, "loss": 0.6303, "step": 166200 }, { "epoch": 3.8316293635442573, "grad_norm": 3.4027702808380127, "learning_rate": 2.5531296810692106e-05, "loss": 0.634, "step": 166400 }, { "epoch": 3.836234687298517, "grad_norm": 3.2992939949035645, "learning_rate": 2.5524025246869592e-05, "loss": 0.6285, "step": 166600 }, { "epoch": 3.840840011052777, "grad_norm": 3.0388951301574707, "learning_rate": 2.5516753683047078e-05, "loss": 0.631, "step": 166800 }, { "epoch": 3.845445334807037, "grad_norm": 3.0130348205566406, "learning_rate": 2.550948211922456e-05, "loss": 0.6239, "step": 167000 }, { "epoch": 3.850050658561297, "grad_norm": 3.0870108604431152, "learning_rate": 2.5502210555402047e-05, "loss": 0.6318, "step": 167200 }, { "epoch": 3.8546559823155566, "grad_norm": 3.3640689849853516, "learning_rate": 2.549493899157953e-05, "loss": 0.6381, "step": 167400 }, { "epoch": 3.8592613060698167, "grad_norm": 3.3157622814178467, "learning_rate": 2.548766742775701e-05, "loss": 0.6208, "step": 167600 }, { "epoch": 3.8638666298240767, "grad_norm": 3.0967700481414795, "learning_rate": 2.5480395863934498e-05, "loss": 0.6294, "step": 167800 }, { "epoch": 3.8684719535783367, "grad_norm": 2.9279990196228027, "learning_rate": 2.5473124300111983e-05, "loss": 0.629, "step": 168000 }, { "epoch": 3.8730772773325963, "grad_norm": 3.182149648666382, "learning_rate": 2.546585273628947e-05, "loss": 0.62, "step": 168200 }, { "epoch": 3.8776826010868564, "grad_norm": 2.424578905105591, "learning_rate": 2.5458581172466952e-05, "loss": 0.6178, "step": 168400 }, { "epoch": 3.8822879248411164, "grad_norm": 3.4072725772857666, "learning_rate": 2.5451309608644434e-05, "loss": 0.6325, "step": 168600 }, { "epoch": 3.886893248595376, "grad_norm": 3.33803653717041, "learning_rate": 2.544407440264103e-05, "loss": 0.6347, "step": 168800 }, { "epoch": 3.891498572349636, "grad_norm": 3.434117555618286, "learning_rate": 2.5436802838818516e-05, "loss": 0.6278, "step": 169000 }, { "epoch": 3.896103896103896, "grad_norm": 3.191801071166992, "learning_rate": 2.5429531274996e-05, "loss": 0.6292, "step": 169200 }, { "epoch": 3.900709219858156, "grad_norm": 3.666332483291626, "learning_rate": 2.5422259711173484e-05, "loss": 0.6279, "step": 169400 }, { "epoch": 3.905314543612416, "grad_norm": 4.157709121704102, "learning_rate": 2.541498814735097e-05, "loss": 0.6244, "step": 169600 }, { "epoch": 3.9099198673666757, "grad_norm": 2.783106565475464, "learning_rate": 2.5407716583528456e-05, "loss": 0.6278, "step": 169800 }, { "epoch": 3.9145251911209358, "grad_norm": 3.1697680950164795, "learning_rate": 2.5400445019705942e-05, "loss": 0.6312, "step": 170000 }, { "epoch": 3.919130514875196, "grad_norm": 2.8394782543182373, "learning_rate": 2.539317345588342e-05, "loss": 0.6282, "step": 170200 }, { "epoch": 3.9237358386294554, "grad_norm": 2.653749704360962, "learning_rate": 2.5385901892060907e-05, "loss": 0.6185, "step": 170400 }, { "epoch": 3.9283411623837154, "grad_norm": 2.6118738651275635, "learning_rate": 2.5378630328238393e-05, "loss": 0.6259, "step": 170600 }, { "epoch": 3.9329464861379755, "grad_norm": 4.355154991149902, "learning_rate": 2.5371358764415875e-05, "loss": 0.6394, "step": 170800 }, { "epoch": 3.9375518098922355, "grad_norm": 2.658700466156006, "learning_rate": 2.536408720059336e-05, "loss": 0.6277, "step": 171000 }, { "epoch": 3.9421571336464956, "grad_norm": 3.9519739151000977, "learning_rate": 2.5356815636770847e-05, "loss": 0.6268, "step": 171200 }, { "epoch": 3.946762457400755, "grad_norm": 2.6966614723205566, "learning_rate": 2.5349544072948326e-05, "loss": 0.6368, "step": 171400 }, { "epoch": 3.951367781155015, "grad_norm": 2.29526686668396, "learning_rate": 2.5342272509125812e-05, "loss": 0.6327, "step": 171600 }, { "epoch": 3.9559731049092752, "grad_norm": 2.904367208480835, "learning_rate": 2.5335000945303298e-05, "loss": 0.6201, "step": 171800 }, { "epoch": 3.960578428663535, "grad_norm": 3.2330174446105957, "learning_rate": 2.532772938148078e-05, "loss": 0.6297, "step": 172000 }, { "epoch": 3.965183752417795, "grad_norm": 2.654149293899536, "learning_rate": 2.5320457817658266e-05, "loss": 0.6334, "step": 172200 }, { "epoch": 3.969789076172055, "grad_norm": 2.883559465408325, "learning_rate": 2.5313186253835752e-05, "loss": 0.6175, "step": 172400 }, { "epoch": 3.974394399926315, "grad_norm": 2.8193156719207764, "learning_rate": 2.5305914690013235e-05, "loss": 0.6293, "step": 172600 }, { "epoch": 3.978999723680575, "grad_norm": 3.1371946334838867, "learning_rate": 2.5298643126190717e-05, "loss": 0.6256, "step": 172800 }, { "epoch": 3.9836050474348346, "grad_norm": 3.378344774246216, "learning_rate": 2.5291371562368203e-05, "loss": 0.6241, "step": 173000 }, { "epoch": 3.9882103711890946, "grad_norm": 3.4124484062194824, "learning_rate": 2.52841363563648e-05, "loss": 0.6255, "step": 173200 }, { "epoch": 3.9928156949433546, "grad_norm": 2.8864808082580566, "learning_rate": 2.5276901150361397e-05, "loss": 0.6253, "step": 173400 }, { "epoch": 3.9974210186976142, "grad_norm": 2.594573736190796, "learning_rate": 2.5269629586538883e-05, "loss": 0.6265, "step": 173600 }, { "epoch": 4.0, "eval_loss": 0.5913488864898682, "eval_runtime": 146.1973, "eval_samples_per_second": 193.991, "eval_steps_per_second": 12.127, "step": 173712 }, { "epoch": 4.002026342451875, "grad_norm": 4.0110249519348145, "learning_rate": 2.5262358022716366e-05, "loss": 0.6277, "step": 173800 }, { "epoch": 4.006631666206134, "grad_norm": 3.5719571113586426, "learning_rate": 2.525508645889385e-05, "loss": 0.6222, "step": 174000 }, { "epoch": 4.011236989960394, "grad_norm": 3.5779929161071777, "learning_rate": 2.5247814895071337e-05, "loss": 0.6147, "step": 174200 }, { "epoch": 4.015842313714654, "grad_norm": 2.7468457221984863, "learning_rate": 2.5240543331248817e-05, "loss": 0.6193, "step": 174400 }, { "epoch": 4.020447637468914, "grad_norm": 3.6863133907318115, "learning_rate": 2.5233271767426302e-05, "loss": 0.627, "step": 174600 }, { "epoch": 4.025052961223174, "grad_norm": 3.8438150882720947, "learning_rate": 2.522600020360379e-05, "loss": 0.6304, "step": 174800 }, { "epoch": 4.029658284977434, "grad_norm": 3.4153921604156494, "learning_rate": 2.521872863978127e-05, "loss": 0.6209, "step": 175000 }, { "epoch": 4.034263608731694, "grad_norm": 3.0748400688171387, "learning_rate": 2.5211457075958757e-05, "loss": 0.6252, "step": 175200 }, { "epoch": 4.038868932485954, "grad_norm": 4.052780628204346, "learning_rate": 2.5204185512136243e-05, "loss": 0.6123, "step": 175400 }, { "epoch": 4.043474256240214, "grad_norm": 3.1808793544769287, "learning_rate": 2.5196913948313722e-05, "loss": 0.6148, "step": 175600 }, { "epoch": 4.048079579994473, "grad_norm": 2.881800413131714, "learning_rate": 2.5189642384491208e-05, "loss": 0.6156, "step": 175800 }, { "epoch": 4.052684903748734, "grad_norm": 2.8981564044952393, "learning_rate": 2.5182370820668694e-05, "loss": 0.6211, "step": 176000 }, { "epoch": 4.057290227502993, "grad_norm": 3.5495710372924805, "learning_rate": 2.517509925684618e-05, "loss": 0.617, "step": 176200 }, { "epoch": 4.061895551257253, "grad_norm": 2.4835290908813477, "learning_rate": 2.5167827693023662e-05, "loss": 0.6297, "step": 176400 }, { "epoch": 4.0665008750115135, "grad_norm": 2.574183225631714, "learning_rate": 2.5160556129201148e-05, "loss": 0.6176, "step": 176600 }, { "epoch": 4.071106198765773, "grad_norm": 3.0550966262817383, "learning_rate": 2.5153284565378634e-05, "loss": 0.6242, "step": 176800 }, { "epoch": 4.0757115225200335, "grad_norm": 3.120473861694336, "learning_rate": 2.5146013001556113e-05, "loss": 0.6201, "step": 177000 }, { "epoch": 4.080316846274293, "grad_norm": 2.9657979011535645, "learning_rate": 2.51387414377336e-05, "loss": 0.6219, "step": 177200 }, { "epoch": 4.084922170028553, "grad_norm": 2.9161338806152344, "learning_rate": 2.5131469873911085e-05, "loss": 0.623, "step": 177400 }, { "epoch": 4.089527493782813, "grad_norm": 2.9412291049957275, "learning_rate": 2.5124198310088567e-05, "loss": 0.6353, "step": 177600 }, { "epoch": 4.094132817537073, "grad_norm": 3.1718437671661377, "learning_rate": 2.5116926746266053e-05, "loss": 0.6205, "step": 177800 }, { "epoch": 4.098738141291332, "grad_norm": 4.523801326751709, "learning_rate": 2.510965518244354e-05, "loss": 0.6243, "step": 178000 }, { "epoch": 4.103343465045593, "grad_norm": 2.9366707801818848, "learning_rate": 2.510238361862102e-05, "loss": 0.619, "step": 178200 }, { "epoch": 4.1079487887998525, "grad_norm": 2.822934865951538, "learning_rate": 2.5095148412617617e-05, "loss": 0.6183, "step": 178400 }, { "epoch": 4.112554112554113, "grad_norm": 4.115916728973389, "learning_rate": 2.5087876848795103e-05, "loss": 0.6153, "step": 178600 }, { "epoch": 4.1171594363083726, "grad_norm": 3.5023276805877686, "learning_rate": 2.5080605284972585e-05, "loss": 0.6262, "step": 178800 }, { "epoch": 4.121764760062632, "grad_norm": 2.906297206878662, "learning_rate": 2.507333372115007e-05, "loss": 0.6307, "step": 179000 }, { "epoch": 4.126370083816893, "grad_norm": 2.6169967651367188, "learning_rate": 2.5066062157327557e-05, "loss": 0.625, "step": 179200 }, { "epoch": 4.130975407571152, "grad_norm": 3.4697892665863037, "learning_rate": 2.505879059350504e-05, "loss": 0.631, "step": 179400 }, { "epoch": 4.135580731325412, "grad_norm": 3.589235305786133, "learning_rate": 2.5051519029682522e-05, "loss": 0.6238, "step": 179600 }, { "epoch": 4.140186055079672, "grad_norm": 3.2380635738372803, "learning_rate": 2.5044247465860008e-05, "loss": 0.6222, "step": 179800 }, { "epoch": 4.144791378833932, "grad_norm": 2.568429946899414, "learning_rate": 2.503697590203749e-05, "loss": 0.6218, "step": 180000 }, { "epoch": 4.149396702588192, "grad_norm": 3.3458428382873535, "learning_rate": 2.5029704338214977e-05, "loss": 0.6222, "step": 180200 }, { "epoch": 4.154002026342452, "grad_norm": 3.1617233753204346, "learning_rate": 2.5022432774392463e-05, "loss": 0.6284, "step": 180400 }, { "epoch": 4.158607350096712, "grad_norm": 2.6967251300811768, "learning_rate": 2.5015197568389058e-05, "loss": 0.6211, "step": 180600 }, { "epoch": 4.163212673850972, "grad_norm": 4.2213568687438965, "learning_rate": 2.5007926004566544e-05, "loss": 0.6242, "step": 180800 }, { "epoch": 4.167817997605232, "grad_norm": 3.8117053508758545, "learning_rate": 2.500065444074403e-05, "loss": 0.6182, "step": 181000 }, { "epoch": 4.172423321359491, "grad_norm": 3.316926956176758, "learning_rate": 2.499338287692151e-05, "loss": 0.6174, "step": 181200 }, { "epoch": 4.177028645113752, "grad_norm": 3.363097667694092, "learning_rate": 2.4986111313098995e-05, "loss": 0.6207, "step": 181400 }, { "epoch": 4.181633968868011, "grad_norm": 3.064410924911499, "learning_rate": 2.497883974927648e-05, "loss": 0.6218, "step": 181600 }, { "epoch": 4.186239292622272, "grad_norm": 2.6976685523986816, "learning_rate": 2.4971568185453963e-05, "loss": 0.6128, "step": 181800 }, { "epoch": 4.190844616376531, "grad_norm": 3.532400608062744, "learning_rate": 2.496429662163145e-05, "loss": 0.6185, "step": 182000 }, { "epoch": 4.195449940130791, "grad_norm": 2.791353702545166, "learning_rate": 2.4957025057808935e-05, "loss": 0.6279, "step": 182200 }, { "epoch": 4.2000552638850515, "grad_norm": 2.433746814727783, "learning_rate": 2.4949753493986417e-05, "loss": 0.6216, "step": 182400 }, { "epoch": 4.204660587639311, "grad_norm": 3.122479200363159, "learning_rate": 2.49424819301639e-05, "loss": 0.6206, "step": 182600 }, { "epoch": 4.209265911393571, "grad_norm": 3.7672858238220215, "learning_rate": 2.4935210366341386e-05, "loss": 0.6327, "step": 182800 }, { "epoch": 4.213871235147831, "grad_norm": 2.832796812057495, "learning_rate": 2.4927938802518872e-05, "loss": 0.6363, "step": 183000 }, { "epoch": 4.218476558902091, "grad_norm": 2.60891056060791, "learning_rate": 2.4920667238696354e-05, "loss": 0.6279, "step": 183200 }, { "epoch": 4.22308188265635, "grad_norm": 2.9585559368133545, "learning_rate": 2.491339567487384e-05, "loss": 0.6306, "step": 183400 }, { "epoch": 4.227687206410611, "grad_norm": 2.4493677616119385, "learning_rate": 2.4906124111051326e-05, "loss": 0.6151, "step": 183600 }, { "epoch": 4.23229253016487, "grad_norm": 3.008601188659668, "learning_rate": 2.4898852547228805e-05, "loss": 0.61, "step": 183800 }, { "epoch": 4.236897853919131, "grad_norm": 3.0694305896759033, "learning_rate": 2.489158098340629e-05, "loss": 0.6369, "step": 184000 }, { "epoch": 4.2415031776733905, "grad_norm": 3.306530714035034, "learning_rate": 2.4884309419583777e-05, "loss": 0.6271, "step": 184200 }, { "epoch": 4.24610850142765, "grad_norm": 3.441470146179199, "learning_rate": 2.487703785576126e-05, "loss": 0.6163, "step": 184400 }, { "epoch": 4.2507138251819105, "grad_norm": 2.845263957977295, "learning_rate": 2.4869766291938746e-05, "loss": 0.6207, "step": 184600 }, { "epoch": 4.25531914893617, "grad_norm": 3.332775354385376, "learning_rate": 2.4862531085935344e-05, "loss": 0.6221, "step": 184800 }, { "epoch": 4.25992447269043, "grad_norm": 2.332599639892578, "learning_rate": 2.4855259522112827e-05, "loss": 0.6207, "step": 185000 }, { "epoch": 4.26452979644469, "grad_norm": 4.441643238067627, "learning_rate": 2.4848024316109425e-05, "loss": 0.6276, "step": 185200 }, { "epoch": 4.26913512019895, "grad_norm": 3.2958297729492188, "learning_rate": 2.4840752752286904e-05, "loss": 0.6209, "step": 185400 }, { "epoch": 4.27374044395321, "grad_norm": 2.6615874767303467, "learning_rate": 2.483348118846439e-05, "loss": 0.6227, "step": 185600 }, { "epoch": 4.27834576770747, "grad_norm": 2.7402963638305664, "learning_rate": 2.4826209624641876e-05, "loss": 0.613, "step": 185800 }, { "epoch": 4.2829510914617295, "grad_norm": 3.0777735710144043, "learning_rate": 2.4818938060819362e-05, "loss": 0.6105, "step": 186000 }, { "epoch": 4.28755641521599, "grad_norm": 3.108518123626709, "learning_rate": 2.4811702854815957e-05, "loss": 0.6155, "step": 186200 }, { "epoch": 4.2921617389702496, "grad_norm": 2.7684409618377686, "learning_rate": 2.4804431290993443e-05, "loss": 0.6132, "step": 186400 }, { "epoch": 4.296767062724509, "grad_norm": 3.1115176677703857, "learning_rate": 2.4797159727170926e-05, "loss": 0.6306, "step": 186600 }, { "epoch": 4.30137238647877, "grad_norm": 3.601163387298584, "learning_rate": 2.478988816334841e-05, "loss": 0.6142, "step": 186800 }, { "epoch": 4.305977710233029, "grad_norm": 3.0134575366973877, "learning_rate": 2.4782616599525894e-05, "loss": 0.6248, "step": 187000 }, { "epoch": 4.31058303398729, "grad_norm": 2.950279951095581, "learning_rate": 2.4775345035703377e-05, "loss": 0.6184, "step": 187200 }, { "epoch": 4.315188357741549, "grad_norm": 2.8289947509765625, "learning_rate": 2.4768073471880863e-05, "loss": 0.6292, "step": 187400 }, { "epoch": 4.319793681495809, "grad_norm": 3.4812533855438232, "learning_rate": 2.476080190805835e-05, "loss": 0.6284, "step": 187600 }, { "epoch": 4.324399005250069, "grad_norm": 4.80519437789917, "learning_rate": 2.4753530344235835e-05, "loss": 0.6278, "step": 187800 }, { "epoch": 4.329004329004329, "grad_norm": 2.8514554500579834, "learning_rate": 2.4746258780413317e-05, "loss": 0.6139, "step": 188000 }, { "epoch": 4.333609652758589, "grad_norm": 2.441417694091797, "learning_rate": 2.47389872165908e-05, "loss": 0.6142, "step": 188200 }, { "epoch": 4.338214976512849, "grad_norm": 3.1779348850250244, "learning_rate": 2.4731715652768285e-05, "loss": 0.6245, "step": 188400 }, { "epoch": 4.342820300267109, "grad_norm": 3.2313997745513916, "learning_rate": 2.4724444088945768e-05, "loss": 0.631, "step": 188600 }, { "epoch": 4.347425624021369, "grad_norm": 3.2089149951934814, "learning_rate": 2.4717172525123254e-05, "loss": 0.6131, "step": 188800 }, { "epoch": 4.352030947775629, "grad_norm": 3.3893136978149414, "learning_rate": 2.470990096130074e-05, "loss": 0.6144, "step": 189000 }, { "epoch": 4.356636271529888, "grad_norm": 3.192901611328125, "learning_rate": 2.4702629397478222e-05, "loss": 0.62, "step": 189200 }, { "epoch": 4.361241595284149, "grad_norm": 3.3176310062408447, "learning_rate": 2.4695357833655705e-05, "loss": 0.623, "step": 189400 }, { "epoch": 4.365846919038408, "grad_norm": 3.1651062965393066, "learning_rate": 2.468808626983319e-05, "loss": 0.6313, "step": 189600 }, { "epoch": 4.370452242792668, "grad_norm": 4.049862861633301, "learning_rate": 2.4680814706010673e-05, "loss": 0.6236, "step": 189800 }, { "epoch": 4.3750575665469285, "grad_norm": 3.2649123668670654, "learning_rate": 2.467354314218816e-05, "loss": 0.614, "step": 190000 }, { "epoch": 4.379662890301188, "grad_norm": 2.824345350265503, "learning_rate": 2.4666271578365645e-05, "loss": 0.6238, "step": 190200 }, { "epoch": 4.384268214055448, "grad_norm": 3.1499876976013184, "learning_rate": 2.4659000014543128e-05, "loss": 0.6226, "step": 190400 }, { "epoch": 4.388873537809708, "grad_norm": 3.3163814544677734, "learning_rate": 2.4651728450720614e-05, "loss": 0.6126, "step": 190600 }, { "epoch": 4.393478861563968, "grad_norm": 2.8196558952331543, "learning_rate": 2.4644493244717212e-05, "loss": 0.6226, "step": 190800 }, { "epoch": 4.398084185318228, "grad_norm": 2.6953351497650146, "learning_rate": 2.463722168089469e-05, "loss": 0.62, "step": 191000 }, { "epoch": 4.402689509072488, "grad_norm": 2.942456007003784, "learning_rate": 2.4629950117072177e-05, "loss": 0.6265, "step": 191200 }, { "epoch": 4.407294832826747, "grad_norm": 2.7047855854034424, "learning_rate": 2.4622678553249663e-05, "loss": 0.6177, "step": 191400 }, { "epoch": 4.411900156581008, "grad_norm": 3.093517541885376, "learning_rate": 2.4615406989427146e-05, "loss": 0.6225, "step": 191600 }, { "epoch": 4.4165054803352675, "grad_norm": 3.8072400093078613, "learning_rate": 2.4608171783423744e-05, "loss": 0.6177, "step": 191800 }, { "epoch": 4.421110804089528, "grad_norm": 2.6595051288604736, "learning_rate": 2.460090021960123e-05, "loss": 0.6163, "step": 192000 }, { "epoch": 4.4257161278437875, "grad_norm": 3.101194143295288, "learning_rate": 2.4593628655778713e-05, "loss": 0.6157, "step": 192200 }, { "epoch": 4.430321451598047, "grad_norm": 2.7680416107177734, "learning_rate": 2.4586357091956195e-05, "loss": 0.6209, "step": 192400 }, { "epoch": 4.434926775352308, "grad_norm": 3.988497734069824, "learning_rate": 2.457908552813368e-05, "loss": 0.6246, "step": 192600 }, { "epoch": 4.439532099106567, "grad_norm": 3.0158700942993164, "learning_rate": 2.4571813964311164e-05, "loss": 0.6225, "step": 192800 }, { "epoch": 4.444137422860827, "grad_norm": 3.032444953918457, "learning_rate": 2.456454240048865e-05, "loss": 0.6229, "step": 193000 }, { "epoch": 4.448742746615087, "grad_norm": 2.915055274963379, "learning_rate": 2.4557270836666136e-05, "loss": 0.629, "step": 193200 }, { "epoch": 4.453348070369347, "grad_norm": 3.404008388519287, "learning_rate": 2.4549999272843618e-05, "loss": 0.6233, "step": 193400 }, { "epoch": 4.4579533941236065, "grad_norm": 2.661874294281006, "learning_rate": 2.45427277090211e-05, "loss": 0.6216, "step": 193600 }, { "epoch": 4.462558717877867, "grad_norm": 3.4487946033477783, "learning_rate": 2.4535456145198586e-05, "loss": 0.6312, "step": 193800 }, { "epoch": 4.4671640416321265, "grad_norm": 2.926490068435669, "learning_rate": 2.4528184581376072e-05, "loss": 0.6134, "step": 194000 }, { "epoch": 4.471769365386387, "grad_norm": 3.7607150077819824, "learning_rate": 2.4520913017553555e-05, "loss": 0.621, "step": 194200 }, { "epoch": 4.476374689140647, "grad_norm": 4.96255350112915, "learning_rate": 2.451364145373104e-05, "loss": 0.622, "step": 194400 }, { "epoch": 4.480980012894906, "grad_norm": 3.3920106887817383, "learning_rate": 2.4506369889908527e-05, "loss": 0.6184, "step": 194600 }, { "epoch": 4.485585336649167, "grad_norm": 3.106212854385376, "learning_rate": 2.449909832608601e-05, "loss": 0.624, "step": 194800 }, { "epoch": 4.490190660403426, "grad_norm": 3.0986621379852295, "learning_rate": 2.4491826762263492e-05, "loss": 0.6199, "step": 195000 }, { "epoch": 4.494795984157686, "grad_norm": 2.7702393531799316, "learning_rate": 2.4484555198440978e-05, "loss": 0.6219, "step": 195200 }, { "epoch": 4.499401307911946, "grad_norm": 2.7158069610595703, "learning_rate": 2.447728363461846e-05, "loss": 0.6228, "step": 195400 }, { "epoch": 4.504006631666206, "grad_norm": 2.6804583072662354, "learning_rate": 2.4470012070795946e-05, "loss": 0.6264, "step": 195600 }, { "epoch": 4.508611955420466, "grad_norm": 3.254936695098877, "learning_rate": 2.4462740506973432e-05, "loss": 0.6178, "step": 195800 }, { "epoch": 4.513217279174726, "grad_norm": 2.8413448333740234, "learning_rate": 2.4455468943150915e-05, "loss": 0.6221, "step": 196000 }, { "epoch": 4.517822602928986, "grad_norm": 3.7565321922302246, "learning_rate": 2.4448197379328397e-05, "loss": 0.6167, "step": 196200 }, { "epoch": 4.522427926683246, "grad_norm": 2.5034019947052, "learning_rate": 2.4440925815505883e-05, "loss": 0.6137, "step": 196400 }, { "epoch": 4.527033250437506, "grad_norm": 3.1947107315063477, "learning_rate": 2.4433654251683366e-05, "loss": 0.621, "step": 196600 }, { "epoch": 4.531638574191765, "grad_norm": 2.7699062824249268, "learning_rate": 2.4426419045679964e-05, "loss": 0.6192, "step": 196800 }, { "epoch": 4.536243897946026, "grad_norm": 3.791555166244507, "learning_rate": 2.441914748185745e-05, "loss": 0.633, "step": 197000 }, { "epoch": 4.540849221700285, "grad_norm": 2.6840012073516846, "learning_rate": 2.4411875918034933e-05, "loss": 0.6252, "step": 197200 }, { "epoch": 4.545454545454545, "grad_norm": 3.7765004634857178, "learning_rate": 2.440460435421242e-05, "loss": 0.6171, "step": 197400 }, { "epoch": 4.5500598692088055, "grad_norm": 2.8564860820770264, "learning_rate": 2.4397332790389904e-05, "loss": 0.6235, "step": 197600 }, { "epoch": 4.554665192963065, "grad_norm": 3.1641488075256348, "learning_rate": 2.4390061226567384e-05, "loss": 0.6197, "step": 197800 }, { "epoch": 4.5592705167173255, "grad_norm": 3.0477914810180664, "learning_rate": 2.438278966274487e-05, "loss": 0.6269, "step": 198000 }, { "epoch": 4.563875840471585, "grad_norm": 3.0091304779052734, "learning_rate": 2.4375518098922355e-05, "loss": 0.622, "step": 198200 }, { "epoch": 4.568481164225845, "grad_norm": 3.7036385536193848, "learning_rate": 2.4368246535099838e-05, "loss": 0.6151, "step": 198400 }, { "epoch": 4.573086487980105, "grad_norm": 2.7939934730529785, "learning_rate": 2.4360974971277324e-05, "loss": 0.6295, "step": 198600 }, { "epoch": 4.577691811734365, "grad_norm": 3.725222110748291, "learning_rate": 2.435370340745481e-05, "loss": 0.6212, "step": 198800 }, { "epoch": 4.582297135488625, "grad_norm": 3.55149245262146, "learning_rate": 2.4346431843632292e-05, "loss": 0.6143, "step": 199000 }, { "epoch": 4.586902459242885, "grad_norm": 3.657155990600586, "learning_rate": 2.4339160279809775e-05, "loss": 0.6245, "step": 199200 }, { "epoch": 4.5915077829971445, "grad_norm": 2.774144172668457, "learning_rate": 2.433188871598726e-05, "loss": 0.6263, "step": 199400 }, { "epoch": 4.596113106751405, "grad_norm": 2.516934633255005, "learning_rate": 2.4324617152164747e-05, "loss": 0.6096, "step": 199600 }, { "epoch": 4.6007184305056645, "grad_norm": 3.243980646133423, "learning_rate": 2.431734558834223e-05, "loss": 0.6136, "step": 199800 }, { "epoch": 4.605323754259924, "grad_norm": 3.0042223930358887, "learning_rate": 2.4310110382338828e-05, "loss": 0.6219, "step": 200000 }, { "epoch": 4.609929078014185, "grad_norm": 2.626081943511963, "learning_rate": 2.430283881851631e-05, "loss": 0.6093, "step": 200200 }, { "epoch": 4.614534401768444, "grad_norm": 3.4706902503967285, "learning_rate": 2.4295567254693793e-05, "loss": 0.6217, "step": 200400 }, { "epoch": 4.619139725522704, "grad_norm": 3.2710297107696533, "learning_rate": 2.428829569087128e-05, "loss": 0.617, "step": 200600 }, { "epoch": 4.623745049276964, "grad_norm": 2.6531553268432617, "learning_rate": 2.4281024127048765e-05, "loss": 0.636, "step": 200800 }, { "epoch": 4.628350373031224, "grad_norm": 2.9135398864746094, "learning_rate": 2.4273752563226247e-05, "loss": 0.6269, "step": 201000 }, { "epoch": 4.632955696785484, "grad_norm": 2.918564558029175, "learning_rate": 2.4266480999403733e-05, "loss": 0.6139, "step": 201200 }, { "epoch": 4.637561020539744, "grad_norm": 3.0329713821411133, "learning_rate": 2.425920943558122e-05, "loss": 0.619, "step": 201400 }, { "epoch": 4.6421663442940035, "grad_norm": 2.717499017715454, "learning_rate": 2.42519378717587e-05, "loss": 0.6141, "step": 201600 }, { "epoch": 4.646771668048264, "grad_norm": 3.2214903831481934, "learning_rate": 2.4244666307936184e-05, "loss": 0.614, "step": 201800 }, { "epoch": 4.651376991802524, "grad_norm": 2.94065260887146, "learning_rate": 2.423739474411367e-05, "loss": 0.6206, "step": 202000 }, { "epoch": 4.655982315556784, "grad_norm": 3.6057605743408203, "learning_rate": 2.4230123180291152e-05, "loss": 0.6097, "step": 202200 }, { "epoch": 4.660587639311044, "grad_norm": 3.165396213531494, "learning_rate": 2.422285161646864e-05, "loss": 0.6283, "step": 202400 }, { "epoch": 4.665192963065303, "grad_norm": 3.9540810585021973, "learning_rate": 2.4215580052646124e-05, "loss": 0.6247, "step": 202600 }, { "epoch": 4.669798286819564, "grad_norm": 2.56862735748291, "learning_rate": 2.4208308488823607e-05, "loss": 0.6188, "step": 202800 }, { "epoch": 4.674403610573823, "grad_norm": 2.8995814323425293, "learning_rate": 2.420103692500109e-05, "loss": 0.6152, "step": 203000 }, { "epoch": 4.679008934328083, "grad_norm": 3.891038656234741, "learning_rate": 2.4193765361178575e-05, "loss": 0.6129, "step": 203200 }, { "epoch": 4.683614258082343, "grad_norm": 3.055830478668213, "learning_rate": 2.418649379735606e-05, "loss": 0.6204, "step": 203400 }, { "epoch": 4.688219581836603, "grad_norm": 2.924553394317627, "learning_rate": 2.4179222233533544e-05, "loss": 0.6245, "step": 203600 }, { "epoch": 4.692824905590863, "grad_norm": 3.2216031551361084, "learning_rate": 2.4171987027530142e-05, "loss": 0.6289, "step": 203800 }, { "epoch": 4.697430229345123, "grad_norm": 3.0679335594177246, "learning_rate": 2.4164715463707625e-05, "loss": 0.6114, "step": 204000 }, { "epoch": 4.702035553099383, "grad_norm": 2.5067596435546875, "learning_rate": 2.415744389988511e-05, "loss": 0.6313, "step": 204200 }, { "epoch": 4.706640876853643, "grad_norm": 3.141855239868164, "learning_rate": 2.4150172336062597e-05, "loss": 0.6201, "step": 204400 }, { "epoch": 4.711246200607903, "grad_norm": 2.854891300201416, "learning_rate": 2.4142900772240076e-05, "loss": 0.6298, "step": 204600 }, { "epoch": 4.715851524362162, "grad_norm": 3.0763156414031982, "learning_rate": 2.413562920841756e-05, "loss": 0.6246, "step": 204800 }, { "epoch": 4.720456848116423, "grad_norm": 3.858271837234497, "learning_rate": 2.4128357644595048e-05, "loss": 0.6288, "step": 205000 }, { "epoch": 4.7250621718706824, "grad_norm": 3.183344602584839, "learning_rate": 2.4121086080772534e-05, "loss": 0.614, "step": 205200 }, { "epoch": 4.729667495624943, "grad_norm": 3.646907329559326, "learning_rate": 2.4113814516950016e-05, "loss": 0.6259, "step": 205400 }, { "epoch": 4.7342728193792025, "grad_norm": 3.3939154148101807, "learning_rate": 2.4106542953127502e-05, "loss": 0.6315, "step": 205600 }, { "epoch": 4.738878143133462, "grad_norm": 3.364961862564087, "learning_rate": 2.4099271389304984e-05, "loss": 0.6232, "step": 205800 }, { "epoch": 4.743483466887723, "grad_norm": 2.7885661125183105, "learning_rate": 2.4091999825482467e-05, "loss": 0.6219, "step": 206000 }, { "epoch": 4.748088790641982, "grad_norm": 3.6182515621185303, "learning_rate": 2.4084728261659953e-05, "loss": 0.6302, "step": 206200 }, { "epoch": 4.752694114396242, "grad_norm": 2.98297381401062, "learning_rate": 2.407745669783744e-05, "loss": 0.6217, "step": 206400 }, { "epoch": 4.757299438150502, "grad_norm": 4.925945281982422, "learning_rate": 2.407018513401492e-05, "loss": 0.6299, "step": 206600 }, { "epoch": 4.761904761904762, "grad_norm": 2.90513277053833, "learning_rate": 2.406294992801152e-05, "loss": 0.6254, "step": 206800 }, { "epoch": 4.7665100856590215, "grad_norm": 2.9922261238098145, "learning_rate": 2.4055714722008115e-05, "loss": 0.6204, "step": 207000 }, { "epoch": 4.771115409413282, "grad_norm": 3.269758939743042, "learning_rate": 2.40484431581856e-05, "loss": 0.6091, "step": 207200 }, { "epoch": 4.7757207331675415, "grad_norm": 2.7046399116516113, "learning_rate": 2.4041171594363084e-05, "loss": 0.6204, "step": 207400 }, { "epoch": 4.780326056921801, "grad_norm": 3.494405508041382, "learning_rate": 2.4033900030540566e-05, "loss": 0.6361, "step": 207600 }, { "epoch": 4.784931380676062, "grad_norm": 2.91849684715271, "learning_rate": 2.4026628466718052e-05, "loss": 0.623, "step": 207800 }, { "epoch": 4.789536704430321, "grad_norm": 2.500559091567993, "learning_rate": 2.4019356902895538e-05, "loss": 0.615, "step": 208000 }, { "epoch": 4.794142028184582, "grad_norm": 3.2106189727783203, "learning_rate": 2.401208533907302e-05, "loss": 0.6345, "step": 208200 }, { "epoch": 4.798747351938841, "grad_norm": 3.0638959407806396, "learning_rate": 2.4004813775250506e-05, "loss": 0.6176, "step": 208400 }, { "epoch": 4.803352675693101, "grad_norm": 3.2531685829162598, "learning_rate": 2.3997542211427992e-05, "loss": 0.628, "step": 208600 }, { "epoch": 4.807957999447361, "grad_norm": 2.51589035987854, "learning_rate": 2.3990270647605475e-05, "loss": 0.6179, "step": 208800 }, { "epoch": 4.812563323201621, "grad_norm": 4.157649040222168, "learning_rate": 2.3982999083782957e-05, "loss": 0.6069, "step": 209000 }, { "epoch": 4.817168646955881, "grad_norm": 2.5439136028289795, "learning_rate": 2.3975727519960443e-05, "loss": 0.6243, "step": 209200 }, { "epoch": 4.821773970710141, "grad_norm": 2.936598300933838, "learning_rate": 2.396845595613793e-05, "loss": 0.6232, "step": 209400 }, { "epoch": 4.826379294464401, "grad_norm": 3.0496158599853516, "learning_rate": 2.3961184392315412e-05, "loss": 0.6308, "step": 209600 }, { "epoch": 4.830984618218661, "grad_norm": 3.6222925186157227, "learning_rate": 2.3953912828492898e-05, "loss": 0.6168, "step": 209800 }, { "epoch": 4.835589941972921, "grad_norm": 3.679816722869873, "learning_rate": 2.394664126467038e-05, "loss": 0.6095, "step": 210000 }, { "epoch": 4.84019526572718, "grad_norm": 3.8149142265319824, "learning_rate": 2.3939369700847863e-05, "loss": 0.616, "step": 210200 }, { "epoch": 4.844800589481441, "grad_norm": 3.1322624683380127, "learning_rate": 2.393209813702535e-05, "loss": 0.6137, "step": 210400 }, { "epoch": 4.8494059132357, "grad_norm": 2.972074031829834, "learning_rate": 2.3924826573202835e-05, "loss": 0.6372, "step": 210600 }, { "epoch": 4.85401123698996, "grad_norm": 2.327326774597168, "learning_rate": 2.3917555009380317e-05, "loss": 0.6182, "step": 210800 }, { "epoch": 4.85861656074422, "grad_norm": 3.573911666870117, "learning_rate": 2.3910283445557803e-05, "loss": 0.6278, "step": 211000 }, { "epoch": 4.86322188449848, "grad_norm": 3.1551811695098877, "learning_rate": 2.39030482395544e-05, "loss": 0.6174, "step": 211200 }, { "epoch": 4.8678272082527405, "grad_norm": 3.1177220344543457, "learning_rate": 2.389577667573188e-05, "loss": 0.6203, "step": 211400 }, { "epoch": 4.872432532007, "grad_norm": 3.517178773880005, "learning_rate": 2.3888505111909367e-05, "loss": 0.6165, "step": 211600 }, { "epoch": 4.87703785576126, "grad_norm": 2.932919502258301, "learning_rate": 2.3881233548086853e-05, "loss": 0.6154, "step": 211800 }, { "epoch": 4.88164317951552, "grad_norm": 3.162429094314575, "learning_rate": 2.3873961984264335e-05, "loss": 0.6215, "step": 212000 }, { "epoch": 4.88624850326978, "grad_norm": 3.3734543323516846, "learning_rate": 2.386669042044182e-05, "loss": 0.6068, "step": 212200 }, { "epoch": 4.89085382702404, "grad_norm": 3.567715644836426, "learning_rate": 2.3859418856619307e-05, "loss": 0.6183, "step": 212400 }, { "epoch": 4.8954591507783, "grad_norm": 3.8765225410461426, "learning_rate": 2.385214729279679e-05, "loss": 0.6122, "step": 212600 }, { "epoch": 4.9000644745325594, "grad_norm": 3.328627109527588, "learning_rate": 2.3844875728974272e-05, "loss": 0.6196, "step": 212800 }, { "epoch": 4.90466979828682, "grad_norm": 3.3088150024414062, "learning_rate": 2.3837604165151758e-05, "loss": 0.6223, "step": 213000 }, { "epoch": 4.9092751220410795, "grad_norm": 3.936145305633545, "learning_rate": 2.3830368959148353e-05, "loss": 0.6145, "step": 213200 }, { "epoch": 4.913880445795339, "grad_norm": 4.120792388916016, "learning_rate": 2.382309739532584e-05, "loss": 0.6168, "step": 213400 }, { "epoch": 4.9184857695496, "grad_norm": 2.7531352043151855, "learning_rate": 2.3815825831503325e-05, "loss": 0.608, "step": 213600 }, { "epoch": 4.923091093303859, "grad_norm": 3.97782301902771, "learning_rate": 2.3808554267680807e-05, "loss": 0.6283, "step": 213800 }, { "epoch": 4.927696417058119, "grad_norm": 2.840651035308838, "learning_rate": 2.3801282703858293e-05, "loss": 0.608, "step": 214000 }, { "epoch": 4.932301740812379, "grad_norm": 2.7774887084960938, "learning_rate": 2.3794011140035776e-05, "loss": 0.6015, "step": 214200 }, { "epoch": 4.936907064566639, "grad_norm": 2.750030040740967, "learning_rate": 2.378677593403237e-05, "loss": 0.5992, "step": 214400 }, { "epoch": 4.941512388320899, "grad_norm": 3.340794563293457, "learning_rate": 2.3779504370209857e-05, "loss": 0.6196, "step": 214600 }, { "epoch": 4.946117712075159, "grad_norm": 2.626340866088867, "learning_rate": 2.3772232806387343e-05, "loss": 0.6199, "step": 214800 }, { "epoch": 4.9507230358294185, "grad_norm": 2.984501361846924, "learning_rate": 2.3764961242564825e-05, "loss": 0.6095, "step": 215000 }, { "epoch": 4.955328359583679, "grad_norm": 3.6400811672210693, "learning_rate": 2.375768967874231e-05, "loss": 0.6172, "step": 215200 }, { "epoch": 4.959933683337939, "grad_norm": 3.16477632522583, "learning_rate": 2.3750418114919797e-05, "loss": 0.6134, "step": 215400 }, { "epoch": 4.964539007092198, "grad_norm": 3.852839708328247, "learning_rate": 2.374314655109728e-05, "loss": 0.6247, "step": 215600 }, { "epoch": 4.969144330846459, "grad_norm": 3.3444173336029053, "learning_rate": 2.3735874987274762e-05, "loss": 0.6085, "step": 215800 }, { "epoch": 4.973749654600718, "grad_norm": 4.497110366821289, "learning_rate": 2.3728603423452248e-05, "loss": 0.6162, "step": 216000 }, { "epoch": 4.978354978354979, "grad_norm": 2.6496007442474365, "learning_rate": 2.372133185962973e-05, "loss": 0.6196, "step": 216200 }, { "epoch": 4.982960302109238, "grad_norm": 3.1037495136260986, "learning_rate": 2.371409665362633e-05, "loss": 0.6156, "step": 216400 }, { "epoch": 4.987565625863498, "grad_norm": 2.7602591514587402, "learning_rate": 2.3706825089803815e-05, "loss": 0.628, "step": 216600 }, { "epoch": 4.992170949617758, "grad_norm": 4.069486141204834, "learning_rate": 2.3699553525981298e-05, "loss": 0.6232, "step": 216800 }, { "epoch": 4.996776273372018, "grad_norm": 2.9536936283111572, "learning_rate": 2.3692281962158784e-05, "loss": 0.6129, "step": 217000 }, { "epoch": 5.0, "eval_loss": 0.5846441388130188, "eval_runtime": 145.6578, "eval_samples_per_second": 194.71, "eval_steps_per_second": 12.172, "step": 217140 }, { "epoch": 5.001381597126278, "grad_norm": 3.7548468112945557, "learning_rate": 2.3685010398336266e-05, "loss": 0.6154, "step": 217200 }, { "epoch": 5.005986920880538, "grad_norm": 3.002516269683838, "learning_rate": 2.367773883451375e-05, "loss": 0.6113, "step": 217400 }, { "epoch": 5.010592244634798, "grad_norm": 2.816727638244629, "learning_rate": 2.3670467270691235e-05, "loss": 0.6102, "step": 217600 }, { "epoch": 5.015197568389058, "grad_norm": 2.5023694038391113, "learning_rate": 2.366319570686872e-05, "loss": 0.6157, "step": 217800 }, { "epoch": 5.019802892143318, "grad_norm": 3.309704542160034, "learning_rate": 2.3655924143046203e-05, "loss": 0.6056, "step": 218000 }, { "epoch": 5.024408215897577, "grad_norm": 2.3339779376983643, "learning_rate": 2.364865257922369e-05, "loss": 0.6094, "step": 218200 }, { "epoch": 5.029013539651838, "grad_norm": 3.4080960750579834, "learning_rate": 2.364138101540117e-05, "loss": 0.6022, "step": 218400 }, { "epoch": 5.033618863406097, "grad_norm": 3.769949197769165, "learning_rate": 2.3634109451578657e-05, "loss": 0.6158, "step": 218600 }, { "epoch": 5.038224187160357, "grad_norm": 3.2221176624298096, "learning_rate": 2.362683788775614e-05, "loss": 0.6077, "step": 218800 }, { "epoch": 5.0428295109146175, "grad_norm": 2.9638614654541016, "learning_rate": 2.3619566323933626e-05, "loss": 0.6134, "step": 219000 }, { "epoch": 5.047434834668877, "grad_norm": 3.3254809379577637, "learning_rate": 2.3612294760111112e-05, "loss": 0.6121, "step": 219200 }, { "epoch": 5.052040158423138, "grad_norm": 3.3986082077026367, "learning_rate": 2.3605023196288594e-05, "loss": 0.6231, "step": 219400 }, { "epoch": 5.056645482177397, "grad_norm": 3.187117099761963, "learning_rate": 2.359775163246608e-05, "loss": 0.6075, "step": 219600 }, { "epoch": 5.061250805931657, "grad_norm": 3.4964001178741455, "learning_rate": 2.3590480068643563e-05, "loss": 0.6022, "step": 219800 }, { "epoch": 5.065856129685917, "grad_norm": 2.966295003890991, "learning_rate": 2.3583208504821045e-05, "loss": 0.6103, "step": 220000 }, { "epoch": 5.070461453440177, "grad_norm": 2.458843946456909, "learning_rate": 2.357593694099853e-05, "loss": 0.6164, "step": 220200 }, { "epoch": 5.075066777194436, "grad_norm": 2.4280691146850586, "learning_rate": 2.3568665377176017e-05, "loss": 0.6103, "step": 220400 }, { "epoch": 5.079672100948697, "grad_norm": 2.904318332672119, "learning_rate": 2.35613938133535e-05, "loss": 0.6018, "step": 220600 }, { "epoch": 5.0842774247029565, "grad_norm": 3.2932612895965576, "learning_rate": 2.3554122249530986e-05, "loss": 0.6065, "step": 220800 }, { "epoch": 5.088882748457216, "grad_norm": 3.404580593109131, "learning_rate": 2.3546850685708468e-05, "loss": 0.6202, "step": 221000 }, { "epoch": 5.093488072211477, "grad_norm": 3.106785535812378, "learning_rate": 2.3539579121885954e-05, "loss": 0.6062, "step": 221200 }, { "epoch": 5.098093395965736, "grad_norm": 3.391956329345703, "learning_rate": 2.3532307558063437e-05, "loss": 0.6091, "step": 221400 }, { "epoch": 5.102698719719997, "grad_norm": 2.8422746658325195, "learning_rate": 2.3525035994240922e-05, "loss": 0.6021, "step": 221600 }, { "epoch": 5.107304043474256, "grad_norm": 2.9199986457824707, "learning_rate": 2.351776443041841e-05, "loss": 0.6146, "step": 221800 }, { "epoch": 5.111909367228516, "grad_norm": 3.0510222911834717, "learning_rate": 2.351049286659589e-05, "loss": 0.6042, "step": 222000 }, { "epoch": 5.116514690982776, "grad_norm": 4.642153263092041, "learning_rate": 2.350325766059249e-05, "loss": 0.6086, "step": 222200 }, { "epoch": 5.121120014737036, "grad_norm": 4.9221086502075195, "learning_rate": 2.3495986096769972e-05, "loss": 0.6069, "step": 222400 }, { "epoch": 5.1257253384912955, "grad_norm": 3.0352025032043457, "learning_rate": 2.3488714532947455e-05, "loss": 0.6003, "step": 222600 }, { "epoch": 5.130330662245556, "grad_norm": 3.675919771194458, "learning_rate": 2.348144296912494e-05, "loss": 0.6157, "step": 222800 }, { "epoch": 5.134935985999816, "grad_norm": 3.621706247329712, "learning_rate": 2.3474171405302426e-05, "loss": 0.6081, "step": 223000 }, { "epoch": 5.139541309754076, "grad_norm": 3.0481488704681396, "learning_rate": 2.346689984147991e-05, "loss": 0.6155, "step": 223200 }, { "epoch": 5.144146633508336, "grad_norm": 3.5414528846740723, "learning_rate": 2.3459628277657395e-05, "loss": 0.6054, "step": 223400 }, { "epoch": 5.148751957262595, "grad_norm": 3.2819435596466064, "learning_rate": 2.345235671383488e-05, "loss": 0.6131, "step": 223600 }, { "epoch": 5.153357281016856, "grad_norm": 2.693819761276245, "learning_rate": 2.344508515001236e-05, "loss": 0.6082, "step": 223800 }, { "epoch": 5.157962604771115, "grad_norm": 2.757230520248413, "learning_rate": 2.3437813586189846e-05, "loss": 0.6084, "step": 224000 }, { "epoch": 5.162567928525375, "grad_norm": 3.3219823837280273, "learning_rate": 2.343054202236733e-05, "loss": 0.6068, "step": 224200 }, { "epoch": 5.167173252279635, "grad_norm": 2.8286936283111572, "learning_rate": 2.3423270458544814e-05, "loss": 0.6043, "step": 224400 }, { "epoch": 5.171778576033895, "grad_norm": 3.0710113048553467, "learning_rate": 2.34159988947223e-05, "loss": 0.6132, "step": 224600 }, { "epoch": 5.1763838997881555, "grad_norm": 3.501431465148926, "learning_rate": 2.3408727330899786e-05, "loss": 0.6119, "step": 224800 }, { "epoch": 5.180989223542415, "grad_norm": 2.9645442962646484, "learning_rate": 2.3401455767077265e-05, "loss": 0.6027, "step": 225000 }, { "epoch": 5.185594547296675, "grad_norm": 3.057513475418091, "learning_rate": 2.3394220561073864e-05, "loss": 0.6105, "step": 225200 }, { "epoch": 5.190199871050935, "grad_norm": 3.0226759910583496, "learning_rate": 2.338694899725135e-05, "loss": 0.6052, "step": 225400 }, { "epoch": 5.194805194805195, "grad_norm": 2.725600481033325, "learning_rate": 2.3379677433428832e-05, "loss": 0.6066, "step": 225600 }, { "epoch": 5.199410518559454, "grad_norm": 3.8302340507507324, "learning_rate": 2.3372405869606318e-05, "loss": 0.6098, "step": 225800 }, { "epoch": 5.204015842313715, "grad_norm": 2.8420770168304443, "learning_rate": 2.3365134305783804e-05, "loss": 0.6164, "step": 226000 }, { "epoch": 5.208621166067974, "grad_norm": 3.4650795459747314, "learning_rate": 2.3357862741961287e-05, "loss": 0.6102, "step": 226200 }, { "epoch": 5.213226489822235, "grad_norm": 3.013132095336914, "learning_rate": 2.3350591178138772e-05, "loss": 0.6133, "step": 226400 }, { "epoch": 5.2178318135764945, "grad_norm": 3.517982244491577, "learning_rate": 2.3343319614316255e-05, "loss": 0.6042, "step": 226600 }, { "epoch": 5.222437137330754, "grad_norm": 3.570617914199829, "learning_rate": 2.3336048050493737e-05, "loss": 0.6194, "step": 226800 }, { "epoch": 5.227042461085015, "grad_norm": 3.553957223892212, "learning_rate": 2.3328776486671223e-05, "loss": 0.6028, "step": 227000 }, { "epoch": 5.231647784839274, "grad_norm": 3.234729051589966, "learning_rate": 2.332150492284871e-05, "loss": 0.6069, "step": 227200 }, { "epoch": 5.236253108593534, "grad_norm": 2.738168478012085, "learning_rate": 2.3314233359026195e-05, "loss": 0.6122, "step": 227400 }, { "epoch": 5.240858432347794, "grad_norm": 3.4680685997009277, "learning_rate": 2.3306961795203678e-05, "loss": 0.6166, "step": 227600 }, { "epoch": 5.245463756102054, "grad_norm": 3.349167823791504, "learning_rate": 2.329969023138116e-05, "loss": 0.6182, "step": 227800 }, { "epoch": 5.250069079856313, "grad_norm": 2.7940080165863037, "learning_rate": 2.3292418667558646e-05, "loss": 0.6073, "step": 228000 }, { "epoch": 5.254674403610574, "grad_norm": 3.263004779815674, "learning_rate": 2.328514710373613e-05, "loss": 0.6102, "step": 228200 }, { "epoch": 5.2592797273648335, "grad_norm": 3.2109506130218506, "learning_rate": 2.3277875539913615e-05, "loss": 0.6147, "step": 228400 }, { "epoch": 5.263885051119094, "grad_norm": 3.7838523387908936, "learning_rate": 2.32706039760911e-05, "loss": 0.61, "step": 228600 }, { "epoch": 5.268490374873354, "grad_norm": 2.4661216735839844, "learning_rate": 2.3263332412268583e-05, "loss": 0.6123, "step": 228800 }, { "epoch": 5.273095698627613, "grad_norm": 3.483590841293335, "learning_rate": 2.325606084844607e-05, "loss": 0.6215, "step": 229000 }, { "epoch": 5.277701022381874, "grad_norm": 2.8300187587738037, "learning_rate": 2.324878928462355e-05, "loss": 0.6036, "step": 229200 }, { "epoch": 5.282306346136133, "grad_norm": 3.444559097290039, "learning_rate": 2.3241517720801034e-05, "loss": 0.6136, "step": 229400 }, { "epoch": 5.286911669890394, "grad_norm": 3.6756949424743652, "learning_rate": 2.323424615697852e-05, "loss": 0.621, "step": 229600 }, { "epoch": 5.291516993644653, "grad_norm": 3.188176393508911, "learning_rate": 2.322701095097512e-05, "loss": 0.6068, "step": 229800 }, { "epoch": 5.296122317398913, "grad_norm": 3.3259594440460205, "learning_rate": 2.32197393871526e-05, "loss": 0.6159, "step": 230000 }, { "epoch": 5.300727641153173, "grad_norm": 2.9103612899780273, "learning_rate": 2.3212467823330087e-05, "loss": 0.6131, "step": 230200 }, { "epoch": 5.305332964907433, "grad_norm": 2.856694459915161, "learning_rate": 2.3205196259507573e-05, "loss": 0.608, "step": 230400 }, { "epoch": 5.309938288661693, "grad_norm": 3.170351982116699, "learning_rate": 2.3197924695685052e-05, "loss": 0.6211, "step": 230600 }, { "epoch": 5.314543612415953, "grad_norm": 2.481973648071289, "learning_rate": 2.3190653131862538e-05, "loss": 0.6183, "step": 230800 }, { "epoch": 5.319148936170213, "grad_norm": 2.819699287414551, "learning_rate": 2.3183381568040024e-05, "loss": 0.6043, "step": 231000 }, { "epoch": 5.323754259924472, "grad_norm": 2.6565167903900146, "learning_rate": 2.3176110004217506e-05, "loss": 0.6098, "step": 231200 }, { "epoch": 5.328359583678733, "grad_norm": 3.6145877838134766, "learning_rate": 2.3168838440394992e-05, "loss": 0.6108, "step": 231400 }, { "epoch": 5.332964907432992, "grad_norm": 3.620488166809082, "learning_rate": 2.3161566876572478e-05, "loss": 0.6066, "step": 231600 }, { "epoch": 5.337570231187253, "grad_norm": 3.417673110961914, "learning_rate": 2.3154331670569073e-05, "loss": 0.6115, "step": 231800 }, { "epoch": 5.342175554941512, "grad_norm": 3.6562862396240234, "learning_rate": 2.3147060106746556e-05, "loss": 0.6219, "step": 232000 }, { "epoch": 5.346780878695772, "grad_norm": 3.1641769409179688, "learning_rate": 2.3139788542924042e-05, "loss": 0.5932, "step": 232200 }, { "epoch": 5.3513862024500325, "grad_norm": 3.7529780864715576, "learning_rate": 2.3132516979101524e-05, "loss": 0.6114, "step": 232400 }, { "epoch": 5.355991526204292, "grad_norm": 4.237635612487793, "learning_rate": 2.3125281773098123e-05, "loss": 0.6147, "step": 232600 }, { "epoch": 5.360596849958552, "grad_norm": 3.604637861251831, "learning_rate": 2.311801020927561e-05, "loss": 0.6077, "step": 232800 }, { "epoch": 5.365202173712812, "grad_norm": 3.4843764305114746, "learning_rate": 2.311073864545309e-05, "loss": 0.6114, "step": 233000 }, { "epoch": 5.369807497467072, "grad_norm": 2.583153486251831, "learning_rate": 2.3103467081630577e-05, "loss": 0.5862, "step": 233200 }, { "epoch": 5.374412821221332, "grad_norm": 2.859898328781128, "learning_rate": 2.3096195517808063e-05, "loss": 0.6113, "step": 233400 }, { "epoch": 5.379018144975592, "grad_norm": 3.472050666809082, "learning_rate": 2.3088923953985542e-05, "loss": 0.6087, "step": 233600 }, { "epoch": 5.383623468729851, "grad_norm": 3.197916030883789, "learning_rate": 2.308168874798214e-05, "loss": 0.602, "step": 233800 }, { "epoch": 5.388228792484112, "grad_norm": 3.002330780029297, "learning_rate": 2.3074417184159624e-05, "loss": 0.6079, "step": 234000 }, { "epoch": 5.3928341162383715, "grad_norm": 3.3833117485046387, "learning_rate": 2.306714562033711e-05, "loss": 0.6113, "step": 234200 }, { "epoch": 5.397439439992631, "grad_norm": 2.8529210090637207, "learning_rate": 2.3059874056514595e-05, "loss": 0.6106, "step": 234400 }, { "epoch": 5.402044763746892, "grad_norm": 3.600402593612671, "learning_rate": 2.305260249269208e-05, "loss": 0.6042, "step": 234600 }, { "epoch": 5.406650087501151, "grad_norm": 2.895305633544922, "learning_rate": 2.3045330928869564e-05, "loss": 0.6136, "step": 234800 }, { "epoch": 5.411255411255412, "grad_norm": 2.833522319793701, "learning_rate": 2.3038059365047046e-05, "loss": 0.6148, "step": 235000 }, { "epoch": 5.415860735009671, "grad_norm": 3.2785439491271973, "learning_rate": 2.3030787801224532e-05, "loss": 0.5955, "step": 235200 }, { "epoch": 5.420466058763931, "grad_norm": 2.5901570320129395, "learning_rate": 2.3023516237402015e-05, "loss": 0.6243, "step": 235400 }, { "epoch": 5.425071382518191, "grad_norm": 2.593280076980591, "learning_rate": 2.30162446735795e-05, "loss": 0.6095, "step": 235600 }, { "epoch": 5.429676706272451, "grad_norm": 2.4974257946014404, "learning_rate": 2.3008973109756987e-05, "loss": 0.6196, "step": 235800 }, { "epoch": 5.4342820300267105, "grad_norm": 2.7232484817504883, "learning_rate": 2.3001737903753582e-05, "loss": 0.6088, "step": 236000 }, { "epoch": 5.438887353780971, "grad_norm": 3.5613925457000732, "learning_rate": 2.2994466339931068e-05, "loss": 0.6062, "step": 236200 }, { "epoch": 5.443492677535231, "grad_norm": 3.134225368499756, "learning_rate": 2.2987194776108554e-05, "loss": 0.6045, "step": 236400 }, { "epoch": 5.448098001289491, "grad_norm": 3.1845412254333496, "learning_rate": 2.2979923212286033e-05, "loss": 0.6135, "step": 236600 }, { "epoch": 5.452703325043751, "grad_norm": 3.119331121444702, "learning_rate": 2.297265164846352e-05, "loss": 0.6089, "step": 236800 }, { "epoch": 5.45730864879801, "grad_norm": 3.175262928009033, "learning_rate": 2.2965380084641005e-05, "loss": 0.6185, "step": 237000 }, { "epoch": 5.461913972552271, "grad_norm": 2.737478733062744, "learning_rate": 2.2958108520818487e-05, "loss": 0.6136, "step": 237200 }, { "epoch": 5.46651929630653, "grad_norm": 3.8264358043670654, "learning_rate": 2.2950836956995973e-05, "loss": 0.5989, "step": 237400 }, { "epoch": 5.47112462006079, "grad_norm": 2.678365468978882, "learning_rate": 2.294356539317346e-05, "loss": 0.6062, "step": 237600 }, { "epoch": 5.47572994381505, "grad_norm": 3.2151529788970947, "learning_rate": 2.2936293829350938e-05, "loss": 0.6091, "step": 237800 }, { "epoch": 5.48033526756931, "grad_norm": 2.8425002098083496, "learning_rate": 2.2929022265528424e-05, "loss": 0.6134, "step": 238000 }, { "epoch": 5.48494059132357, "grad_norm": 2.8937878608703613, "learning_rate": 2.292175070170591e-05, "loss": 0.6041, "step": 238200 }, { "epoch": 5.48954591507783, "grad_norm": 3.9438226222991943, "learning_rate": 2.2914479137883392e-05, "loss": 0.6061, "step": 238400 }, { "epoch": 5.49415123883209, "grad_norm": 2.902695655822754, "learning_rate": 2.290720757406088e-05, "loss": 0.6239, "step": 238600 }, { "epoch": 5.49875656258635, "grad_norm": 2.9250173568725586, "learning_rate": 2.2899936010238364e-05, "loss": 0.6094, "step": 238800 }, { "epoch": 5.50336188634061, "grad_norm": 3.668975591659546, "learning_rate": 2.2892664446415847e-05, "loss": 0.61, "step": 239000 }, { "epoch": 5.507967210094869, "grad_norm": 3.020686626434326, "learning_rate": 2.288539288259333e-05, "loss": 0.621, "step": 239200 }, { "epoch": 5.51257253384913, "grad_norm": 3.6001780033111572, "learning_rate": 2.2878121318770815e-05, "loss": 0.6037, "step": 239400 }, { "epoch": 5.517177857603389, "grad_norm": 2.8009440898895264, "learning_rate": 2.28708497549483e-05, "loss": 0.6158, "step": 239600 }, { "epoch": 5.52178318135765, "grad_norm": 2.9138927459716797, "learning_rate": 2.2863578191125784e-05, "loss": 0.6091, "step": 239800 }, { "epoch": 5.5263885051119095, "grad_norm": 3.2366840839385986, "learning_rate": 2.285630662730327e-05, "loss": 0.61, "step": 240000 }, { "epoch": 5.530993828866169, "grad_norm": 3.211121082305908, "learning_rate": 2.2849071421299865e-05, "loss": 0.6203, "step": 240200 }, { "epoch": 5.53559915262043, "grad_norm": 2.9791855812072754, "learning_rate": 2.284179985747735e-05, "loss": 0.611, "step": 240400 }, { "epoch": 5.540204476374689, "grad_norm": 2.9468283653259277, "learning_rate": 2.2834528293654833e-05, "loss": 0.5963, "step": 240600 }, { "epoch": 5.544809800128949, "grad_norm": 2.6335391998291016, "learning_rate": 2.282725672983232e-05, "loss": 0.5982, "step": 240800 }, { "epoch": 5.549415123883209, "grad_norm": 3.262544870376587, "learning_rate": 2.28199851660098e-05, "loss": 0.6005, "step": 241000 }, { "epoch": 5.554020447637469, "grad_norm": 2.9877734184265137, "learning_rate": 2.28127499600064e-05, "loss": 0.6062, "step": 241200 }, { "epoch": 5.558625771391728, "grad_norm": 3.008165121078491, "learning_rate": 2.2805478396183883e-05, "loss": 0.6056, "step": 241400 }, { "epoch": 5.563231095145989, "grad_norm": 2.6136744022369385, "learning_rate": 2.279820683236137e-05, "loss": 0.604, "step": 241600 }, { "epoch": 5.5678364189002485, "grad_norm": 3.0362038612365723, "learning_rate": 2.2790935268538855e-05, "loss": 0.6016, "step": 241800 }, { "epoch": 5.572441742654509, "grad_norm": 3.641286849975586, "learning_rate": 2.2783663704716334e-05, "loss": 0.6089, "step": 242000 }, { "epoch": 5.577047066408769, "grad_norm": 3.9004762172698975, "learning_rate": 2.277639214089382e-05, "loss": 0.6049, "step": 242200 }, { "epoch": 5.581652390163028, "grad_norm": 3.441751003265381, "learning_rate": 2.2769120577071306e-05, "loss": 0.6185, "step": 242400 }, { "epoch": 5.586257713917289, "grad_norm": 2.6894123554229736, "learning_rate": 2.276184901324879e-05, "loss": 0.614, "step": 242600 }, { "epoch": 5.590863037671548, "grad_norm": 3.961691379547119, "learning_rate": 2.2754577449426274e-05, "loss": 0.6081, "step": 242800 }, { "epoch": 5.595468361425809, "grad_norm": 4.229228496551514, "learning_rate": 2.274730588560376e-05, "loss": 0.6125, "step": 243000 }, { "epoch": 5.600073685180068, "grad_norm": 2.8456718921661377, "learning_rate": 2.2740034321781242e-05, "loss": 0.6013, "step": 243200 }, { "epoch": 5.604679008934328, "grad_norm": 2.563215494155884, "learning_rate": 2.2732762757958725e-05, "loss": 0.6055, "step": 243400 }, { "epoch": 5.609284332688588, "grad_norm": 3.1517744064331055, "learning_rate": 2.272549119413621e-05, "loss": 0.598, "step": 243600 }, { "epoch": 5.613889656442848, "grad_norm": 3.084362030029297, "learning_rate": 2.2718219630313697e-05, "loss": 0.6114, "step": 243800 }, { "epoch": 5.618494980197108, "grad_norm": 2.9705121517181396, "learning_rate": 2.271094806649118e-05, "loss": 0.6105, "step": 244000 }, { "epoch": 5.623100303951368, "grad_norm": 2.46872615814209, "learning_rate": 2.2703676502668665e-05, "loss": 0.6021, "step": 244200 }, { "epoch": 5.627705627705628, "grad_norm": 3.1614882946014404, "learning_rate": 2.269640493884615e-05, "loss": 0.6155, "step": 244400 }, { "epoch": 5.632310951459887, "grad_norm": 3.2688674926757812, "learning_rate": 2.268913337502363e-05, "loss": 0.617, "step": 244600 }, { "epoch": 5.636916275214148, "grad_norm": 2.7544074058532715, "learning_rate": 2.2681861811201116e-05, "loss": 0.6218, "step": 244800 }, { "epoch": 5.641521598968407, "grad_norm": 3.150360345840454, "learning_rate": 2.2674590247378602e-05, "loss": 0.6118, "step": 245000 }, { "epoch": 5.646126922722667, "grad_norm": 3.299985647201538, "learning_rate": 2.2667318683556088e-05, "loss": 0.6018, "step": 245200 }, { "epoch": 5.650732246476927, "grad_norm": 3.0777273178100586, "learning_rate": 2.266004711973357e-05, "loss": 0.6136, "step": 245400 }, { "epoch": 5.655337570231187, "grad_norm": 3.37021803855896, "learning_rate": 2.2652775555911056e-05, "loss": 0.6124, "step": 245600 }, { "epoch": 5.6599428939854475, "grad_norm": 3.1448745727539062, "learning_rate": 2.264550399208854e-05, "loss": 0.6185, "step": 245800 }, { "epoch": 5.664548217739707, "grad_norm": 3.1662933826446533, "learning_rate": 2.263823242826602e-05, "loss": 0.6087, "step": 246000 }, { "epoch": 5.669153541493967, "grad_norm": 2.839693546295166, "learning_rate": 2.2630960864443507e-05, "loss": 0.6074, "step": 246200 }, { "epoch": 5.673758865248227, "grad_norm": 3.3215548992156982, "learning_rate": 2.2623689300620993e-05, "loss": 0.6169, "step": 246400 }, { "epoch": 5.678364189002487, "grad_norm": 3.0180182456970215, "learning_rate": 2.2616417736798476e-05, "loss": 0.6126, "step": 246600 }, { "epoch": 5.682969512756747, "grad_norm": 2.791381597518921, "learning_rate": 2.2609146172975962e-05, "loss": 0.6059, "step": 246800 }, { "epoch": 5.687574836511007, "grad_norm": 3.137742280960083, "learning_rate": 2.2601874609153448e-05, "loss": 0.6082, "step": 247000 }, { "epoch": 5.692180160265266, "grad_norm": 3.0345163345336914, "learning_rate": 2.2594603045330927e-05, "loss": 0.5985, "step": 247200 }, { "epoch": 5.696785484019527, "grad_norm": 3.281728744506836, "learning_rate": 2.2587331481508413e-05, "loss": 0.5944, "step": 247400 }, { "epoch": 5.7013908077737865, "grad_norm": 2.6144189834594727, "learning_rate": 2.258009627550501e-05, "loss": 0.6033, "step": 247600 }, { "epoch": 5.705996131528046, "grad_norm": 3.488960027694702, "learning_rate": 2.2572824711682494e-05, "loss": 0.6041, "step": 247800 }, { "epoch": 5.710601455282307, "grad_norm": 2.470059871673584, "learning_rate": 2.256555314785998e-05, "loss": 0.5939, "step": 248000 }, { "epoch": 5.715206779036566, "grad_norm": 2.9853994846343994, "learning_rate": 2.2558281584037466e-05, "loss": 0.6016, "step": 248200 }, { "epoch": 5.719812102790826, "grad_norm": 2.8882532119750977, "learning_rate": 2.2551010020214948e-05, "loss": 0.614, "step": 248400 }, { "epoch": 5.724417426545086, "grad_norm": 2.4324746131896973, "learning_rate": 2.254373845639243e-05, "loss": 0.6169, "step": 248600 }, { "epoch": 5.729022750299346, "grad_norm": 3.518960952758789, "learning_rate": 2.253650325038903e-05, "loss": 0.6053, "step": 248800 }, { "epoch": 5.733628074053606, "grad_norm": 3.073248863220215, "learning_rate": 2.2529231686566512e-05, "loss": 0.6129, "step": 249000 }, { "epoch": 5.738233397807866, "grad_norm": 3.165026903152466, "learning_rate": 2.2521960122743998e-05, "loss": 0.6069, "step": 249200 }, { "epoch": 5.7428387215621255, "grad_norm": 3.6242356300354004, "learning_rate": 2.2514688558921484e-05, "loss": 0.617, "step": 249400 }, { "epoch": 5.747444045316386, "grad_norm": 2.7420730590820312, "learning_rate": 2.2507416995098966e-05, "loss": 0.6056, "step": 249600 }, { "epoch": 5.752049369070646, "grad_norm": 3.044351577758789, "learning_rate": 2.2500145431276452e-05, "loss": 0.6047, "step": 249800 }, { "epoch": 5.756654692824906, "grad_norm": 3.59561824798584, "learning_rate": 2.2492873867453935e-05, "loss": 0.5947, "step": 250000 }, { "epoch": 5.761260016579166, "grad_norm": 2.7053959369659424, "learning_rate": 2.2485602303631417e-05, "loss": 0.6167, "step": 250200 }, { "epoch": 5.765865340333425, "grad_norm": 3.2808427810668945, "learning_rate": 2.2478367097628016e-05, "loss": 0.6096, "step": 250400 }, { "epoch": 5.770470664087686, "grad_norm": 3.0669496059417725, "learning_rate": 2.2471095533805502e-05, "loss": 0.6172, "step": 250600 }, { "epoch": 5.775075987841945, "grad_norm": 3.3072104454040527, "learning_rate": 2.2463823969982984e-05, "loss": 0.6153, "step": 250800 }, { "epoch": 5.779681311596205, "grad_norm": 3.2189414501190186, "learning_rate": 2.245655240616047e-05, "loss": 0.6051, "step": 251000 }, { "epoch": 5.784286635350465, "grad_norm": 3.9628310203552246, "learning_rate": 2.2449280842337956e-05, "loss": 0.6128, "step": 251200 }, { "epoch": 5.788891959104725, "grad_norm": 2.860816717147827, "learning_rate": 2.244200927851544e-05, "loss": 0.6007, "step": 251400 }, { "epoch": 5.793497282858985, "grad_norm": 2.6828348636627197, "learning_rate": 2.243473771469292e-05, "loss": 0.5999, "step": 251600 }, { "epoch": 5.798102606613245, "grad_norm": 3.44728946685791, "learning_rate": 2.2427466150870407e-05, "loss": 0.6162, "step": 251800 }, { "epoch": 5.802707930367505, "grad_norm": 2.5375330448150635, "learning_rate": 2.242019458704789e-05, "loss": 0.6101, "step": 252000 }, { "epoch": 5.807313254121765, "grad_norm": 3.269181489944458, "learning_rate": 2.2412923023225375e-05, "loss": 0.6227, "step": 252200 }, { "epoch": 5.811918577876025, "grad_norm": 3.957916498184204, "learning_rate": 2.240565145940286e-05, "loss": 0.6036, "step": 252400 }, { "epoch": 5.816523901630284, "grad_norm": 3.4399096965789795, "learning_rate": 2.2398379895580344e-05, "loss": 0.6127, "step": 252600 }, { "epoch": 5.821129225384545, "grad_norm": 3.4735524654388428, "learning_rate": 2.2391108331757826e-05, "loss": 0.6012, "step": 252800 }, { "epoch": 5.825734549138804, "grad_norm": 2.868939161300659, "learning_rate": 2.2383836767935312e-05, "loss": 0.6138, "step": 253000 }, { "epoch": 5.830339872893065, "grad_norm": 3.8807289600372314, "learning_rate": 2.2376565204112798e-05, "loss": 0.6036, "step": 253200 }, { "epoch": 5.8349451966473245, "grad_norm": 3.3366916179656982, "learning_rate": 2.236929364029028e-05, "loss": 0.6151, "step": 253400 }, { "epoch": 5.839550520401584, "grad_norm": 3.0559029579162598, "learning_rate": 2.2362022076467767e-05, "loss": 0.6081, "step": 253600 }, { "epoch": 5.8441558441558445, "grad_norm": 2.922211170196533, "learning_rate": 2.2354750512645253e-05, "loss": 0.611, "step": 253800 }, { "epoch": 5.848761167910104, "grad_norm": 3.6084461212158203, "learning_rate": 2.2347478948822735e-05, "loss": 0.6, "step": 254000 }, { "epoch": 5.853366491664364, "grad_norm": 3.366588592529297, "learning_rate": 2.2340207385000218e-05, "loss": 0.6079, "step": 254200 }, { "epoch": 5.857971815418624, "grad_norm": 4.779687404632568, "learning_rate": 2.2332935821177704e-05, "loss": 0.5983, "step": 254400 }, { "epoch": 5.862577139172884, "grad_norm": 2.4821720123291016, "learning_rate": 2.2325664257355186e-05, "loss": 0.5977, "step": 254600 }, { "epoch": 5.867182462927143, "grad_norm": 3.079432487487793, "learning_rate": 2.2318392693532672e-05, "loss": 0.6135, "step": 254800 }, { "epoch": 5.871787786681404, "grad_norm": 2.8621842861175537, "learning_rate": 2.2311121129710158e-05, "loss": 0.615, "step": 255000 }, { "epoch": 5.8763931104356635, "grad_norm": 3.1682121753692627, "learning_rate": 2.230384956588764e-05, "loss": 0.6029, "step": 255200 }, { "epoch": 5.880998434189923, "grad_norm": 3.040579080581665, "learning_rate": 2.2296578002065123e-05, "loss": 0.6091, "step": 255400 }, { "epoch": 5.885603757944184, "grad_norm": 3.665220260620117, "learning_rate": 2.228930643824261e-05, "loss": 0.6065, "step": 255600 }, { "epoch": 5.890209081698443, "grad_norm": 2.9831740856170654, "learning_rate": 2.228203487442009e-05, "loss": 0.6027, "step": 255800 }, { "epoch": 5.894814405452704, "grad_norm": 2.8328495025634766, "learning_rate": 2.2274763310597577e-05, "loss": 0.6086, "step": 256000 }, { "epoch": 5.899419729206963, "grad_norm": 3.341989755630493, "learning_rate": 2.2267528104594176e-05, "loss": 0.6206, "step": 256200 }, { "epoch": 5.904025052961223, "grad_norm": 2.885848045349121, "learning_rate": 2.226025654077166e-05, "loss": 0.6136, "step": 256400 }, { "epoch": 5.908630376715483, "grad_norm": 2.7301769256591797, "learning_rate": 2.2252984976949144e-05, "loss": 0.6032, "step": 256600 }, { "epoch": 5.913235700469743, "grad_norm": 3.254574775695801, "learning_rate": 2.2245713413126627e-05, "loss": 0.6161, "step": 256800 }, { "epoch": 5.917841024224003, "grad_norm": 3.9567196369171143, "learning_rate": 2.223844184930411e-05, "loss": 0.6057, "step": 257000 }, { "epoch": 5.922446347978263, "grad_norm": 2.612321615219116, "learning_rate": 2.2231170285481595e-05, "loss": 0.6112, "step": 257200 }, { "epoch": 5.927051671732523, "grad_norm": 2.945931911468506, "learning_rate": 2.222389872165908e-05, "loss": 0.6134, "step": 257400 }, { "epoch": 5.931656995486783, "grad_norm": 3.260687828063965, "learning_rate": 2.2216627157836564e-05, "loss": 0.5979, "step": 257600 }, { "epoch": 5.936262319241043, "grad_norm": 2.9708778858184814, "learning_rate": 2.220935559401405e-05, "loss": 0.6135, "step": 257800 }, { "epoch": 5.940867642995302, "grad_norm": 3.1571850776672363, "learning_rate": 2.2202084030191536e-05, "loss": 0.615, "step": 258000 }, { "epoch": 5.945472966749563, "grad_norm": 3.849186658859253, "learning_rate": 2.2194812466369018e-05, "loss": 0.62, "step": 258200 }, { "epoch": 5.950078290503822, "grad_norm": 3.6725234985351562, "learning_rate": 2.21875409025465e-05, "loss": 0.6047, "step": 258400 }, { "epoch": 5.954683614258082, "grad_norm": 2.586672306060791, "learning_rate": 2.2180269338723987e-05, "loss": 0.6092, "step": 258600 }, { "epoch": 5.959288938012342, "grad_norm": 3.1655867099761963, "learning_rate": 2.2172997774901473e-05, "loss": 0.609, "step": 258800 }, { "epoch": 5.963894261766602, "grad_norm": 2.586534023284912, "learning_rate": 2.2165726211078955e-05, "loss": 0.6168, "step": 259000 }, { "epoch": 5.9684995855208625, "grad_norm": 2.9593117237091064, "learning_rate": 2.215845464725644e-05, "loss": 0.6058, "step": 259200 }, { "epoch": 5.973104909275122, "grad_norm": 3.1021628379821777, "learning_rate": 2.2151183083433923e-05, "loss": 0.6136, "step": 259400 }, { "epoch": 5.977710233029382, "grad_norm": 3.3624014854431152, "learning_rate": 2.2143911519611406e-05, "loss": 0.6066, "step": 259600 }, { "epoch": 5.982315556783642, "grad_norm": 3.2505760192871094, "learning_rate": 2.2136639955788892e-05, "loss": 0.6154, "step": 259800 }, { "epoch": 5.986920880537902, "grad_norm": 3.521606683731079, "learning_rate": 2.2129368391966378e-05, "loss": 0.6109, "step": 260000 }, { "epoch": 5.991526204292162, "grad_norm": 3.341179370880127, "learning_rate": 2.212209682814386e-05, "loss": 0.6179, "step": 260200 }, { "epoch": 5.996131528046422, "grad_norm": 3.8948066234588623, "learning_rate": 2.211486162214046e-05, "loss": 0.6183, "step": 260400 }, { "epoch": 6.0, "eval_loss": 0.5793450474739075, "eval_runtime": 146.4894, "eval_samples_per_second": 193.604, "eval_steps_per_second": 12.103, "step": 260568 }, { "epoch": 6.000736851800681, "grad_norm": 3.0538671016693115, "learning_rate": 2.2107626416137054e-05, "loss": 0.6025, "step": 260600 }, { "epoch": 6.005342175554942, "grad_norm": 2.459566354751587, "learning_rate": 2.210035485231454e-05, "loss": 0.6142, "step": 260800 }, { "epoch": 6.0099474993092015, "grad_norm": 3.1511268615722656, "learning_rate": 2.2093083288492026e-05, "loss": 0.6067, "step": 261000 }, { "epoch": 6.014552823063461, "grad_norm": 2.9049158096313477, "learning_rate": 2.2085811724669505e-05, "loss": 0.6013, "step": 261200 }, { "epoch": 6.0191581468177215, "grad_norm": 2.9899065494537354, "learning_rate": 2.207854016084699e-05, "loss": 0.5913, "step": 261400 }, { "epoch": 6.023763470571981, "grad_norm": 3.2502787113189697, "learning_rate": 2.2071268597024477e-05, "loss": 0.5923, "step": 261600 }, { "epoch": 6.028368794326241, "grad_norm": 3.313624382019043, "learning_rate": 2.2063997033201963e-05, "loss": 0.6092, "step": 261800 }, { "epoch": 6.032974118080501, "grad_norm": 3.1583757400512695, "learning_rate": 2.2056725469379445e-05, "loss": 0.6048, "step": 262000 }, { "epoch": 6.037579441834761, "grad_norm": 2.4791722297668457, "learning_rate": 2.204945390555693e-05, "loss": 0.6054, "step": 262200 }, { "epoch": 6.042184765589021, "grad_norm": 3.2906877994537354, "learning_rate": 2.2042182341734414e-05, "loss": 0.6054, "step": 262400 }, { "epoch": 6.046790089343281, "grad_norm": 3.4483656883239746, "learning_rate": 2.2034910777911896e-05, "loss": 0.5909, "step": 262600 }, { "epoch": 6.0513954130975405, "grad_norm": 3.3106260299682617, "learning_rate": 2.2027639214089382e-05, "loss": 0.6066, "step": 262800 }, { "epoch": 6.056000736851801, "grad_norm": 3.107494354248047, "learning_rate": 2.2020367650266868e-05, "loss": 0.5997, "step": 263000 }, { "epoch": 6.0606060606060606, "grad_norm": 3.537893295288086, "learning_rate": 2.2013132444263463e-05, "loss": 0.5907, "step": 263200 }, { "epoch": 6.06521138436032, "grad_norm": 4.185168266296387, "learning_rate": 2.200586088044095e-05, "loss": 0.5958, "step": 263400 }, { "epoch": 6.069816708114581, "grad_norm": 2.7345850467681885, "learning_rate": 2.1998589316618435e-05, "loss": 0.6022, "step": 263600 }, { "epoch": 6.07442203186884, "grad_norm": 2.91339111328125, "learning_rate": 2.1991317752795914e-05, "loss": 0.6085, "step": 263800 }, { "epoch": 6.079027355623101, "grad_norm": 3.6052587032318115, "learning_rate": 2.19840461889734e-05, "loss": 0.5946, "step": 264000 }, { "epoch": 6.08363267937736, "grad_norm": 3.44474196434021, "learning_rate": 2.1976774625150886e-05, "loss": 0.6043, "step": 264200 }, { "epoch": 6.08823800313162, "grad_norm": 2.796043634414673, "learning_rate": 2.196950306132837e-05, "loss": 0.5965, "step": 264400 }, { "epoch": 6.09284332688588, "grad_norm": 2.4689533710479736, "learning_rate": 2.1962231497505855e-05, "loss": 0.6029, "step": 264600 }, { "epoch": 6.09744865064014, "grad_norm": 4.061243057250977, "learning_rate": 2.195495993368334e-05, "loss": 0.6063, "step": 264800 }, { "epoch": 6.1020539743944, "grad_norm": 3.009557008743286, "learning_rate": 2.1947688369860823e-05, "loss": 0.6208, "step": 265000 }, { "epoch": 6.10665929814866, "grad_norm": 3.3373091220855713, "learning_rate": 2.1940416806038306e-05, "loss": 0.609, "step": 265200 }, { "epoch": 6.11126462190292, "grad_norm": 2.8920810222625732, "learning_rate": 2.193314524221579e-05, "loss": 0.5935, "step": 265400 }, { "epoch": 6.11586994565718, "grad_norm": 4.441521644592285, "learning_rate": 2.1925873678393274e-05, "loss": 0.6031, "step": 265600 }, { "epoch": 6.12047526941144, "grad_norm": 3.1611151695251465, "learning_rate": 2.191860211457076e-05, "loss": 0.6097, "step": 265800 }, { "epoch": 6.125080593165699, "grad_norm": 2.591564178466797, "learning_rate": 2.1911330550748246e-05, "loss": 0.5988, "step": 266000 }, { "epoch": 6.12968591691996, "grad_norm": 2.5633347034454346, "learning_rate": 2.1904058986925732e-05, "loss": 0.6033, "step": 266200 }, { "epoch": 6.134291240674219, "grad_norm": 2.846238851547241, "learning_rate": 2.189678742310321e-05, "loss": 0.6011, "step": 266400 }, { "epoch": 6.138896564428479, "grad_norm": 2.9226114749908447, "learning_rate": 2.1889515859280697e-05, "loss": 0.6115, "step": 266600 }, { "epoch": 6.1435018881827395, "grad_norm": 3.3836491107940674, "learning_rate": 2.1882244295458183e-05, "loss": 0.6041, "step": 266800 }, { "epoch": 6.148107211936999, "grad_norm": 3.141162157058716, "learning_rate": 2.1874972731635665e-05, "loss": 0.5929, "step": 267000 }, { "epoch": 6.1527125356912595, "grad_norm": 3.0072133541107178, "learning_rate": 2.186770116781315e-05, "loss": 0.6038, "step": 267200 }, { "epoch": 6.157317859445519, "grad_norm": 3.065700054168701, "learning_rate": 2.1860465961809746e-05, "loss": 0.5946, "step": 267400 }, { "epoch": 6.161923183199779, "grad_norm": 2.743119239807129, "learning_rate": 2.1853194397987232e-05, "loss": 0.615, "step": 267600 }, { "epoch": 6.166528506954039, "grad_norm": 3.0159752368927, "learning_rate": 2.184595919198383e-05, "loss": 0.5905, "step": 267800 }, { "epoch": 6.171133830708299, "grad_norm": 3.077939033508301, "learning_rate": 2.1838687628161313e-05, "loss": 0.6043, "step": 268000 }, { "epoch": 6.175739154462558, "grad_norm": 3.3442816734313965, "learning_rate": 2.1831416064338796e-05, "loss": 0.606, "step": 268200 }, { "epoch": 6.180344478216819, "grad_norm": 2.753549337387085, "learning_rate": 2.1824144500516282e-05, "loss": 0.6119, "step": 268400 }, { "epoch": 6.1849498019710785, "grad_norm": 2.6323747634887695, "learning_rate": 2.1816872936693764e-05, "loss": 0.5995, "step": 268600 }, { "epoch": 6.189555125725338, "grad_norm": 3.0418403148651123, "learning_rate": 2.180960137287125e-05, "loss": 0.6196, "step": 268800 }, { "epoch": 6.1941604494795985, "grad_norm": 2.880768060684204, "learning_rate": 2.1802329809048736e-05, "loss": 0.5948, "step": 269000 }, { "epoch": 6.198765773233858, "grad_norm": 3.229389190673828, "learning_rate": 2.179505824522622e-05, "loss": 0.6088, "step": 269200 }, { "epoch": 6.203371096988119, "grad_norm": 2.6277809143066406, "learning_rate": 2.17877866814037e-05, "loss": 0.6097, "step": 269400 }, { "epoch": 6.207976420742378, "grad_norm": 2.8957467079162598, "learning_rate": 2.1780515117581187e-05, "loss": 0.6027, "step": 269600 }, { "epoch": 6.212581744496638, "grad_norm": 3.4677155017852783, "learning_rate": 2.1773243553758673e-05, "loss": 0.6016, "step": 269800 }, { "epoch": 6.217187068250898, "grad_norm": 2.457707405090332, "learning_rate": 2.1765971989936156e-05, "loss": 0.6114, "step": 270000 }, { "epoch": 6.221792392005158, "grad_norm": 3.322401762008667, "learning_rate": 2.1758773141751867e-05, "loss": 0.6114, "step": 270200 }, { "epoch": 6.2263977157594175, "grad_norm": 2.784977912902832, "learning_rate": 2.175150157792935e-05, "loss": 0.5959, "step": 270400 }, { "epoch": 6.231003039513678, "grad_norm": 2.8627588748931885, "learning_rate": 2.1744230014106835e-05, "loss": 0.6067, "step": 270600 }, { "epoch": 6.2356083632679375, "grad_norm": 3.0779459476470947, "learning_rate": 2.173695845028432e-05, "loss": 0.6007, "step": 270800 }, { "epoch": 6.240213687022198, "grad_norm": 3.0918633937835693, "learning_rate": 2.17296868864618e-05, "loss": 0.6034, "step": 271000 }, { "epoch": 6.244819010776458, "grad_norm": 3.245352029800415, "learning_rate": 2.1722415322639286e-05, "loss": 0.6013, "step": 271200 }, { "epoch": 6.249424334530717, "grad_norm": 3.179417610168457, "learning_rate": 2.1715143758816772e-05, "loss": 0.6043, "step": 271400 }, { "epoch": 6.254029658284978, "grad_norm": 3.051330804824829, "learning_rate": 2.1707872194994255e-05, "loss": 0.5994, "step": 271600 }, { "epoch": 6.258634982039237, "grad_norm": 3.0867514610290527, "learning_rate": 2.170060063117174e-05, "loss": 0.607, "step": 271800 }, { "epoch": 6.263240305793497, "grad_norm": 3.2159993648529053, "learning_rate": 2.1693329067349227e-05, "loss": 0.592, "step": 272000 }, { "epoch": 6.267845629547757, "grad_norm": 2.641268253326416, "learning_rate": 2.168605750352671e-05, "loss": 0.6019, "step": 272200 }, { "epoch": 6.272450953302017, "grad_norm": 3.2037994861602783, "learning_rate": 2.167878593970419e-05, "loss": 0.5966, "step": 272400 }, { "epoch": 6.2770562770562774, "grad_norm": 2.531846523284912, "learning_rate": 2.1671514375881678e-05, "loss": 0.598, "step": 272600 }, { "epoch": 6.281661600810537, "grad_norm": 3.13655161857605, "learning_rate": 2.1664242812059163e-05, "loss": 0.6102, "step": 272800 }, { "epoch": 6.286266924564797, "grad_norm": 3.5509562492370605, "learning_rate": 2.1656971248236646e-05, "loss": 0.6071, "step": 273000 }, { "epoch": 6.290872248319057, "grad_norm": 4.013696193695068, "learning_rate": 2.1649699684414132e-05, "loss": 0.6048, "step": 273200 }, { "epoch": 6.295477572073317, "grad_norm": 3.690094232559204, "learning_rate": 2.1642428120591618e-05, "loss": 0.6037, "step": 273400 }, { "epoch": 6.300082895827576, "grad_norm": 2.8810877799987793, "learning_rate": 2.1635156556769097e-05, "loss": 0.5939, "step": 273600 }, { "epoch": 6.304688219581837, "grad_norm": 2.392094612121582, "learning_rate": 2.1627884992946583e-05, "loss": 0.5881, "step": 273800 }, { "epoch": 6.309293543336096, "grad_norm": 2.9791033267974854, "learning_rate": 2.162061342912407e-05, "loss": 0.6063, "step": 274000 }, { "epoch": 6.313898867090357, "grad_norm": 3.6105103492736816, "learning_rate": 2.161334186530155e-05, "loss": 0.5895, "step": 274200 }, { "epoch": 6.3185041908446165, "grad_norm": 3.0321569442749023, "learning_rate": 2.1606070301479037e-05, "loss": 0.5868, "step": 274400 }, { "epoch": 6.323109514598876, "grad_norm": 2.3522133827209473, "learning_rate": 2.1598798737656523e-05, "loss": 0.5985, "step": 274600 }, { "epoch": 6.3277148383531365, "grad_norm": 3.172877550125122, "learning_rate": 2.1591527173834002e-05, "loss": 0.6006, "step": 274800 }, { "epoch": 6.332320162107396, "grad_norm": 3.2130961418151855, "learning_rate": 2.1584255610011488e-05, "loss": 0.6014, "step": 275000 }, { "epoch": 6.336925485861656, "grad_norm": 3.1269421577453613, "learning_rate": 2.1576984046188974e-05, "loss": 0.5893, "step": 275200 }, { "epoch": 6.341530809615916, "grad_norm": 2.8266472816467285, "learning_rate": 2.1569712482366457e-05, "loss": 0.6025, "step": 275400 }, { "epoch": 6.346136133370176, "grad_norm": 3.0055863857269287, "learning_rate": 2.1562440918543943e-05, "loss": 0.5972, "step": 275600 }, { "epoch": 6.350741457124435, "grad_norm": 3.1242921352386475, "learning_rate": 2.155516935472143e-05, "loss": 0.603, "step": 275800 }, { "epoch": 6.355346780878696, "grad_norm": 3.971442222595215, "learning_rate": 2.1547897790898914e-05, "loss": 0.6083, "step": 276000 }, { "epoch": 6.3599521046329555, "grad_norm": 3.2091002464294434, "learning_rate": 2.154066258489551e-05, "loss": 0.6112, "step": 276200 }, { "epoch": 6.364557428387216, "grad_norm": 3.378394842147827, "learning_rate": 2.1533391021072992e-05, "loss": 0.5943, "step": 276400 }, { "epoch": 6.3691627521414755, "grad_norm": 3.663804292678833, "learning_rate": 2.1526119457250475e-05, "loss": 0.5942, "step": 276600 }, { "epoch": 6.373768075895735, "grad_norm": 2.897817611694336, "learning_rate": 2.151884789342796e-05, "loss": 0.5977, "step": 276800 }, { "epoch": 6.378373399649996, "grad_norm": 3.405097484588623, "learning_rate": 2.1511576329605446e-05, "loss": 0.5935, "step": 277000 }, { "epoch": 6.382978723404255, "grad_norm": 3.117182970046997, "learning_rate": 2.150430476578293e-05, "loss": 0.6087, "step": 277200 }, { "epoch": 6.387584047158516, "grad_norm": 2.3870809078216553, "learning_rate": 2.1497033201960415e-05, "loss": 0.6, "step": 277400 }, { "epoch": 6.392189370912775, "grad_norm": 3.2303497791290283, "learning_rate": 2.1489761638137897e-05, "loss": 0.6026, "step": 277600 }, { "epoch": 6.396794694667035, "grad_norm": 3.261338710784912, "learning_rate": 2.1482490074315383e-05, "loss": 0.6004, "step": 277800 }, { "epoch": 6.401400018421295, "grad_norm": 2.4772233963012695, "learning_rate": 2.1475218510492866e-05, "loss": 0.6047, "step": 278000 }, { "epoch": 6.406005342175555, "grad_norm": 4.6650590896606445, "learning_rate": 2.1467946946670352e-05, "loss": 0.5949, "step": 278200 }, { "epoch": 6.4106106659298145, "grad_norm": 2.938955783843994, "learning_rate": 2.1460675382847838e-05, "loss": 0.6107, "step": 278400 }, { "epoch": 6.415215989684075, "grad_norm": 2.6229448318481445, "learning_rate": 2.145340381902532e-05, "loss": 0.6161, "step": 278600 }, { "epoch": 6.419821313438335, "grad_norm": 2.3591392040252686, "learning_rate": 2.1446132255202806e-05, "loss": 0.5965, "step": 278800 }, { "epoch": 6.424426637192594, "grad_norm": 3.7696409225463867, "learning_rate": 2.143886069138029e-05, "loss": 0.6014, "step": 279000 }, { "epoch": 6.429031960946855, "grad_norm": 3.477569818496704, "learning_rate": 2.143158912755777e-05, "loss": 0.5994, "step": 279200 }, { "epoch": 6.433637284701114, "grad_norm": 3.7880353927612305, "learning_rate": 2.1424317563735257e-05, "loss": 0.6106, "step": 279400 }, { "epoch": 6.438242608455375, "grad_norm": 2.632434844970703, "learning_rate": 2.1417045999912743e-05, "loss": 0.6119, "step": 279600 }, { "epoch": 6.442847932209634, "grad_norm": 2.5677168369293213, "learning_rate": 2.1409774436090226e-05, "loss": 0.5988, "step": 279800 }, { "epoch": 6.447453255963894, "grad_norm": 3.0066206455230713, "learning_rate": 2.140250287226771e-05, "loss": 0.6077, "step": 280000 }, { "epoch": 6.452058579718154, "grad_norm": 3.006856679916382, "learning_rate": 2.1395231308445194e-05, "loss": 0.5956, "step": 280200 }, { "epoch": 6.456663903472414, "grad_norm": 3.6395933628082275, "learning_rate": 2.138799610244179e-05, "loss": 0.6028, "step": 280400 }, { "epoch": 6.461269227226674, "grad_norm": 2.876479148864746, "learning_rate": 2.1380724538619275e-05, "loss": 0.6037, "step": 280600 }, { "epoch": 6.465874550980934, "grad_norm": 2.5705347061157227, "learning_rate": 2.137345297479676e-05, "loss": 0.6006, "step": 280800 }, { "epoch": 6.470479874735194, "grad_norm": 3.720731496810913, "learning_rate": 2.1366181410974244e-05, "loss": 0.599, "step": 281000 }, { "epoch": 6.475085198489454, "grad_norm": 4.109841823577881, "learning_rate": 2.135890984715173e-05, "loss": 0.6118, "step": 281200 }, { "epoch": 6.479690522243714, "grad_norm": 3.3773958683013916, "learning_rate": 2.1351638283329215e-05, "loss": 0.6039, "step": 281400 }, { "epoch": 6.484295845997973, "grad_norm": 2.8852438926696777, "learning_rate": 2.1344366719506694e-05, "loss": 0.6062, "step": 281600 }, { "epoch": 6.488901169752234, "grad_norm": 2.8208401203155518, "learning_rate": 2.1337131513503293e-05, "loss": 0.6038, "step": 281800 }, { "epoch": 6.4935064935064934, "grad_norm": 2.8988304138183594, "learning_rate": 2.132985994968078e-05, "loss": 0.6174, "step": 282000 }, { "epoch": 6.498111817260753, "grad_norm": 3.9682488441467285, "learning_rate": 2.132258838585826e-05, "loss": 0.5891, "step": 282200 }, { "epoch": 6.5027171410150135, "grad_norm": 2.687206745147705, "learning_rate": 2.1315316822035747e-05, "loss": 0.5942, "step": 282400 }, { "epoch": 6.507322464769273, "grad_norm": 2.6542367935180664, "learning_rate": 2.1308045258213233e-05, "loss": 0.6074, "step": 282600 }, { "epoch": 6.511927788523533, "grad_norm": 4.3083906173706055, "learning_rate": 2.1300773694390716e-05, "loss": 0.6039, "step": 282800 }, { "epoch": 6.516533112277793, "grad_norm": 3.527921199798584, "learning_rate": 2.1293502130568202e-05, "loss": 0.6026, "step": 283000 }, { "epoch": 6.521138436032053, "grad_norm": 2.9603312015533447, "learning_rate": 2.1286230566745684e-05, "loss": 0.6139, "step": 283200 }, { "epoch": 6.525743759786313, "grad_norm": 3.2545838356018066, "learning_rate": 2.1278959002923167e-05, "loss": 0.6099, "step": 283400 }, { "epoch": 6.530349083540573, "grad_norm": 3.675568103790283, "learning_rate": 2.1271687439100653e-05, "loss": 0.6022, "step": 283600 }, { "epoch": 6.5349544072948325, "grad_norm": 2.5842385292053223, "learning_rate": 2.126441587527814e-05, "loss": 0.608, "step": 283800 }, { "epoch": 6.539559731049093, "grad_norm": 3.1550087928771973, "learning_rate": 2.1257144311455625e-05, "loss": 0.5932, "step": 284000 }, { "epoch": 6.5441650548033525, "grad_norm": 2.6331937313079834, "learning_rate": 2.1249872747633107e-05, "loss": 0.5988, "step": 284200 }, { "epoch": 6.548770378557613, "grad_norm": 3.004626750946045, "learning_rate": 2.124260118381059e-05, "loss": 0.6025, "step": 284400 }, { "epoch": 6.553375702311873, "grad_norm": 2.480337142944336, "learning_rate": 2.1235329619988076e-05, "loss": 0.5956, "step": 284600 }, { "epoch": 6.557981026066132, "grad_norm": 3.277716875076294, "learning_rate": 2.1228058056165558e-05, "loss": 0.5961, "step": 284800 }, { "epoch": 6.562586349820393, "grad_norm": 2.759143590927124, "learning_rate": 2.1220786492343044e-05, "loss": 0.5875, "step": 285000 }, { "epoch": 6.567191673574652, "grad_norm": 2.480003833770752, "learning_rate": 2.121351492852053e-05, "loss": 0.5972, "step": 285200 }, { "epoch": 6.571796997328912, "grad_norm": 3.330686569213867, "learning_rate": 2.1206279722517125e-05, "loss": 0.5915, "step": 285400 }, { "epoch": 6.576402321083172, "grad_norm": 2.3041577339172363, "learning_rate": 2.119900815869461e-05, "loss": 0.6091, "step": 285600 }, { "epoch": 6.581007644837432, "grad_norm": 3.41249680519104, "learning_rate": 2.1191736594872097e-05, "loss": 0.5871, "step": 285800 }, { "epoch": 6.5856129685916915, "grad_norm": 2.979804277420044, "learning_rate": 2.1184465031049576e-05, "loss": 0.593, "step": 286000 }, { "epoch": 6.590218292345952, "grad_norm": 3.0623714923858643, "learning_rate": 2.1177193467227062e-05, "loss": 0.6024, "step": 286200 }, { "epoch": 6.594823616100212, "grad_norm": 3.924044132232666, "learning_rate": 2.1169921903404548e-05, "loss": 0.6108, "step": 286400 }, { "epoch": 6.599428939854472, "grad_norm": 2.970489501953125, "learning_rate": 2.116265033958203e-05, "loss": 0.5991, "step": 286600 }, { "epoch": 6.604034263608732, "grad_norm": 2.641425371170044, "learning_rate": 2.1155378775759516e-05, "loss": 0.6112, "step": 286800 }, { "epoch": 6.608639587362991, "grad_norm": 2.8824870586395264, "learning_rate": 2.1148107211937002e-05, "loss": 0.5837, "step": 287000 }, { "epoch": 6.613244911117252, "grad_norm": 2.681509256362915, "learning_rate": 2.114083564811448e-05, "loss": 0.6031, "step": 287200 }, { "epoch": 6.617850234871511, "grad_norm": 3.070089101791382, "learning_rate": 2.1133564084291967e-05, "loss": 0.5896, "step": 287400 }, { "epoch": 6.622455558625772, "grad_norm": 3.1326851844787598, "learning_rate": 2.1126328878288566e-05, "loss": 0.5909, "step": 287600 }, { "epoch": 6.627060882380031, "grad_norm": 3.5522382259368896, "learning_rate": 2.111905731446605e-05, "loss": 0.601, "step": 287800 }, { "epoch": 6.631666206134291, "grad_norm": 3.211503028869629, "learning_rate": 2.1111785750643534e-05, "loss": 0.6018, "step": 288000 }, { "epoch": 6.6362715298885515, "grad_norm": 2.8858470916748047, "learning_rate": 2.110451418682102e-05, "loss": 0.6202, "step": 288200 }, { "epoch": 6.640876853642811, "grad_norm": 3.317497730255127, "learning_rate": 2.1097242622998503e-05, "loss": 0.5961, "step": 288400 }, { "epoch": 6.645482177397071, "grad_norm": 2.4950830936431885, "learning_rate": 2.1089971059175985e-05, "loss": 0.6002, "step": 288600 }, { "epoch": 6.650087501151331, "grad_norm": 3.110621929168701, "learning_rate": 2.108269949535347e-05, "loss": 0.6046, "step": 288800 }, { "epoch": 6.654692824905591, "grad_norm": 3.673153877258301, "learning_rate": 2.1075427931530954e-05, "loss": 0.5994, "step": 289000 }, { "epoch": 6.65929814865985, "grad_norm": 3.486978530883789, "learning_rate": 2.106815636770844e-05, "loss": 0.5981, "step": 289200 }, { "epoch": 6.663903472414111, "grad_norm": 3.4665493965148926, "learning_rate": 2.1060884803885926e-05, "loss": 0.6127, "step": 289400 }, { "epoch": 6.6685087961683704, "grad_norm": 3.6639597415924072, "learning_rate": 2.1053613240063408e-05, "loss": 0.6067, "step": 289600 }, { "epoch": 6.673114119922631, "grad_norm": 3.012031316757202, "learning_rate": 2.1046341676240894e-05, "loss": 0.596, "step": 289800 }, { "epoch": 6.6777194436768905, "grad_norm": 2.717411518096924, "learning_rate": 2.1039070112418377e-05, "loss": 0.6002, "step": 290000 }, { "epoch": 6.68232476743115, "grad_norm": 2.571988105773926, "learning_rate": 2.1031798548595862e-05, "loss": 0.5986, "step": 290200 }, { "epoch": 6.686930091185411, "grad_norm": 2.4127187728881836, "learning_rate": 2.1024526984773345e-05, "loss": 0.594, "step": 290400 }, { "epoch": 6.69153541493967, "grad_norm": 3.4181158542633057, "learning_rate": 2.1017291778769944e-05, "loss": 0.6023, "step": 290600 }, { "epoch": 6.696140738693931, "grad_norm": 3.2288289070129395, "learning_rate": 2.101005657276654e-05, "loss": 0.6004, "step": 290800 }, { "epoch": 6.70074606244819, "grad_norm": 3.2876336574554443, "learning_rate": 2.1002785008944025e-05, "loss": 0.6054, "step": 291000 }, { "epoch": 6.70535138620245, "grad_norm": 2.789076328277588, "learning_rate": 2.099551344512151e-05, "loss": 0.6041, "step": 291200 }, { "epoch": 6.70995670995671, "grad_norm": 2.86173939704895, "learning_rate": 2.0988241881298993e-05, "loss": 0.5944, "step": 291400 }, { "epoch": 6.71456203371097, "grad_norm": 2.8522419929504395, "learning_rate": 2.0980970317476476e-05, "loss": 0.5996, "step": 291600 }, { "epoch": 6.7191673574652295, "grad_norm": 3.7493839263916016, "learning_rate": 2.097369875365396e-05, "loss": 0.5959, "step": 291800 }, { "epoch": 6.72377268121949, "grad_norm": 3.5623674392700195, "learning_rate": 2.0966427189831444e-05, "loss": 0.5982, "step": 292000 }, { "epoch": 6.72837800497375, "grad_norm": 3.4356765747070312, "learning_rate": 2.095915562600893e-05, "loss": 0.5966, "step": 292200 }, { "epoch": 6.732983328728009, "grad_norm": 3.1786386966705322, "learning_rate": 2.0951884062186416e-05, "loss": 0.6007, "step": 292400 }, { "epoch": 6.73758865248227, "grad_norm": 2.424635410308838, "learning_rate": 2.09446124983639e-05, "loss": 0.6155, "step": 292600 }, { "epoch": 6.742193976236529, "grad_norm": 3.4390957355499268, "learning_rate": 2.0937340934541384e-05, "loss": 0.5994, "step": 292800 }, { "epoch": 6.746799299990789, "grad_norm": 4.376504421234131, "learning_rate": 2.0930069370718867e-05, "loss": 0.5985, "step": 293000 }, { "epoch": 6.751404623745049, "grad_norm": 2.7441396713256836, "learning_rate": 2.092279780689635e-05, "loss": 0.6037, "step": 293200 }, { "epoch": 6.756009947499309, "grad_norm": 2.771723985671997, "learning_rate": 2.0915562600892948e-05, "loss": 0.5969, "step": 293400 }, { "epoch": 6.760615271253569, "grad_norm": 3.066767692565918, "learning_rate": 2.0908291037070434e-05, "loss": 0.5907, "step": 293600 }, { "epoch": 6.765220595007829, "grad_norm": 3.7000675201416016, "learning_rate": 2.0901019473247916e-05, "loss": 0.5968, "step": 293800 }, { "epoch": 6.769825918762089, "grad_norm": 2.6257290840148926, "learning_rate": 2.0893747909425402e-05, "loss": 0.6015, "step": 294000 }, { "epoch": 6.774431242516349, "grad_norm": 2.972223997116089, "learning_rate": 2.0886476345602888e-05, "loss": 0.6104, "step": 294200 }, { "epoch": 6.779036566270609, "grad_norm": 2.6368885040283203, "learning_rate": 2.0879204781780367e-05, "loss": 0.6009, "step": 294400 }, { "epoch": 6.783641890024869, "grad_norm": 2.8268954753875732, "learning_rate": 2.0871933217957853e-05, "loss": 0.5894, "step": 294600 }, { "epoch": 6.788247213779129, "grad_norm": 2.7802374362945557, "learning_rate": 2.086466165413534e-05, "loss": 0.6079, "step": 294800 }, { "epoch": 6.792852537533388, "grad_norm": 3.244723320007324, "learning_rate": 2.0857390090312822e-05, "loss": 0.5933, "step": 295000 }, { "epoch": 6.797457861287649, "grad_norm": 2.8554670810699463, "learning_rate": 2.0850118526490308e-05, "loss": 0.5893, "step": 295200 }, { "epoch": 6.802063185041908, "grad_norm": 3.411585807800293, "learning_rate": 2.0842846962667794e-05, "loss": 0.5874, "step": 295400 }, { "epoch": 6.806668508796168, "grad_norm": 3.076263904571533, "learning_rate": 2.0835575398845276e-05, "loss": 0.5866, "step": 295600 }, { "epoch": 6.8112738325504285, "grad_norm": 3.0649490356445312, "learning_rate": 2.082830383502276e-05, "loss": 0.6006, "step": 295800 }, { "epoch": 6.815879156304688, "grad_norm": 3.0832936763763428, "learning_rate": 2.0821032271200245e-05, "loss": 0.5896, "step": 296000 }, { "epoch": 6.820484480058948, "grad_norm": 4.413976192474365, "learning_rate": 2.081376070737773e-05, "loss": 0.5995, "step": 296200 }, { "epoch": 6.825089803813208, "grad_norm": 2.9547083377838135, "learning_rate": 2.0806489143555213e-05, "loss": 0.588, "step": 296400 }, { "epoch": 6.829695127567468, "grad_norm": 3.1978492736816406, "learning_rate": 2.07992175797327e-05, "loss": 0.605, "step": 296600 }, { "epoch": 6.834300451321728, "grad_norm": 2.5067381858825684, "learning_rate": 2.0791946015910185e-05, "loss": 0.5945, "step": 296800 }, { "epoch": 6.838905775075988, "grad_norm": 2.9180538654327393, "learning_rate": 2.0784674452087664e-05, "loss": 0.6033, "step": 297000 }, { "epoch": 6.843511098830247, "grad_norm": 2.9533698558807373, "learning_rate": 2.077740288826515e-05, "loss": 0.5826, "step": 297200 }, { "epoch": 6.848116422584508, "grad_norm": 3.087790012359619, "learning_rate": 2.0770131324442636e-05, "loss": 0.6035, "step": 297400 }, { "epoch": 6.8527217463387675, "grad_norm": 3.130922317504883, "learning_rate": 2.076285976062012e-05, "loss": 0.599, "step": 297600 }, { "epoch": 6.857327070093028, "grad_norm": 3.0297648906707764, "learning_rate": 2.0755588196797604e-05, "loss": 0.6053, "step": 297800 }, { "epoch": 6.861932393847288, "grad_norm": 2.8300116062164307, "learning_rate": 2.0748352990794203e-05, "loss": 0.6033, "step": 298000 }, { "epoch": 6.866537717601547, "grad_norm": 3.1619298458099365, "learning_rate": 2.0741081426971685e-05, "loss": 0.5995, "step": 298200 }, { "epoch": 6.871143041355808, "grad_norm": 3.3405187129974365, "learning_rate": 2.0733809863149168e-05, "loss": 0.5918, "step": 298400 }, { "epoch": 6.875748365110067, "grad_norm": 2.8210740089416504, "learning_rate": 2.0726538299326654e-05, "loss": 0.599, "step": 298600 }, { "epoch": 6.880353688864327, "grad_norm": 3.2792325019836426, "learning_rate": 2.0719266735504136e-05, "loss": 0.5899, "step": 298800 }, { "epoch": 6.884959012618587, "grad_norm": 2.485522508621216, "learning_rate": 2.0711995171681622e-05, "loss": 0.5864, "step": 299000 }, { "epoch": 6.889564336372847, "grad_norm": 2.585327625274658, "learning_rate": 2.0704723607859108e-05, "loss": 0.5904, "step": 299200 }, { "epoch": 6.8941696601271065, "grad_norm": 2.3402791023254395, "learning_rate": 2.069745204403659e-05, "loss": 0.5845, "step": 299400 }, { "epoch": 6.898774983881367, "grad_norm": 2.9729385375976562, "learning_rate": 2.0690180480214073e-05, "loss": 0.5963, "step": 299600 }, { "epoch": 6.903380307635627, "grad_norm": 3.0972092151641846, "learning_rate": 2.068290891639156e-05, "loss": 0.6085, "step": 299800 }, { "epoch": 6.907985631389887, "grad_norm": 3.688617467880249, "learning_rate": 2.0675637352569045e-05, "loss": 0.6013, "step": 300000 }, { "epoch": 6.912590955144147, "grad_norm": 2.644268751144409, "learning_rate": 2.0668365788746528e-05, "loss": 0.5893, "step": 300200 }, { "epoch": 6.917196278898406, "grad_norm": 3.49151873588562, "learning_rate": 2.0661094224924013e-05, "loss": 0.5896, "step": 300400 }, { "epoch": 6.921801602652667, "grad_norm": 3.661813974380493, "learning_rate": 2.06538226611015e-05, "loss": 0.5972, "step": 300600 }, { "epoch": 6.926406926406926, "grad_norm": 3.322665214538574, "learning_rate": 2.0646551097278982e-05, "loss": 0.5989, "step": 300800 }, { "epoch": 6.931012250161187, "grad_norm": 3.0434811115264893, "learning_rate": 2.0639279533456464e-05, "loss": 0.5886, "step": 301000 }, { "epoch": 6.935617573915446, "grad_norm": 3.6743383407592773, "learning_rate": 2.063204432745306e-05, "loss": 0.6096, "step": 301200 }, { "epoch": 6.940222897669706, "grad_norm": 3.656269073486328, "learning_rate": 2.0624772763630546e-05, "loss": 0.5816, "step": 301400 }, { "epoch": 6.9448282214239665, "grad_norm": 2.7301547527313232, "learning_rate": 2.061750119980803e-05, "loss": 0.5909, "step": 301600 }, { "epoch": 6.949433545178226, "grad_norm": 2.843813896179199, "learning_rate": 2.0610229635985517e-05, "loss": 0.5987, "step": 301800 }, { "epoch": 6.954038868932486, "grad_norm": 3.3553197383880615, "learning_rate": 2.0602958072163e-05, "loss": 0.5974, "step": 302000 }, { "epoch": 6.958644192686746, "grad_norm": 2.880885601043701, "learning_rate": 2.0595686508340486e-05, "loss": 0.6037, "step": 302200 }, { "epoch": 6.963249516441006, "grad_norm": 4.332859039306641, "learning_rate": 2.058841494451797e-05, "loss": 0.5863, "step": 302400 }, { "epoch": 6.967854840195265, "grad_norm": 2.701078176498413, "learning_rate": 2.058114338069545e-05, "loss": 0.6154, "step": 302600 }, { "epoch": 6.972460163949526, "grad_norm": 3.1130638122558594, "learning_rate": 2.0573871816872937e-05, "loss": 0.5895, "step": 302800 }, { "epoch": 6.977065487703785, "grad_norm": 3.547954559326172, "learning_rate": 2.0566600253050423e-05, "loss": 0.6219, "step": 303000 }, { "epoch": 6.981670811458045, "grad_norm": 2.777357816696167, "learning_rate": 2.0559328689227905e-05, "loss": 0.5969, "step": 303200 }, { "epoch": 6.9862761352123055, "grad_norm": 3.05338978767395, "learning_rate": 2.055205712540539e-05, "loss": 0.5945, "step": 303400 }, { "epoch": 6.990881458966565, "grad_norm": 2.9107885360717773, "learning_rate": 2.0544785561582877e-05, "loss": 0.5978, "step": 303600 }, { "epoch": 6.995486782720826, "grad_norm": 3.5367624759674072, "learning_rate": 2.0537550355579472e-05, "loss": 0.6023, "step": 303800 }, { "epoch": 7.0, "eval_loss": 0.567062258720398, "eval_runtime": 145.2794, "eval_samples_per_second": 195.217, "eval_steps_per_second": 12.204, "step": 303996 }, { "epoch": 7.000092106475085, "grad_norm": 4.047206878662109, "learning_rate": 2.0530278791756955e-05, "loss": 0.5904, "step": 304000 }, { "epoch": 7.004697430229345, "grad_norm": 2.5840604305267334, "learning_rate": 2.052300722793444e-05, "loss": 0.5862, "step": 304200 }, { "epoch": 7.009302753983605, "grad_norm": 3.08518385887146, "learning_rate": 2.0515735664111923e-05, "loss": 0.5918, "step": 304400 }, { "epoch": 7.013908077737865, "grad_norm": 2.9152956008911133, "learning_rate": 2.050846410028941e-05, "loss": 0.5858, "step": 304600 }, { "epoch": 7.018513401492125, "grad_norm": 3.358140468597412, "learning_rate": 2.0501192536466895e-05, "loss": 0.5958, "step": 304800 }, { "epoch": 7.023118725246385, "grad_norm": 2.961735248565674, "learning_rate": 2.0493920972644378e-05, "loss": 0.5929, "step": 305000 }, { "epoch": 7.0277240490006445, "grad_norm": 2.8176980018615723, "learning_rate": 2.048664940882186e-05, "loss": 0.5862, "step": 305200 }, { "epoch": 7.032329372754905, "grad_norm": 3.275162696838379, "learning_rate": 2.0479377844999346e-05, "loss": 0.5934, "step": 305400 }, { "epoch": 7.036934696509165, "grad_norm": 3.5453879833221436, "learning_rate": 2.047210628117683e-05, "loss": 0.588, "step": 305600 }, { "epoch": 7.041540020263424, "grad_norm": 2.8470346927642822, "learning_rate": 2.0464834717354314e-05, "loss": 0.597, "step": 305800 }, { "epoch": 7.046145344017685, "grad_norm": 2.6055123805999756, "learning_rate": 2.0457599511350913e-05, "loss": 0.591, "step": 306000 }, { "epoch": 7.050750667771944, "grad_norm": 2.6110992431640625, "learning_rate": 2.0450327947528396e-05, "loss": 0.5906, "step": 306200 }, { "epoch": 7.055355991526204, "grad_norm": 3.3166911602020264, "learning_rate": 2.044305638370588e-05, "loss": 0.582, "step": 306400 }, { "epoch": 7.059961315280464, "grad_norm": 2.7513551712036133, "learning_rate": 2.0435784819883364e-05, "loss": 0.5985, "step": 306600 }, { "epoch": 7.064566639034724, "grad_norm": 3.547184944152832, "learning_rate": 2.0428513256060847e-05, "loss": 0.6041, "step": 306800 }, { "epoch": 7.069171962788984, "grad_norm": 3.074101686477661, "learning_rate": 2.0421241692238332e-05, "loss": 0.5915, "step": 307000 }, { "epoch": 7.073777286543244, "grad_norm": 3.347473621368408, "learning_rate": 2.041397012841582e-05, "loss": 0.5927, "step": 307200 }, { "epoch": 7.078382610297504, "grad_norm": 3.3471734523773193, "learning_rate": 2.04066985645933e-05, "loss": 0.5954, "step": 307400 }, { "epoch": 7.082987934051764, "grad_norm": 2.5890464782714844, "learning_rate": 2.0399427000770787e-05, "loss": 0.6003, "step": 307600 }, { "epoch": 7.087593257806024, "grad_norm": 2.5832886695861816, "learning_rate": 2.0392155436948273e-05, "loss": 0.5964, "step": 307800 }, { "epoch": 7.092198581560283, "grad_norm": 3.1692888736724854, "learning_rate": 2.0384883873125755e-05, "loss": 0.5985, "step": 308000 }, { "epoch": 7.096803905314544, "grad_norm": 2.8189423084259033, "learning_rate": 2.0377612309303238e-05, "loss": 0.5802, "step": 308200 }, { "epoch": 7.101409229068803, "grad_norm": 2.9584109783172607, "learning_rate": 2.0370340745480724e-05, "loss": 0.5949, "step": 308400 }, { "epoch": 7.106014552823064, "grad_norm": 2.7427165508270264, "learning_rate": 2.036306918165821e-05, "loss": 0.59, "step": 308600 }, { "epoch": 7.110619876577323, "grad_norm": 3.3998868465423584, "learning_rate": 2.0355797617835692e-05, "loss": 0.5907, "step": 308800 }, { "epoch": 7.115225200331583, "grad_norm": 3.8071186542510986, "learning_rate": 2.0348526054013178e-05, "loss": 0.5865, "step": 309000 }, { "epoch": 7.1198305240858435, "grad_norm": 2.8909687995910645, "learning_rate": 2.0341290848009773e-05, "loss": 0.5883, "step": 309200 }, { "epoch": 7.124435847840103, "grad_norm": 2.901247262954712, "learning_rate": 2.0334019284187256e-05, "loss": 0.6004, "step": 309400 }, { "epoch": 7.129041171594363, "grad_norm": 3.5564959049224854, "learning_rate": 2.0326747720364742e-05, "loss": 0.5866, "step": 309600 }, { "epoch": 7.133646495348623, "grad_norm": 3.2295401096343994, "learning_rate": 2.0319476156542228e-05, "loss": 0.6012, "step": 309800 }, { "epoch": 7.138251819102883, "grad_norm": 2.9399068355560303, "learning_rate": 2.031220459271971e-05, "loss": 0.6112, "step": 310000 }, { "epoch": 7.142857142857143, "grad_norm": 3.2517316341400146, "learning_rate": 2.0304933028897196e-05, "loss": 0.5686, "step": 310200 }, { "epoch": 7.147462466611403, "grad_norm": 2.8699967861175537, "learning_rate": 2.0297661465074682e-05, "loss": 0.5826, "step": 310400 }, { "epoch": 7.152067790365662, "grad_norm": 3.1824159622192383, "learning_rate": 2.0290389901252165e-05, "loss": 0.5878, "step": 310600 }, { "epoch": 7.156673114119923, "grad_norm": 2.710801839828491, "learning_rate": 2.0283118337429647e-05, "loss": 0.5989, "step": 310800 }, { "epoch": 7.1612784378741825, "grad_norm": 3.2563531398773193, "learning_rate": 2.0275846773607133e-05, "loss": 0.5956, "step": 311000 }, { "epoch": 7.165883761628442, "grad_norm": 3.0747923851013184, "learning_rate": 2.0268575209784615e-05, "loss": 0.5924, "step": 311200 }, { "epoch": 7.170489085382703, "grad_norm": 3.374802589416504, "learning_rate": 2.02613036459621e-05, "loss": 0.594, "step": 311400 }, { "epoch": 7.175094409136962, "grad_norm": 3.170912504196167, "learning_rate": 2.0254032082139587e-05, "loss": 0.5821, "step": 311600 }, { "epoch": 7.179699732891223, "grad_norm": 2.800078868865967, "learning_rate": 2.024676051831707e-05, "loss": 0.5837, "step": 311800 }, { "epoch": 7.184305056645482, "grad_norm": 4.168891906738281, "learning_rate": 2.0239488954494552e-05, "loss": 0.5977, "step": 312000 }, { "epoch": 7.188910380399742, "grad_norm": 4.314828872680664, "learning_rate": 2.023225374849115e-05, "loss": 0.5871, "step": 312200 }, { "epoch": 7.193515704154002, "grad_norm": 2.661975383758545, "learning_rate": 2.0224982184668633e-05, "loss": 0.6009, "step": 312400 }, { "epoch": 7.198121027908262, "grad_norm": 2.217339277267456, "learning_rate": 2.021771062084612e-05, "loss": 0.5916, "step": 312600 }, { "epoch": 7.2027263516625215, "grad_norm": 3.5877535343170166, "learning_rate": 2.0210439057023605e-05, "loss": 0.5889, "step": 312800 }, { "epoch": 7.207331675416782, "grad_norm": 3.6511406898498535, "learning_rate": 2.0203167493201088e-05, "loss": 0.5906, "step": 313000 }, { "epoch": 7.211936999171042, "grad_norm": 3.414182186126709, "learning_rate": 2.0195895929378574e-05, "loss": 0.5968, "step": 313200 }, { "epoch": 7.216542322925302, "grad_norm": 3.0940256118774414, "learning_rate": 2.0188624365556056e-05, "loss": 0.5972, "step": 313400 }, { "epoch": 7.221147646679562, "grad_norm": 2.9856083393096924, "learning_rate": 2.018135280173354e-05, "loss": 0.5918, "step": 313600 }, { "epoch": 7.225752970433821, "grad_norm": 3.411344528198242, "learning_rate": 2.0174081237911025e-05, "loss": 0.5816, "step": 313800 }, { "epoch": 7.230358294188082, "grad_norm": 3.6355364322662354, "learning_rate": 2.016680967408851e-05, "loss": 0.6008, "step": 314000 }, { "epoch": 7.234963617942341, "grad_norm": 3.124440908432007, "learning_rate": 2.0159538110265993e-05, "loss": 0.5892, "step": 314200 }, { "epoch": 7.239568941696601, "grad_norm": 3.7005481719970703, "learning_rate": 2.015226654644348e-05, "loss": 0.5834, "step": 314400 }, { "epoch": 7.244174265450861, "grad_norm": 2.2314252853393555, "learning_rate": 2.0144994982620965e-05, "loss": 0.5888, "step": 314600 }, { "epoch": 7.248779589205121, "grad_norm": 3.4673757553100586, "learning_rate": 2.0137723418798448e-05, "loss": 0.6033, "step": 314800 }, { "epoch": 7.2533849129593815, "grad_norm": 3.7593634128570557, "learning_rate": 2.013045185497593e-05, "loss": 0.6022, "step": 315000 }, { "epoch": 7.257990236713641, "grad_norm": 3.1982200145721436, "learning_rate": 2.0123180291153416e-05, "loss": 0.5823, "step": 315200 }, { "epoch": 7.262595560467901, "grad_norm": 2.9602959156036377, "learning_rate": 2.0115908727330902e-05, "loss": 0.5856, "step": 315400 }, { "epoch": 7.267200884222161, "grad_norm": 3.3312976360321045, "learning_rate": 2.0108637163508384e-05, "loss": 0.5826, "step": 315600 }, { "epoch": 7.271806207976421, "grad_norm": 3.437633514404297, "learning_rate": 2.010136559968587e-05, "loss": 0.5889, "step": 315800 }, { "epoch": 7.27641153173068, "grad_norm": 4.305769443511963, "learning_rate": 2.0094094035863353e-05, "loss": 0.5868, "step": 316000 }, { "epoch": 7.281016855484941, "grad_norm": 3.3740241527557373, "learning_rate": 2.0086822472040835e-05, "loss": 0.5826, "step": 316200 }, { "epoch": 7.2856221792392, "grad_norm": 2.570251226425171, "learning_rate": 2.0079623623856547e-05, "loss": 0.5914, "step": 316400 }, { "epoch": 7.29022750299346, "grad_norm": 4.058407783508301, "learning_rate": 2.007235206003403e-05, "loss": 0.6015, "step": 316600 }, { "epoch": 7.2948328267477205, "grad_norm": 3.7470202445983887, "learning_rate": 2.0065080496211515e-05, "loss": 0.5866, "step": 316800 }, { "epoch": 7.29943815050198, "grad_norm": 3.243685007095337, "learning_rate": 2.0057808932389e-05, "loss": 0.5829, "step": 317000 }, { "epoch": 7.304043474256241, "grad_norm": 3.5165421962738037, "learning_rate": 2.0050537368566484e-05, "loss": 0.5883, "step": 317200 }, { "epoch": 7.3086487980105, "grad_norm": 2.5565083026885986, "learning_rate": 2.004326580474397e-05, "loss": 0.5944, "step": 317400 }, { "epoch": 7.31325412176476, "grad_norm": 2.588308095932007, "learning_rate": 2.0035994240921455e-05, "loss": 0.5903, "step": 317600 }, { "epoch": 7.31785944551902, "grad_norm": 3.13081431388855, "learning_rate": 2.0028722677098938e-05, "loss": 0.5962, "step": 317800 }, { "epoch": 7.32246476927328, "grad_norm": 3.2789556980133057, "learning_rate": 2.002145111327642e-05, "loss": 0.5912, "step": 318000 }, { "epoch": 7.327070093027539, "grad_norm": 2.885380744934082, "learning_rate": 2.0014179549453906e-05, "loss": 0.5992, "step": 318200 }, { "epoch": 7.3316754167818, "grad_norm": 2.445709705352783, "learning_rate": 2.0006907985631392e-05, "loss": 0.5915, "step": 318400 }, { "epoch": 7.3362807405360595, "grad_norm": 4.696218967437744, "learning_rate": 1.9999636421808875e-05, "loss": 0.602, "step": 318600 }, { "epoch": 7.34088606429032, "grad_norm": 2.6832456588745117, "learning_rate": 1.999236485798636e-05, "loss": 0.5936, "step": 318800 }, { "epoch": 7.34549138804458, "grad_norm": 2.7151763439178467, "learning_rate": 1.9985093294163843e-05, "loss": 0.5932, "step": 319000 }, { "epoch": 7.350096711798839, "grad_norm": 2.803915500640869, "learning_rate": 1.9977821730341326e-05, "loss": 0.6016, "step": 319200 }, { "epoch": 7.3547020355531, "grad_norm": 2.600085973739624, "learning_rate": 1.997055016651881e-05, "loss": 0.5952, "step": 319400 }, { "epoch": 7.359307359307359, "grad_norm": 3.053616523742676, "learning_rate": 1.9963278602696298e-05, "loss": 0.5924, "step": 319600 }, { "epoch": 7.363912683061619, "grad_norm": 2.9440255165100098, "learning_rate": 1.995600703887378e-05, "loss": 0.5995, "step": 319800 }, { "epoch": 7.368518006815879, "grad_norm": 3.2415082454681396, "learning_rate": 1.9948735475051266e-05, "loss": 0.5919, "step": 320000 }, { "epoch": 7.373123330570139, "grad_norm": 2.619912624359131, "learning_rate": 1.994146391122875e-05, "loss": 0.5929, "step": 320200 }, { "epoch": 7.377728654324399, "grad_norm": 3.2725462913513184, "learning_rate": 1.993419234740623e-05, "loss": 0.5849, "step": 320400 }, { "epoch": 7.382333978078659, "grad_norm": 4.5242743492126465, "learning_rate": 1.9926920783583717e-05, "loss": 0.5891, "step": 320600 }, { "epoch": 7.386939301832919, "grad_norm": 3.2568304538726807, "learning_rate": 1.9919649219761203e-05, "loss": 0.5892, "step": 320800 }, { "epoch": 7.391544625587179, "grad_norm": 3.1149260997772217, "learning_rate": 1.9912414013757798e-05, "loss": 0.5955, "step": 321000 }, { "epoch": 7.396149949341439, "grad_norm": 3.272343158721924, "learning_rate": 1.9905142449935284e-05, "loss": 0.5938, "step": 321200 }, { "epoch": 7.400755273095698, "grad_norm": 2.8861818313598633, "learning_rate": 1.989787088611277e-05, "loss": 0.5973, "step": 321400 }, { "epoch": 7.405360596849959, "grad_norm": 4.322711944580078, "learning_rate": 1.9890599322290252e-05, "loss": 0.5979, "step": 321600 }, { "epoch": 7.409965920604218, "grad_norm": 3.2034659385681152, "learning_rate": 1.9883327758467735e-05, "loss": 0.5897, "step": 321800 }, { "epoch": 7.414571244358479, "grad_norm": 2.6252517700195312, "learning_rate": 1.987605619464522e-05, "loss": 0.593, "step": 322000 }, { "epoch": 7.419176568112738, "grad_norm": 2.79892897605896, "learning_rate": 1.9868784630822703e-05, "loss": 0.5988, "step": 322200 }, { "epoch": 7.423781891866998, "grad_norm": 4.176153659820557, "learning_rate": 1.986151306700019e-05, "loss": 0.5828, "step": 322400 }, { "epoch": 7.4283872156212585, "grad_norm": 4.084776878356934, "learning_rate": 1.9854241503177675e-05, "loss": 0.5949, "step": 322600 }, { "epoch": 7.432992539375518, "grad_norm": 3.2946360111236572, "learning_rate": 1.984696993935516e-05, "loss": 0.5911, "step": 322800 }, { "epoch": 7.437597863129778, "grad_norm": 2.784147262573242, "learning_rate": 1.983969837553264e-05, "loss": 0.5864, "step": 323000 }, { "epoch": 7.442203186884038, "grad_norm": 3.3848652839660645, "learning_rate": 1.9832426811710126e-05, "loss": 0.579, "step": 323200 }, { "epoch": 7.446808510638298, "grad_norm": 3.4433703422546387, "learning_rate": 1.9825155247887612e-05, "loss": 0.5887, "step": 323400 }, { "epoch": 7.451413834392557, "grad_norm": 3.3405020236968994, "learning_rate": 1.9817920041884207e-05, "loss": 0.5876, "step": 323600 }, { "epoch": 7.456019158146818, "grad_norm": 3.7312204837799072, "learning_rate": 1.9810648478061693e-05, "loss": 0.5955, "step": 323800 }, { "epoch": 7.460624481901077, "grad_norm": 4.358093738555908, "learning_rate": 1.9803376914239176e-05, "loss": 0.5866, "step": 324000 }, { "epoch": 7.465229805655338, "grad_norm": 2.68795108795166, "learning_rate": 1.979610535041666e-05, "loss": 0.6004, "step": 324200 }, { "epoch": 7.4698351294095975, "grad_norm": 2.8480021953582764, "learning_rate": 1.978887014441326e-05, "loss": 0.5983, "step": 324400 }, { "epoch": 7.474440453163857, "grad_norm": 4.532684803009033, "learning_rate": 1.9781598580590743e-05, "loss": 0.5917, "step": 324600 }, { "epoch": 7.479045776918118, "grad_norm": 3.242600202560425, "learning_rate": 1.9774327016768225e-05, "loss": 0.5981, "step": 324800 }, { "epoch": 7.483651100672377, "grad_norm": 3.071572780609131, "learning_rate": 1.976705545294571e-05, "loss": 0.5892, "step": 325000 }, { "epoch": 7.488256424426638, "grad_norm": 3.3507227897644043, "learning_rate": 1.9759783889123194e-05, "loss": 0.593, "step": 325200 }, { "epoch": 7.492861748180897, "grad_norm": 3.0197222232818604, "learning_rate": 1.975251232530068e-05, "loss": 0.598, "step": 325400 }, { "epoch": 7.497467071935157, "grad_norm": 2.7512412071228027, "learning_rate": 1.9745240761478166e-05, "loss": 0.5942, "step": 325600 }, { "epoch": 7.502072395689417, "grad_norm": 3.136456251144409, "learning_rate": 1.9737969197655648e-05, "loss": 0.5828, "step": 325800 }, { "epoch": 7.506677719443677, "grad_norm": 4.431463241577148, "learning_rate": 1.973069763383313e-05, "loss": 0.6002, "step": 326000 }, { "epoch": 7.5112830431979365, "grad_norm": 2.8027350902557373, "learning_rate": 1.9723426070010617e-05, "loss": 0.5935, "step": 326200 }, { "epoch": 7.515888366952197, "grad_norm": 4.174149990081787, "learning_rate": 1.9716154506188102e-05, "loss": 0.5837, "step": 326400 }, { "epoch": 7.520493690706457, "grad_norm": 4.200664043426514, "learning_rate": 1.9708919300184698e-05, "loss": 0.6054, "step": 326600 }, { "epoch": 7.525099014460716, "grad_norm": 2.414337158203125, "learning_rate": 1.9701647736362184e-05, "loss": 0.5915, "step": 326800 }, { "epoch": 7.529704338214977, "grad_norm": 2.9279587268829346, "learning_rate": 1.9694376172539666e-05, "loss": 0.5921, "step": 327000 }, { "epoch": 7.534309661969236, "grad_norm": 3.7588605880737305, "learning_rate": 1.9687104608717152e-05, "loss": 0.5839, "step": 327200 }, { "epoch": 7.538914985723497, "grad_norm": 3.5354812145233154, "learning_rate": 1.9679833044894635e-05, "loss": 0.5954, "step": 327400 }, { "epoch": 7.543520309477756, "grad_norm": 3.489088773727417, "learning_rate": 1.967256148107212e-05, "loss": 0.6005, "step": 327600 }, { "epoch": 7.548125633232016, "grad_norm": 3.0802366733551025, "learning_rate": 1.9665289917249603e-05, "loss": 0.5928, "step": 327800 }, { "epoch": 7.552730956986276, "grad_norm": 3.6488418579101562, "learning_rate": 1.965801835342709e-05, "loss": 0.5977, "step": 328000 }, { "epoch": 7.557336280740536, "grad_norm": 3.7551910877227783, "learning_rate": 1.9650746789604575e-05, "loss": 0.5832, "step": 328200 }, { "epoch": 7.5619416044947965, "grad_norm": 2.494746685028076, "learning_rate": 1.9643475225782057e-05, "loss": 0.5867, "step": 328400 }, { "epoch": 7.566546928249056, "grad_norm": 3.4806318283081055, "learning_rate": 1.9636240019778656e-05, "loss": 0.5818, "step": 328600 }, { "epoch": 7.571152252003316, "grad_norm": 3.278502941131592, "learning_rate": 1.962896845595614e-05, "loss": 0.5864, "step": 328800 }, { "epoch": 7.575757575757576, "grad_norm": 3.046190023422241, "learning_rate": 1.962169689213362e-05, "loss": 0.5904, "step": 329000 }, { "epoch": 7.580362899511836, "grad_norm": 2.7647550106048584, "learning_rate": 1.9614425328311107e-05, "loss": 0.6045, "step": 329200 }, { "epoch": 7.584968223266095, "grad_norm": 3.201040506362915, "learning_rate": 1.9607153764488593e-05, "loss": 0.5932, "step": 329400 }, { "epoch": 7.589573547020356, "grad_norm": 4.180473327636719, "learning_rate": 1.9599882200666075e-05, "loss": 0.5883, "step": 329600 }, { "epoch": 7.594178870774615, "grad_norm": 2.905902862548828, "learning_rate": 1.959261063684356e-05, "loss": 0.593, "step": 329800 }, { "epoch": 7.598784194528875, "grad_norm": 3.984675884246826, "learning_rate": 1.9585339073021047e-05, "loss": 0.6001, "step": 330000 }, { "epoch": 7.6033895182831355, "grad_norm": 3.31915545463562, "learning_rate": 1.9578067509198526e-05, "loss": 0.5868, "step": 330200 }, { "epoch": 7.607994842037395, "grad_norm": 2.907419204711914, "learning_rate": 1.9570795945376012e-05, "loss": 0.5885, "step": 330400 }, { "epoch": 7.612600165791655, "grad_norm": 3.3257858753204346, "learning_rate": 1.9563524381553498e-05, "loss": 0.5821, "step": 330600 }, { "epoch": 7.617205489545915, "grad_norm": 3.4570491313934326, "learning_rate": 1.955625281773098e-05, "loss": 0.5999, "step": 330800 }, { "epoch": 7.621810813300175, "grad_norm": 3.399859666824341, "learning_rate": 1.9548981253908467e-05, "loss": 0.5977, "step": 331000 }, { "epoch": 7.626416137054435, "grad_norm": 3.104308605194092, "learning_rate": 1.9541709690085952e-05, "loss": 0.5842, "step": 331200 }, { "epoch": 7.631021460808695, "grad_norm": 3.126985788345337, "learning_rate": 1.953443812626343e-05, "loss": 0.6003, "step": 331400 }, { "epoch": 7.635626784562954, "grad_norm": 2.7562856674194336, "learning_rate": 1.9527166562440918e-05, "loss": 0.5831, "step": 331600 }, { "epoch": 7.640232108317215, "grad_norm": 2.559391975402832, "learning_rate": 1.9519894998618403e-05, "loss": 0.5963, "step": 331800 }, { "epoch": 7.6448374320714745, "grad_norm": 3.1173386573791504, "learning_rate": 1.9512659792615e-05, "loss": 0.6023, "step": 332000 }, { "epoch": 7.649442755825735, "grad_norm": 2.7219719886779785, "learning_rate": 1.9505388228792485e-05, "loss": 0.5869, "step": 332200 }, { "epoch": 7.654048079579995, "grad_norm": 2.787108898162842, "learning_rate": 1.949811666496997e-05, "loss": 0.6002, "step": 332400 }, { "epoch": 7.658653403334254, "grad_norm": 2.7488784790039062, "learning_rate": 1.9490845101147453e-05, "loss": 0.5946, "step": 332600 }, { "epoch": 7.663258727088515, "grad_norm": 3.056337833404541, "learning_rate": 1.948357353732494e-05, "loss": 0.5933, "step": 332800 }, { "epoch": 7.667864050842774, "grad_norm": 3.6845805644989014, "learning_rate": 1.947630197350242e-05, "loss": 0.5947, "step": 333000 }, { "epoch": 7.672469374597034, "grad_norm": 2.5756258964538574, "learning_rate": 1.9469030409679904e-05, "loss": 0.5984, "step": 333200 }, { "epoch": 7.677074698351294, "grad_norm": 3.2758781909942627, "learning_rate": 1.946175884585739e-05, "loss": 0.5955, "step": 333400 }, { "epoch": 7.681680022105554, "grad_norm": 3.465355634689331, "learning_rate": 1.9454487282034876e-05, "loss": 0.5903, "step": 333600 }, { "epoch": 7.6862853458598135, "grad_norm": 3.2135512828826904, "learning_rate": 1.944721571821236e-05, "loss": 0.5864, "step": 333800 }, { "epoch": 7.690890669614074, "grad_norm": 2.3275580406188965, "learning_rate": 1.9439944154389844e-05, "loss": 0.5797, "step": 334000 }, { "epoch": 7.695495993368334, "grad_norm": 2.834540843963623, "learning_rate": 1.9432672590567327e-05, "loss": 0.5891, "step": 334200 }, { "epoch": 7.700101317122594, "grad_norm": 2.680785894393921, "learning_rate": 1.9425401026744813e-05, "loss": 0.5916, "step": 334400 }, { "epoch": 7.704706640876854, "grad_norm": 4.272080421447754, "learning_rate": 1.9418129462922295e-05, "loss": 0.5999, "step": 334600 }, { "epoch": 7.709311964631113, "grad_norm": 2.508699417114258, "learning_rate": 1.941085789909978e-05, "loss": 0.5903, "step": 334800 }, { "epoch": 7.713917288385374, "grad_norm": 4.05361270904541, "learning_rate": 1.9403586335277267e-05, "loss": 0.5947, "step": 335000 }, { "epoch": 7.718522612139633, "grad_norm": 2.704547882080078, "learning_rate": 1.939631477145475e-05, "loss": 0.5867, "step": 335200 }, { "epoch": 7.723127935893894, "grad_norm": 2.987114667892456, "learning_rate": 1.9389043207632235e-05, "loss": 0.5883, "step": 335400 }, { "epoch": 7.727733259648153, "grad_norm": 2.5852272510528564, "learning_rate": 1.9381771643809718e-05, "loss": 0.6022, "step": 335600 }, { "epoch": 7.732338583402413, "grad_norm": 2.52404522895813, "learning_rate": 1.93745000799872e-05, "loss": 0.5791, "step": 335800 }, { "epoch": 7.7369439071566735, "grad_norm": 2.9356820583343506, "learning_rate": 1.9367228516164686e-05, "loss": 0.5922, "step": 336000 }, { "epoch": 7.741549230910933, "grad_norm": 4.60886287689209, "learning_rate": 1.9359956952342172e-05, "loss": 0.583, "step": 336200 }, { "epoch": 7.746154554665193, "grad_norm": 4.0157856941223145, "learning_rate": 1.9352685388519655e-05, "loss": 0.5979, "step": 336400 }, { "epoch": 7.750759878419453, "grad_norm": 2.8278868198394775, "learning_rate": 1.934541382469714e-05, "loss": 0.603, "step": 336600 }, { "epoch": 7.755365202173713, "grad_norm": 2.852208375930786, "learning_rate": 1.9338142260874623e-05, "loss": 0.5775, "step": 336800 }, { "epoch": 7.759970525927972, "grad_norm": 3.623089075088501, "learning_rate": 1.933087069705211e-05, "loss": 0.589, "step": 337000 }, { "epoch": 7.764575849682233, "grad_norm": 2.7371058464050293, "learning_rate": 1.9323635491048704e-05, "loss": 0.5884, "step": 337200 }, { "epoch": 7.769181173436492, "grad_norm": 4.192208290100098, "learning_rate": 1.931636392722619e-05, "loss": 0.5906, "step": 337400 }, { "epoch": 7.773786497190753, "grad_norm": 4.169963836669922, "learning_rate": 1.9309128721222786e-05, "loss": 0.6003, "step": 337600 }, { "epoch": 7.7783918209450125, "grad_norm": 3.134167432785034, "learning_rate": 1.930185715740027e-05, "loss": 0.6032, "step": 337800 }, { "epoch": 7.782997144699272, "grad_norm": 3.1043667793273926, "learning_rate": 1.9294585593577757e-05, "loss": 0.5872, "step": 338000 }, { "epoch": 7.7876024684535325, "grad_norm": 2.8810665607452393, "learning_rate": 1.928731402975524e-05, "loss": 0.5848, "step": 338200 }, { "epoch": 7.792207792207792, "grad_norm": 2.3954150676727295, "learning_rate": 1.9280042465932722e-05, "loss": 0.587, "step": 338400 }, { "epoch": 7.796813115962053, "grad_norm": 3.5883309841156006, "learning_rate": 1.927277090211021e-05, "loss": 0.5954, "step": 338600 }, { "epoch": 7.801418439716312, "grad_norm": 2.6461853981018066, "learning_rate": 1.926549933828769e-05, "loss": 0.5901, "step": 338800 }, { "epoch": 7.806023763470572, "grad_norm": 3.625852108001709, "learning_rate": 1.9258227774465177e-05, "loss": 0.5784, "step": 339000 }, { "epoch": 7.810629087224832, "grad_norm": 3.1065762042999268, "learning_rate": 1.9250956210642663e-05, "loss": 0.5917, "step": 339200 }, { "epoch": 7.815234410979092, "grad_norm": 2.77862286567688, "learning_rate": 1.9243684646820145e-05, "loss": 0.5922, "step": 339400 }, { "epoch": 7.8198397347333515, "grad_norm": 4.303182601928711, "learning_rate": 1.923641308299763e-05, "loss": 0.5842, "step": 339600 }, { "epoch": 7.824445058487612, "grad_norm": 3.4562771320343018, "learning_rate": 1.9229141519175114e-05, "loss": 0.6, "step": 339800 }, { "epoch": 7.8290503822418716, "grad_norm": 2.983295440673828, "learning_rate": 1.9221869955352596e-05, "loss": 0.6038, "step": 340000 }, { "epoch": 7.833655705996131, "grad_norm": 3.2224607467651367, "learning_rate": 1.9214598391530082e-05, "loss": 0.5834, "step": 340200 }, { "epoch": 7.838261029750392, "grad_norm": 3.168325185775757, "learning_rate": 1.9207326827707568e-05, "loss": 0.5772, "step": 340400 }, { "epoch": 7.842866353504651, "grad_norm": 2.240220546722412, "learning_rate": 1.9200055263885054e-05, "loss": 0.592, "step": 340600 }, { "epoch": 7.847471677258911, "grad_norm": 3.125988245010376, "learning_rate": 1.9192783700062536e-05, "loss": 0.579, "step": 340800 }, { "epoch": 7.852077001013171, "grad_norm": 3.029654026031494, "learning_rate": 1.918551213624002e-05, "loss": 0.6008, "step": 341000 }, { "epoch": 7.856682324767431, "grad_norm": 3.496350049972534, "learning_rate": 1.9178240572417505e-05, "loss": 0.584, "step": 341200 }, { "epoch": 7.861287648521691, "grad_norm": 3.7270963191986084, "learning_rate": 1.9170969008594987e-05, "loss": 0.5942, "step": 341400 }, { "epoch": 7.865892972275951, "grad_norm": 3.109341621398926, "learning_rate": 1.9163697444772473e-05, "loss": 0.5729, "step": 341600 }, { "epoch": 7.870498296030211, "grad_norm": 2.5632474422454834, "learning_rate": 1.915642588094996e-05, "loss": 0.5945, "step": 341800 }, { "epoch": 7.875103619784471, "grad_norm": 2.6529979705810547, "learning_rate": 1.9149154317127442e-05, "loss": 0.5764, "step": 342000 }, { "epoch": 7.879708943538731, "grad_norm": 2.466392993927002, "learning_rate": 1.9141882753304928e-05, "loss": 0.5857, "step": 342200 }, { "epoch": 7.884314267292991, "grad_norm": 4.523742198944092, "learning_rate": 1.913461118948241e-05, "loss": 0.5771, "step": 342400 }, { "epoch": 7.888919591047251, "grad_norm": 3.0389695167541504, "learning_rate": 1.9127339625659893e-05, "loss": 0.587, "step": 342600 }, { "epoch": 7.89352491480151, "grad_norm": 3.1406664848327637, "learning_rate": 1.912006806183738e-05, "loss": 0.5812, "step": 342800 }, { "epoch": 7.898130238555771, "grad_norm": 2.3682878017425537, "learning_rate": 1.9112796498014865e-05, "loss": 0.5873, "step": 343000 }, { "epoch": 7.90273556231003, "grad_norm": 2.9673991203308105, "learning_rate": 1.910552493419235e-05, "loss": 0.5781, "step": 343200 }, { "epoch": 7.90734088606429, "grad_norm": 3.378129243850708, "learning_rate": 1.9098253370369833e-05, "loss": 0.5848, "step": 343400 }, { "epoch": 7.9119462098185505, "grad_norm": 3.566913366317749, "learning_rate": 1.9090981806547316e-05, "loss": 0.5836, "step": 343600 }, { "epoch": 7.91655153357281, "grad_norm": 3.525033712387085, "learning_rate": 1.90837102427248e-05, "loss": 0.594, "step": 343800 }, { "epoch": 7.92115685732707, "grad_norm": 2.8252060413360596, "learning_rate": 1.9076438678902284e-05, "loss": 0.5913, "step": 344000 }, { "epoch": 7.92576218108133, "grad_norm": 4.159696102142334, "learning_rate": 1.9069203472898883e-05, "loss": 0.5845, "step": 344200 }, { "epoch": 7.93036750483559, "grad_norm": 3.0841197967529297, "learning_rate": 1.9061968266895478e-05, "loss": 0.5936, "step": 344400 }, { "epoch": 7.93497282858985, "grad_norm": 2.3615498542785645, "learning_rate": 1.9054696703072964e-05, "loss": 0.5897, "step": 344600 }, { "epoch": 7.93957815234411, "grad_norm": 2.4957964420318604, "learning_rate": 1.904742513925045e-05, "loss": 0.5839, "step": 344800 }, { "epoch": 7.944183476098369, "grad_norm": 3.6795175075531006, "learning_rate": 1.9040153575427932e-05, "loss": 0.5876, "step": 345000 }, { "epoch": 7.94878879985263, "grad_norm": 2.6692540645599365, "learning_rate": 1.9032882011605415e-05, "loss": 0.5847, "step": 345200 }, { "epoch": 7.9533941236068895, "grad_norm": 3.4338459968566895, "learning_rate": 1.90256104477829e-05, "loss": 0.5909, "step": 345400 }, { "epoch": 7.95799944736115, "grad_norm": 2.6297929286956787, "learning_rate": 1.9018375241779496e-05, "loss": 0.5925, "step": 345600 }, { "epoch": 7.9626047711154095, "grad_norm": 3.2749218940734863, "learning_rate": 1.9011103677956982e-05, "loss": 0.586, "step": 345800 }, { "epoch": 7.967210094869669, "grad_norm": 2.956953763961792, "learning_rate": 1.9003832114134468e-05, "loss": 0.5992, "step": 346000 }, { "epoch": 7.97181541862393, "grad_norm": 3.2748615741729736, "learning_rate": 1.899656055031195e-05, "loss": 0.5948, "step": 346200 }, { "epoch": 7.976420742378189, "grad_norm": 3.0381953716278076, "learning_rate": 1.8989288986489436e-05, "loss": 0.5946, "step": 346400 }, { "epoch": 7.981026066132449, "grad_norm": 2.761564254760742, "learning_rate": 1.8982017422666922e-05, "loss": 0.5925, "step": 346600 }, { "epoch": 7.985631389886709, "grad_norm": 2.955965042114258, "learning_rate": 1.89747458588444e-05, "loss": 0.5809, "step": 346800 }, { "epoch": 7.990236713640969, "grad_norm": 3.134218454360962, "learning_rate": 1.8967474295021887e-05, "loss": 0.5939, "step": 347000 }, { "epoch": 7.9948420373952285, "grad_norm": 2.840864896774292, "learning_rate": 1.8960202731199373e-05, "loss": 0.5862, "step": 347200 }, { "epoch": 7.999447361149489, "grad_norm": 2.9396276473999023, "learning_rate": 1.8952931167376855e-05, "loss": 0.588, "step": 347400 }, { "epoch": 8.0, "eval_loss": 0.5577627420425415, "eval_runtime": 145.2006, "eval_samples_per_second": 195.323, "eval_steps_per_second": 12.211, "step": 347424 }, { "epoch": 8.00405268490375, "grad_norm": 2.531428098678589, "learning_rate": 1.894565960355434e-05, "loss": 0.5814, "step": 347600 }, { "epoch": 8.008658008658008, "grad_norm": 3.6581764221191406, "learning_rate": 1.8938388039731827e-05, "loss": 0.5853, "step": 347800 }, { "epoch": 8.013263332412269, "grad_norm": 2.494723081588745, "learning_rate": 1.8931116475909306e-05, "loss": 0.5866, "step": 348000 }, { "epoch": 8.017868656166529, "grad_norm": 2.6664531230926514, "learning_rate": 1.8923844912086792e-05, "loss": 0.5971, "step": 348200 }, { "epoch": 8.022473979920788, "grad_norm": 2.837907552719116, "learning_rate": 1.8916573348264278e-05, "loss": 0.583, "step": 348400 }, { "epoch": 8.027079303675048, "grad_norm": 3.521794557571411, "learning_rate": 1.8909301784441764e-05, "loss": 0.5916, "step": 348600 }, { "epoch": 8.031684627429309, "grad_norm": 2.823016881942749, "learning_rate": 1.8902030220619247e-05, "loss": 0.5879, "step": 348800 }, { "epoch": 8.036289951183567, "grad_norm": 2.7679924964904785, "learning_rate": 1.8894758656796733e-05, "loss": 0.5776, "step": 349000 }, { "epoch": 8.040895274937828, "grad_norm": 4.573503017425537, "learning_rate": 1.888748709297422e-05, "loss": 0.5819, "step": 349200 }, { "epoch": 8.045500598692088, "grad_norm": 2.8802812099456787, "learning_rate": 1.8880215529151698e-05, "loss": 0.5816, "step": 349400 }, { "epoch": 8.050105922446347, "grad_norm": 2.96124005317688, "learning_rate": 1.8872943965329184e-05, "loss": 0.6029, "step": 349600 }, { "epoch": 8.054711246200608, "grad_norm": 2.664660930633545, "learning_rate": 1.886567240150667e-05, "loss": 0.5707, "step": 349800 }, { "epoch": 8.059316569954868, "grad_norm": 3.073604106903076, "learning_rate": 1.8858400837684152e-05, "loss": 0.5928, "step": 350000 }, { "epoch": 8.063921893709129, "grad_norm": 3.5625150203704834, "learning_rate": 1.8851129273861638e-05, "loss": 0.5908, "step": 350200 }, { "epoch": 8.068527217463387, "grad_norm": 2.9049417972564697, "learning_rate": 1.8843857710039124e-05, "loss": 0.5835, "step": 350400 }, { "epoch": 8.073132541217648, "grad_norm": 3.092867851257324, "learning_rate": 1.8836586146216603e-05, "loss": 0.5788, "step": 350600 }, { "epoch": 8.077737864971908, "grad_norm": 2.719918727874756, "learning_rate": 1.88293509402132e-05, "loss": 0.5823, "step": 350800 }, { "epoch": 8.082343188726167, "grad_norm": 4.278101444244385, "learning_rate": 1.8822115734209797e-05, "loss": 0.5818, "step": 351000 }, { "epoch": 8.086948512480427, "grad_norm": 2.67177677154541, "learning_rate": 1.8814844170387283e-05, "loss": 0.5893, "step": 351200 }, { "epoch": 8.091553836234688, "grad_norm": 3.964667558670044, "learning_rate": 1.880757260656477e-05, "loss": 0.5855, "step": 351400 }, { "epoch": 8.096159159988947, "grad_norm": 3.183964252471924, "learning_rate": 1.880030104274225e-05, "loss": 0.5794, "step": 351600 }, { "epoch": 8.100764483743207, "grad_norm": 2.9570157527923584, "learning_rate": 1.8793029478919737e-05, "loss": 0.5979, "step": 351800 }, { "epoch": 8.105369807497468, "grad_norm": 3.8420159816741943, "learning_rate": 1.8785757915097223e-05, "loss": 0.5792, "step": 352000 }, { "epoch": 8.109975131251726, "grad_norm": 4.49518346786499, "learning_rate": 1.8778486351274705e-05, "loss": 0.5845, "step": 352200 }, { "epoch": 8.114580455005987, "grad_norm": 2.9264657497406006, "learning_rate": 1.8771214787452188e-05, "loss": 0.5866, "step": 352400 }, { "epoch": 8.119185778760247, "grad_norm": 2.69858455657959, "learning_rate": 1.8763943223629674e-05, "loss": 0.5859, "step": 352600 }, { "epoch": 8.123791102514506, "grad_norm": 3.450650453567505, "learning_rate": 1.875667165980716e-05, "loss": 0.5838, "step": 352800 }, { "epoch": 8.128396426268766, "grad_norm": 2.7137413024902344, "learning_rate": 1.8749400095984642e-05, "loss": 0.5824, "step": 353000 }, { "epoch": 8.133001750023027, "grad_norm": 3.0916788578033447, "learning_rate": 1.8742128532162128e-05, "loss": 0.5959, "step": 353200 }, { "epoch": 8.137607073777286, "grad_norm": 3.159905433654785, "learning_rate": 1.8734856968339614e-05, "loss": 0.5792, "step": 353400 }, { "epoch": 8.142212397531546, "grad_norm": 3.4873342514038086, "learning_rate": 1.8727585404517093e-05, "loss": 0.5834, "step": 353600 }, { "epoch": 8.146817721285807, "grad_norm": 3.2853617668151855, "learning_rate": 1.872031384069458e-05, "loss": 0.5818, "step": 353800 }, { "epoch": 8.151423045040067, "grad_norm": 4.945338726043701, "learning_rate": 1.8713042276872065e-05, "loss": 0.579, "step": 354000 }, { "epoch": 8.156028368794326, "grad_norm": 2.7136645317077637, "learning_rate": 1.8705770713049548e-05, "loss": 0.5743, "step": 354200 }, { "epoch": 8.160633692548586, "grad_norm": 3.545706272125244, "learning_rate": 1.8698535507046146e-05, "loss": 0.5861, "step": 354400 }, { "epoch": 8.165239016302847, "grad_norm": 2.9433093070983887, "learning_rate": 1.8691263943223632e-05, "loss": 0.5775, "step": 354600 }, { "epoch": 8.169844340057105, "grad_norm": 3.4373202323913574, "learning_rate": 1.8683992379401115e-05, "loss": 0.589, "step": 354800 }, { "epoch": 8.174449663811366, "grad_norm": 3.6623363494873047, "learning_rate": 1.8676720815578597e-05, "loss": 0.5737, "step": 355000 }, { "epoch": 8.179054987565626, "grad_norm": 3.4433813095092773, "learning_rate": 1.8669449251756083e-05, "loss": 0.5852, "step": 355200 }, { "epoch": 8.183660311319885, "grad_norm": 3.2509543895721436, "learning_rate": 1.8662177687933566e-05, "loss": 0.5787, "step": 355400 }, { "epoch": 8.188265635074146, "grad_norm": 2.7837679386138916, "learning_rate": 1.865490612411105e-05, "loss": 0.5808, "step": 355600 }, { "epoch": 8.192870958828406, "grad_norm": 3.2710013389587402, "learning_rate": 1.8647634560288538e-05, "loss": 0.5764, "step": 355800 }, { "epoch": 8.197476282582665, "grad_norm": 3.4434151649475098, "learning_rate": 1.864036299646602e-05, "loss": 0.5755, "step": 356000 }, { "epoch": 8.202081606336925, "grad_norm": 3.4400975704193115, "learning_rate": 1.8633091432643503e-05, "loss": 0.5874, "step": 356200 }, { "epoch": 8.206686930091186, "grad_norm": 3.25714111328125, "learning_rate": 1.862581986882099e-05, "loss": 0.5868, "step": 356400 }, { "epoch": 8.211292253845444, "grad_norm": 5.997415542602539, "learning_rate": 1.8618548304998474e-05, "loss": 0.5689, "step": 356600 }, { "epoch": 8.215897577599705, "grad_norm": 3.3374671936035156, "learning_rate": 1.861131309899507e-05, "loss": 0.5931, "step": 356800 }, { "epoch": 8.220502901353965, "grad_norm": 2.680866241455078, "learning_rate": 1.8604041535172556e-05, "loss": 0.5855, "step": 357000 }, { "epoch": 8.225108225108226, "grad_norm": 3.1401114463806152, "learning_rate": 1.8596769971350038e-05, "loss": 0.5864, "step": 357200 }, { "epoch": 8.229713548862485, "grad_norm": 2.7683680057525635, "learning_rate": 1.8589498407527524e-05, "loss": 0.578, "step": 357400 }, { "epoch": 8.234318872616745, "grad_norm": 3.4666707515716553, "learning_rate": 1.858222684370501e-05, "loss": 0.5788, "step": 357600 }, { "epoch": 8.238924196371006, "grad_norm": 2.9666600227355957, "learning_rate": 1.857495527988249e-05, "loss": 0.5948, "step": 357800 }, { "epoch": 8.243529520125264, "grad_norm": 3.1708505153656006, "learning_rate": 1.8567683716059975e-05, "loss": 0.5724, "step": 358000 }, { "epoch": 8.248134843879525, "grad_norm": 3.3768389225006104, "learning_rate": 1.856041215223746e-05, "loss": 0.5721, "step": 358200 }, { "epoch": 8.252740167633785, "grad_norm": 2.644984722137451, "learning_rate": 1.8553140588414947e-05, "loss": 0.5743, "step": 358400 }, { "epoch": 8.257345491388044, "grad_norm": 3.155850648880005, "learning_rate": 1.854586902459243e-05, "loss": 0.5792, "step": 358600 }, { "epoch": 8.261950815142304, "grad_norm": 2.7669639587402344, "learning_rate": 1.8538597460769915e-05, "loss": 0.5915, "step": 358800 }, { "epoch": 8.266556138896565, "grad_norm": 3.183368682861328, "learning_rate": 1.8531325896947398e-05, "loss": 0.5899, "step": 359000 }, { "epoch": 8.271161462650824, "grad_norm": 2.635927677154541, "learning_rate": 1.852405433312488e-05, "loss": 0.588, "step": 359200 }, { "epoch": 8.275766786405084, "grad_norm": 3.761402130126953, "learning_rate": 1.8516782769302366e-05, "loss": 0.5888, "step": 359400 }, { "epoch": 8.280372110159345, "grad_norm": 2.46097469329834, "learning_rate": 1.850954756329896e-05, "loss": 0.5802, "step": 359600 }, { "epoch": 8.284977433913603, "grad_norm": 2.9108171463012695, "learning_rate": 1.8502275999476447e-05, "loss": 0.5836, "step": 359800 }, { "epoch": 8.289582757667864, "grad_norm": 3.4605486392974854, "learning_rate": 1.8495004435653933e-05, "loss": 0.5948, "step": 360000 }, { "epoch": 8.294188081422124, "grad_norm": 3.1106951236724854, "learning_rate": 1.848773287183142e-05, "loss": 0.5734, "step": 360200 }, { "epoch": 8.298793405176385, "grad_norm": 2.4587388038635254, "learning_rate": 1.84804613080089e-05, "loss": 0.5776, "step": 360400 }, { "epoch": 8.303398728930643, "grad_norm": 2.9624781608581543, "learning_rate": 1.8473189744186384e-05, "loss": 0.5843, "step": 360600 }, { "epoch": 8.308004052684904, "grad_norm": 2.9920835494995117, "learning_rate": 1.846591818036387e-05, "loss": 0.5915, "step": 360800 }, { "epoch": 8.312609376439164, "grad_norm": 2.7066526412963867, "learning_rate": 1.8458646616541353e-05, "loss": 0.5826, "step": 361000 }, { "epoch": 8.317214700193423, "grad_norm": 3.3276214599609375, "learning_rate": 1.845141141053795e-05, "loss": 0.5697, "step": 361200 }, { "epoch": 8.321820023947684, "grad_norm": 3.875354290008545, "learning_rate": 1.8444139846715434e-05, "loss": 0.5921, "step": 361400 }, { "epoch": 8.326425347701944, "grad_norm": 2.708665132522583, "learning_rate": 1.8436904640712032e-05, "loss": 0.5803, "step": 361600 }, { "epoch": 8.331030671456203, "grad_norm": 3.221430540084839, "learning_rate": 1.8429633076889518e-05, "loss": 0.5827, "step": 361800 }, { "epoch": 8.335635995210463, "grad_norm": 2.921086549758911, "learning_rate": 1.8422361513067e-05, "loss": 0.5774, "step": 362000 }, { "epoch": 8.340241318964724, "grad_norm": 3.2136096954345703, "learning_rate": 1.8415089949244483e-05, "loss": 0.5872, "step": 362200 }, { "epoch": 8.344846642718982, "grad_norm": 2.5168769359588623, "learning_rate": 1.840781838542197e-05, "loss": 0.5836, "step": 362400 }, { "epoch": 8.349451966473243, "grad_norm": 2.8332345485687256, "learning_rate": 1.8400546821599452e-05, "loss": 0.5748, "step": 362600 }, { "epoch": 8.354057290227503, "grad_norm": 2.76617431640625, "learning_rate": 1.8393275257776938e-05, "loss": 0.5802, "step": 362800 }, { "epoch": 8.358662613981762, "grad_norm": 3.170368194580078, "learning_rate": 1.8386003693954424e-05, "loss": 0.5819, "step": 363000 }, { "epoch": 8.363267937736023, "grad_norm": 3.0185418128967285, "learning_rate": 1.8378732130131906e-05, "loss": 0.5835, "step": 363200 }, { "epoch": 8.367873261490283, "grad_norm": 3.179088830947876, "learning_rate": 1.837146056630939e-05, "loss": 0.5827, "step": 363400 }, { "epoch": 8.372478585244544, "grad_norm": 2.731213331222534, "learning_rate": 1.8364189002486875e-05, "loss": 0.5866, "step": 363600 }, { "epoch": 8.377083908998802, "grad_norm": 2.9844472408294678, "learning_rate": 1.835691743866436e-05, "loss": 0.5795, "step": 363800 }, { "epoch": 8.381689232753063, "grad_norm": 3.1267082691192627, "learning_rate": 1.8349645874841843e-05, "loss": 0.5794, "step": 364000 }, { "epoch": 8.386294556507323, "grad_norm": 2.6979808807373047, "learning_rate": 1.834237431101933e-05, "loss": 0.5956, "step": 364200 }, { "epoch": 8.390899880261582, "grad_norm": 2.9630825519561768, "learning_rate": 1.8335102747196815e-05, "loss": 0.5796, "step": 364400 }, { "epoch": 8.395505204015842, "grad_norm": 3.268383741378784, "learning_rate": 1.8327831183374297e-05, "loss": 0.5943, "step": 364600 }, { "epoch": 8.400110527770103, "grad_norm": 2.594475507736206, "learning_rate": 1.832055961955178e-05, "loss": 0.5741, "step": 364800 }, { "epoch": 8.404715851524362, "grad_norm": 2.8668203353881836, "learning_rate": 1.8313288055729266e-05, "loss": 0.5843, "step": 365000 }, { "epoch": 8.409321175278622, "grad_norm": 2.9619016647338867, "learning_rate": 1.8306016491906748e-05, "loss": 0.5845, "step": 365200 }, { "epoch": 8.413926499032883, "grad_norm": 3.0649566650390625, "learning_rate": 1.8298744928084234e-05, "loss": 0.6053, "step": 365400 }, { "epoch": 8.418531822787141, "grad_norm": 3.129210948944092, "learning_rate": 1.829147336426172e-05, "loss": 0.5795, "step": 365600 }, { "epoch": 8.423137146541402, "grad_norm": 3.2703466415405273, "learning_rate": 1.8284201800439203e-05, "loss": 0.5817, "step": 365800 }, { "epoch": 8.427742470295662, "grad_norm": 3.3453192710876465, "learning_rate": 1.8276930236616685e-05, "loss": 0.5837, "step": 366000 }, { "epoch": 8.432347794049921, "grad_norm": 3.5036461353302, "learning_rate": 1.826965867279417e-05, "loss": 0.588, "step": 366200 }, { "epoch": 8.436953117804181, "grad_norm": 3.5221807956695557, "learning_rate": 1.8262387108971657e-05, "loss": 0.5866, "step": 366400 }, { "epoch": 8.441558441558442, "grad_norm": 3.2333667278289795, "learning_rate": 1.825511554514914e-05, "loss": 0.5848, "step": 366600 }, { "epoch": 8.4461637653127, "grad_norm": 3.349844455718994, "learning_rate": 1.8247843981326625e-05, "loss": 0.5864, "step": 366800 }, { "epoch": 8.450769089066961, "grad_norm": 3.0328314304351807, "learning_rate": 1.824057241750411e-05, "loss": 0.5713, "step": 367000 }, { "epoch": 8.455374412821222, "grad_norm": 3.0258967876434326, "learning_rate": 1.8233300853681594e-05, "loss": 0.5747, "step": 367200 }, { "epoch": 8.459979736575482, "grad_norm": 2.960977792739868, "learning_rate": 1.8226029289859076e-05, "loss": 0.5685, "step": 367400 }, { "epoch": 8.46458506032974, "grad_norm": 3.144864797592163, "learning_rate": 1.8218757726036562e-05, "loss": 0.5863, "step": 367600 }, { "epoch": 8.469190384084001, "grad_norm": 2.93837833404541, "learning_rate": 1.8211486162214045e-05, "loss": 0.5778, "step": 367800 }, { "epoch": 8.473795707838262, "grad_norm": 3.0120725631713867, "learning_rate": 1.8204250956210643e-05, "loss": 0.5898, "step": 368000 }, { "epoch": 8.47840103159252, "grad_norm": 2.907320976257324, "learning_rate": 1.819697939238813e-05, "loss": 0.58, "step": 368200 }, { "epoch": 8.483006355346781, "grad_norm": 2.7009003162384033, "learning_rate": 1.8189744186384725e-05, "loss": 0.5916, "step": 368400 }, { "epoch": 8.487611679101041, "grad_norm": 2.902514696121216, "learning_rate": 1.818247262256221e-05, "loss": 0.5683, "step": 368600 }, { "epoch": 8.4922170028553, "grad_norm": 3.5447628498077393, "learning_rate": 1.8175201058739693e-05, "loss": 0.5765, "step": 368800 }, { "epoch": 8.49682232660956, "grad_norm": 3.0981380939483643, "learning_rate": 1.8167929494917176e-05, "loss": 0.5716, "step": 369000 }, { "epoch": 8.501427650363821, "grad_norm": 2.814976453781128, "learning_rate": 1.816065793109466e-05, "loss": 0.5864, "step": 369200 }, { "epoch": 8.50603297411808, "grad_norm": 3.170222043991089, "learning_rate": 1.8153386367272144e-05, "loss": 0.5754, "step": 369400 }, { "epoch": 8.51063829787234, "grad_norm": 2.6524722576141357, "learning_rate": 1.814611480344963e-05, "loss": 0.5859, "step": 369600 }, { "epoch": 8.5152436216266, "grad_norm": 3.715766191482544, "learning_rate": 1.8138843239627116e-05, "loss": 0.5889, "step": 369800 }, { "epoch": 8.51984894538086, "grad_norm": 3.317150592803955, "learning_rate": 1.8131571675804602e-05, "loss": 0.5823, "step": 370000 }, { "epoch": 8.52445426913512, "grad_norm": 3.7694554328918457, "learning_rate": 1.812430011198208e-05, "loss": 0.5776, "step": 370200 }, { "epoch": 8.52905959288938, "grad_norm": 3.328153371810913, "learning_rate": 1.8117028548159567e-05, "loss": 0.5895, "step": 370400 }, { "epoch": 8.533664916643641, "grad_norm": 3.546926498413086, "learning_rate": 1.8109756984337053e-05, "loss": 0.5798, "step": 370600 }, { "epoch": 8.5382702403979, "grad_norm": 3.3275558948516846, "learning_rate": 1.8102485420514535e-05, "loss": 0.5787, "step": 370800 }, { "epoch": 8.54287556415216, "grad_norm": 2.915025234222412, "learning_rate": 1.809521385669202e-05, "loss": 0.5742, "step": 371000 }, { "epoch": 8.54748088790642, "grad_norm": 2.634218454360962, "learning_rate": 1.8087942292869507e-05, "loss": 0.5869, "step": 371200 }, { "epoch": 8.55208621166068, "grad_norm": 3.156991481781006, "learning_rate": 1.808067072904699e-05, "loss": 0.582, "step": 371400 }, { "epoch": 8.55669153541494, "grad_norm": 3.4311141967773438, "learning_rate": 1.8073399165224472e-05, "loss": 0.5955, "step": 371600 }, { "epoch": 8.5612968591692, "grad_norm": 2.9205784797668457, "learning_rate": 1.8066127601401958e-05, "loss": 0.5826, "step": 371800 }, { "epoch": 8.565902182923459, "grad_norm": 3.045046329498291, "learning_rate": 1.805885603757944e-05, "loss": 0.5948, "step": 372000 }, { "epoch": 8.57050750667772, "grad_norm": 3.506364583969116, "learning_rate": 1.8051584473756926e-05, "loss": 0.5762, "step": 372200 }, { "epoch": 8.57511283043198, "grad_norm": 3.3249881267547607, "learning_rate": 1.8044312909934412e-05, "loss": 0.5784, "step": 372400 }, { "epoch": 8.579718154186239, "grad_norm": 3.3042402267456055, "learning_rate": 1.8037041346111898e-05, "loss": 0.5907, "step": 372600 }, { "epoch": 8.584323477940499, "grad_norm": 2.80226469039917, "learning_rate": 1.8029769782289377e-05, "loss": 0.5917, "step": 372800 }, { "epoch": 8.58892880169476, "grad_norm": 3.677151679992676, "learning_rate": 1.8022498218466863e-05, "loss": 0.5947, "step": 373000 }, { "epoch": 8.593534125449018, "grad_norm": 2.8977012634277344, "learning_rate": 1.801522665464435e-05, "loss": 0.5859, "step": 373200 }, { "epoch": 8.598139449203279, "grad_norm": 3.019442558288574, "learning_rate": 1.8007955090821832e-05, "loss": 0.5862, "step": 373400 }, { "epoch": 8.60274477295754, "grad_norm": 3.2348787784576416, "learning_rate": 1.8000683526999318e-05, "loss": 0.5909, "step": 373600 }, { "epoch": 8.607350096711798, "grad_norm": 3.2550971508026123, "learning_rate": 1.7993411963176804e-05, "loss": 0.5851, "step": 373800 }, { "epoch": 8.611955420466058, "grad_norm": 2.6115951538085938, "learning_rate": 1.7986140399354286e-05, "loss": 0.5935, "step": 374000 }, { "epoch": 8.616560744220319, "grad_norm": 3.2661094665527344, "learning_rate": 1.797886883553177e-05, "loss": 0.581, "step": 374200 }, { "epoch": 8.62116606797458, "grad_norm": 3.3261373043060303, "learning_rate": 1.7971597271709255e-05, "loss": 0.5674, "step": 374400 }, { "epoch": 8.625771391728838, "grad_norm": 3.81058669090271, "learning_rate": 1.7964325707886737e-05, "loss": 0.5717, "step": 374600 }, { "epoch": 8.630376715483099, "grad_norm": 3.5395662784576416, "learning_rate": 1.7957090501883336e-05, "loss": 0.5872, "step": 374800 }, { "epoch": 8.634982039237359, "grad_norm": 3.262457847595215, "learning_rate": 1.794981893806082e-05, "loss": 0.5776, "step": 375000 }, { "epoch": 8.639587362991618, "grad_norm": 3.360276460647583, "learning_rate": 1.7942547374238304e-05, "loss": 0.5762, "step": 375200 }, { "epoch": 8.644192686745878, "grad_norm": 3.0414352416992188, "learning_rate": 1.793527581041579e-05, "loss": 0.5866, "step": 375400 }, { "epoch": 8.648798010500139, "grad_norm": 3.696171283721924, "learning_rate": 1.7928004246593273e-05, "loss": 0.5764, "step": 375600 }, { "epoch": 8.653403334254397, "grad_norm": 3.294072389602661, "learning_rate": 1.7920732682770755e-05, "loss": 0.5772, "step": 375800 }, { "epoch": 8.658008658008658, "grad_norm": 3.592628240585327, "learning_rate": 1.791346111894824e-05, "loss": 0.5712, "step": 376000 }, { "epoch": 8.662613981762918, "grad_norm": 2.2415127754211426, "learning_rate": 1.7906189555125727e-05, "loss": 0.5692, "step": 376200 }, { "epoch": 8.667219305517177, "grad_norm": 3.072105646133423, "learning_rate": 1.7898954349122322e-05, "loss": 0.5747, "step": 376400 }, { "epoch": 8.671824629271438, "grad_norm": 3.045968770980835, "learning_rate": 1.7891682785299808e-05, "loss": 0.5738, "step": 376600 }, { "epoch": 8.676429953025698, "grad_norm": 3.102426052093506, "learning_rate": 1.7884411221477294e-05, "loss": 0.5619, "step": 376800 }, { "epoch": 8.681035276779959, "grad_norm": 2.7489142417907715, "learning_rate": 1.7877139657654773e-05, "loss": 0.5673, "step": 377000 }, { "epoch": 8.685640600534217, "grad_norm": 2.9566493034362793, "learning_rate": 1.786986809383226e-05, "loss": 0.5868, "step": 377200 }, { "epoch": 8.690245924288478, "grad_norm": 3.6835222244262695, "learning_rate": 1.7862596530009745e-05, "loss": 0.5878, "step": 377400 }, { "epoch": 8.694851248042738, "grad_norm": 3.0927047729492188, "learning_rate": 1.7855324966187227e-05, "loss": 0.59, "step": 377600 }, { "epoch": 8.699456571796997, "grad_norm": 3.5249454975128174, "learning_rate": 1.7848053402364713e-05, "loss": 0.5731, "step": 377800 }, { "epoch": 8.704061895551257, "grad_norm": 2.711740732192993, "learning_rate": 1.78407818385422e-05, "loss": 0.587, "step": 378000 }, { "epoch": 8.708667219305518, "grad_norm": 3.440530776977539, "learning_rate": 1.7833510274719682e-05, "loss": 0.582, "step": 378200 }, { "epoch": 8.713272543059777, "grad_norm": 3.404754400253296, "learning_rate": 1.7826238710897164e-05, "loss": 0.585, "step": 378400 }, { "epoch": 8.717877866814037, "grad_norm": 3.556629180908203, "learning_rate": 1.781896714707465e-05, "loss": 0.5731, "step": 378600 }, { "epoch": 8.722483190568298, "grad_norm": 2.7652220726013184, "learning_rate": 1.7811695583252133e-05, "loss": 0.576, "step": 378800 }, { "epoch": 8.727088514322556, "grad_norm": 3.707284450531006, "learning_rate": 1.780442401942962e-05, "loss": 0.5766, "step": 379000 }, { "epoch": 8.731693838076817, "grad_norm": 2.5321168899536133, "learning_rate": 1.7797152455607105e-05, "loss": 0.5838, "step": 379200 }, { "epoch": 8.736299161831077, "grad_norm": 2.458303213119507, "learning_rate": 1.778988089178459e-05, "loss": 0.5953, "step": 379400 }, { "epoch": 8.740904485585336, "grad_norm": 3.712862253189087, "learning_rate": 1.778260932796207e-05, "loss": 0.5852, "step": 379600 }, { "epoch": 8.745509809339596, "grad_norm": 3.2254207134246826, "learning_rate": 1.7775337764139556e-05, "loss": 0.5876, "step": 379800 }, { "epoch": 8.750115133093857, "grad_norm": 2.612973690032959, "learning_rate": 1.776806620031704e-05, "loss": 0.5762, "step": 380000 }, { "epoch": 8.754720456848116, "grad_norm": 3.134018898010254, "learning_rate": 1.7760794636494524e-05, "loss": 0.573, "step": 380200 }, { "epoch": 8.759325780602376, "grad_norm": 3.392620801925659, "learning_rate": 1.7753559430491123e-05, "loss": 0.5713, "step": 380400 }, { "epoch": 8.763931104356637, "grad_norm": 2.867246627807617, "learning_rate": 1.774628786666861e-05, "loss": 0.5774, "step": 380600 }, { "epoch": 8.768536428110895, "grad_norm": 2.381554126739502, "learning_rate": 1.7739052660665204e-05, "loss": 0.5897, "step": 380800 }, { "epoch": 8.773141751865156, "grad_norm": 2.7664806842803955, "learning_rate": 1.773178109684269e-05, "loss": 0.5864, "step": 381000 }, { "epoch": 8.777747075619416, "grad_norm": 3.2918121814727783, "learning_rate": 1.7724509533020172e-05, "loss": 0.5768, "step": 381200 }, { "epoch": 8.782352399373677, "grad_norm": 3.4607903957366943, "learning_rate": 1.7717237969197655e-05, "loss": 0.5853, "step": 381400 }, { "epoch": 8.786957723127935, "grad_norm": 3.931750535964966, "learning_rate": 1.770996640537514e-05, "loss": 0.5778, "step": 381600 }, { "epoch": 8.791563046882196, "grad_norm": 2.7144854068756104, "learning_rate": 1.7702694841552623e-05, "loss": 0.5793, "step": 381800 }, { "epoch": 8.796168370636456, "grad_norm": 3.4606308937072754, "learning_rate": 1.769542327773011e-05, "loss": 0.5813, "step": 382000 }, { "epoch": 8.800773694390715, "grad_norm": 2.6973564624786377, "learning_rate": 1.7688151713907595e-05, "loss": 0.587, "step": 382200 }, { "epoch": 8.805379018144976, "grad_norm": 2.9456260204315186, "learning_rate": 1.768088015008508e-05, "loss": 0.5725, "step": 382400 }, { "epoch": 8.809984341899236, "grad_norm": 3.2708475589752197, "learning_rate": 1.767360858626256e-05, "loss": 0.5781, "step": 382600 }, { "epoch": 8.814589665653495, "grad_norm": 3.055715799331665, "learning_rate": 1.7666337022440046e-05, "loss": 0.5829, "step": 382800 }, { "epoch": 8.819194989407755, "grad_norm": 2.2745134830474854, "learning_rate": 1.7659065458617532e-05, "loss": 0.5836, "step": 383000 }, { "epoch": 8.823800313162016, "grad_norm": 2.850818157196045, "learning_rate": 1.7651793894795014e-05, "loss": 0.5843, "step": 383200 }, { "epoch": 8.828405636916274, "grad_norm": 3.06870436668396, "learning_rate": 1.76445223309725e-05, "loss": 0.5718, "step": 383400 }, { "epoch": 8.833010960670535, "grad_norm": 3.3040614128112793, "learning_rate": 1.7637250767149986e-05, "loss": 0.5822, "step": 383600 }, { "epoch": 8.837616284424795, "grad_norm": 2.4403414726257324, "learning_rate": 1.7629979203327465e-05, "loss": 0.5899, "step": 383800 }, { "epoch": 8.842221608179056, "grad_norm": 2.8603031635284424, "learning_rate": 1.762270763950495e-05, "loss": 0.5811, "step": 384000 }, { "epoch": 8.846826931933315, "grad_norm": 2.7087676525115967, "learning_rate": 1.7615436075682437e-05, "loss": 0.5904, "step": 384200 }, { "epoch": 8.851432255687575, "grad_norm": 2.9180452823638916, "learning_rate": 1.760816451185992e-05, "loss": 0.5789, "step": 384400 }, { "epoch": 8.856037579441836, "grad_norm": 3.6480114459991455, "learning_rate": 1.7600892948037406e-05, "loss": 0.5775, "step": 384600 }, { "epoch": 8.860642903196094, "grad_norm": 3.7235474586486816, "learning_rate": 1.7593657742034004e-05, "loss": 0.5756, "step": 384800 }, { "epoch": 8.865248226950355, "grad_norm": 2.3067126274108887, "learning_rate": 1.7586386178211487e-05, "loss": 0.5848, "step": 385000 }, { "epoch": 8.869853550704615, "grad_norm": 3.3306350708007812, "learning_rate": 1.7579114614388973e-05, "loss": 0.5807, "step": 385200 }, { "epoch": 8.874458874458874, "grad_norm": 3.0547428131103516, "learning_rate": 1.7571879408385568e-05, "loss": 0.5596, "step": 385400 }, { "epoch": 8.879064198213134, "grad_norm": 3.389777660369873, "learning_rate": 1.756460784456305e-05, "loss": 0.5688, "step": 385600 }, { "epoch": 8.883669521967395, "grad_norm": 3.0648694038391113, "learning_rate": 1.7557336280740536e-05, "loss": 0.5934, "step": 385800 }, { "epoch": 8.888274845721654, "grad_norm": 2.877584457397461, "learning_rate": 1.7550064716918022e-05, "loss": 0.5749, "step": 386000 }, { "epoch": 8.892880169475914, "grad_norm": 3.3926568031311035, "learning_rate": 1.7542793153095505e-05, "loss": 0.5716, "step": 386200 }, { "epoch": 8.897485493230175, "grad_norm": 2.8832592964172363, "learning_rate": 1.753552158927299e-05, "loss": 0.577, "step": 386400 }, { "epoch": 8.902090816984433, "grad_norm": 3.812521457672119, "learning_rate": 1.7528250025450477e-05, "loss": 0.5931, "step": 386600 }, { "epoch": 8.906696140738694, "grad_norm": 3.376232624053955, "learning_rate": 1.7520978461627956e-05, "loss": 0.5758, "step": 386800 }, { "epoch": 8.911301464492954, "grad_norm": 3.098276138305664, "learning_rate": 1.751370689780544e-05, "loss": 0.579, "step": 387000 }, { "epoch": 8.915906788247213, "grad_norm": 3.7928085327148438, "learning_rate": 1.7506435333982927e-05, "loss": 0.5814, "step": 387200 }, { "epoch": 8.920512112001473, "grad_norm": 2.6792008876800537, "learning_rate": 1.749916377016041e-05, "loss": 0.5924, "step": 387400 }, { "epoch": 8.925117435755734, "grad_norm": 3.9660255908966064, "learning_rate": 1.7491892206337896e-05, "loss": 0.5773, "step": 387600 }, { "epoch": 8.929722759509993, "grad_norm": 3.670153856277466, "learning_rate": 1.7484657000334495e-05, "loss": 0.5863, "step": 387800 }, { "epoch": 8.934328083264253, "grad_norm": 2.9507553577423096, "learning_rate": 1.7477385436511977e-05, "loss": 0.5849, "step": 388000 }, { "epoch": 8.938933407018514, "grad_norm": 3.5913052558898926, "learning_rate": 1.7470113872689463e-05, "loss": 0.5853, "step": 388200 }, { "epoch": 8.943538730772774, "grad_norm": 3.083280086517334, "learning_rate": 1.7462842308866945e-05, "loss": 0.5752, "step": 388400 }, { "epoch": 8.948144054527033, "grad_norm": 3.377192497253418, "learning_rate": 1.7455570745044428e-05, "loss": 0.5705, "step": 388600 }, { "epoch": 8.952749378281293, "grad_norm": 3.040677785873413, "learning_rate": 1.7448299181221914e-05, "loss": 0.5838, "step": 388800 }, { "epoch": 8.957354702035554, "grad_norm": 2.774031400680542, "learning_rate": 1.74410276173994e-05, "loss": 0.5814, "step": 389000 }, { "epoch": 8.961960025789812, "grad_norm": 3.28867244720459, "learning_rate": 1.7433756053576882e-05, "loss": 0.5915, "step": 389200 }, { "epoch": 8.966565349544073, "grad_norm": 2.8730006217956543, "learning_rate": 1.7426484489754368e-05, "loss": 0.579, "step": 389400 }, { "epoch": 8.971170673298333, "grad_norm": 3.185356855392456, "learning_rate": 1.741921292593185e-05, "loss": 0.5694, "step": 389600 }, { "epoch": 8.975775997052592, "grad_norm": 3.238297700881958, "learning_rate": 1.7411941362109333e-05, "loss": 0.5869, "step": 389800 }, { "epoch": 8.980381320806853, "grad_norm": 2.7840898036956787, "learning_rate": 1.740466979828682e-05, "loss": 0.5765, "step": 390000 }, { "epoch": 8.984986644561113, "grad_norm": 3.177186965942383, "learning_rate": 1.7397398234464305e-05, "loss": 0.587, "step": 390200 }, { "epoch": 8.989591968315372, "grad_norm": 2.8901917934417725, "learning_rate": 1.739012667064179e-05, "loss": 0.5862, "step": 390400 }, { "epoch": 8.994197292069632, "grad_norm": 2.768673896789551, "learning_rate": 1.7382855106819274e-05, "loss": 0.584, "step": 390600 }, { "epoch": 8.998802615823893, "grad_norm": 2.805500030517578, "learning_rate": 1.7375583542996756e-05, "loss": 0.5783, "step": 390800 }, { "epoch": 9.0, "eval_loss": 0.5523322224617004, "eval_runtime": 146.0681, "eval_samples_per_second": 194.163, "eval_steps_per_second": 12.138, "step": 390852 }, { "epoch": 9.003407939578153, "grad_norm": 4.481179714202881, "learning_rate": 1.736834833699335e-05, "loss": 0.5692, "step": 391000 }, { "epoch": 9.008013263332412, "grad_norm": 3.07963228225708, "learning_rate": 1.7361076773170837e-05, "loss": 0.5692, "step": 391200 }, { "epoch": 9.012618587086672, "grad_norm": 3.5437726974487305, "learning_rate": 1.7353805209348323e-05, "loss": 0.5569, "step": 391400 }, { "epoch": 9.017223910840933, "grad_norm": 4.162001609802246, "learning_rate": 1.7346533645525806e-05, "loss": 0.5681, "step": 391600 }, { "epoch": 9.021829234595192, "grad_norm": 3.081472873687744, "learning_rate": 1.733926208170329e-05, "loss": 0.5798, "step": 391800 }, { "epoch": 9.026434558349452, "grad_norm": 2.833381175994873, "learning_rate": 1.7331990517880778e-05, "loss": 0.5837, "step": 392000 }, { "epoch": 9.031039882103713, "grad_norm": 3.1221764087677, "learning_rate": 1.7324718954058263e-05, "loss": 0.5657, "step": 392200 }, { "epoch": 9.035645205857971, "grad_norm": 3.7719027996063232, "learning_rate": 1.7317447390235743e-05, "loss": 0.5752, "step": 392400 }, { "epoch": 9.040250529612232, "grad_norm": 3.361680507659912, "learning_rate": 1.731017582641323e-05, "loss": 0.5688, "step": 392600 }, { "epoch": 9.044855853366492, "grad_norm": 3.1694486141204834, "learning_rate": 1.7302904262590714e-05, "loss": 0.5802, "step": 392800 }, { "epoch": 9.049461177120751, "grad_norm": 3.455324649810791, "learning_rate": 1.7295632698768197e-05, "loss": 0.5688, "step": 393000 }, { "epoch": 9.054066500875011, "grad_norm": 3.718388795852661, "learning_rate": 1.7288361134945683e-05, "loss": 0.5823, "step": 393200 }, { "epoch": 9.058671824629272, "grad_norm": 3.018453359603882, "learning_rate": 1.728108957112317e-05, "loss": 0.5785, "step": 393400 }, { "epoch": 9.06327714838353, "grad_norm": 2.9057810306549072, "learning_rate": 1.7273818007300648e-05, "loss": 0.5837, "step": 393600 }, { "epoch": 9.067882472137791, "grad_norm": 3.081549644470215, "learning_rate": 1.7266546443478134e-05, "loss": 0.5735, "step": 393800 }, { "epoch": 9.072487795892052, "grad_norm": 3.056722640991211, "learning_rate": 1.725927487965562e-05, "loss": 0.5877, "step": 394000 }, { "epoch": 9.07709311964631, "grad_norm": 2.4788143634796143, "learning_rate": 1.7252003315833102e-05, "loss": 0.562, "step": 394200 }, { "epoch": 9.08169844340057, "grad_norm": 2.9332191944122314, "learning_rate": 1.7244731752010588e-05, "loss": 0.575, "step": 394400 }, { "epoch": 9.086303767154831, "grad_norm": 2.4606101512908936, "learning_rate": 1.7237460188188074e-05, "loss": 0.579, "step": 394600 }, { "epoch": 9.090909090909092, "grad_norm": 2.8705406188964844, "learning_rate": 1.7230188624365553e-05, "loss": 0.5667, "step": 394800 }, { "epoch": 9.09551441466335, "grad_norm": 3.100910186767578, "learning_rate": 1.722291706054304e-05, "loss": 0.5774, "step": 395000 }, { "epoch": 9.100119738417611, "grad_norm": 3.324007034301758, "learning_rate": 1.7215645496720525e-05, "loss": 0.579, "step": 395200 }, { "epoch": 9.104725062171871, "grad_norm": 3.0006039142608643, "learning_rate": 1.720837393289801e-05, "loss": 0.5762, "step": 395400 }, { "epoch": 9.10933038592613, "grad_norm": 3.3208751678466797, "learning_rate": 1.7201102369075493e-05, "loss": 0.5756, "step": 395600 }, { "epoch": 9.11393570968039, "grad_norm": 2.7627346515655518, "learning_rate": 1.7193867163072092e-05, "loss": 0.5721, "step": 395800 }, { "epoch": 9.118541033434651, "grad_norm": 2.2746336460113525, "learning_rate": 1.7186595599249575e-05, "loss": 0.5729, "step": 396000 }, { "epoch": 9.12314635718891, "grad_norm": 3.013868808746338, "learning_rate": 1.717932403542706e-05, "loss": 0.5684, "step": 396200 }, { "epoch": 9.12775168094317, "grad_norm": 3.658040761947632, "learning_rate": 1.7172052471604543e-05, "loss": 0.5754, "step": 396400 }, { "epoch": 9.13235700469743, "grad_norm": 3.7302823066711426, "learning_rate": 1.7164780907782026e-05, "loss": 0.5768, "step": 396600 }, { "epoch": 9.13696232845169, "grad_norm": 3.326700448989868, "learning_rate": 1.715750934395951e-05, "loss": 0.5704, "step": 396800 }, { "epoch": 9.14156765220595, "grad_norm": 2.6242971420288086, "learning_rate": 1.7150237780136997e-05, "loss": 0.5749, "step": 397000 }, { "epoch": 9.14617297596021, "grad_norm": 3.557640552520752, "learning_rate": 1.7142966216314483e-05, "loss": 0.5707, "step": 397200 }, { "epoch": 9.150778299714469, "grad_norm": 4.010790824890137, "learning_rate": 1.7135694652491966e-05, "loss": 0.5678, "step": 397400 }, { "epoch": 9.15538362346873, "grad_norm": 2.8452792167663574, "learning_rate": 1.7128459446488564e-05, "loss": 0.5773, "step": 397600 }, { "epoch": 9.15998894722299, "grad_norm": 3.276362895965576, "learning_rate": 1.7121187882666044e-05, "loss": 0.5768, "step": 397800 }, { "epoch": 9.16459427097725, "grad_norm": 2.479546546936035, "learning_rate": 1.711391631884353e-05, "loss": 0.5864, "step": 398000 }, { "epoch": 9.16919959473151, "grad_norm": 2.4583699703216553, "learning_rate": 1.7106644755021015e-05, "loss": 0.5769, "step": 398200 }, { "epoch": 9.17380491848577, "grad_norm": 3.225541830062866, "learning_rate": 1.70993731911985e-05, "loss": 0.5712, "step": 398400 }, { "epoch": 9.17841024224003, "grad_norm": 2.6798887252807617, "learning_rate": 1.7092101627375984e-05, "loss": 0.5776, "step": 398600 }, { "epoch": 9.183015565994289, "grad_norm": 4.995772838592529, "learning_rate": 1.708483006355347e-05, "loss": 0.5714, "step": 398800 }, { "epoch": 9.18762088974855, "grad_norm": 3.2529380321502686, "learning_rate": 1.7077558499730956e-05, "loss": 0.5678, "step": 399000 }, { "epoch": 9.19222621350281, "grad_norm": 2.858065366744995, "learning_rate": 1.7070286935908435e-05, "loss": 0.5693, "step": 399200 }, { "epoch": 9.196831537257069, "grad_norm": 2.9561386108398438, "learning_rate": 1.706301537208592e-05, "loss": 0.5748, "step": 399400 }, { "epoch": 9.201436861011329, "grad_norm": 3.378155469894409, "learning_rate": 1.7055743808263407e-05, "loss": 0.5608, "step": 399600 }, { "epoch": 9.20604218476559, "grad_norm": 3.1857070922851562, "learning_rate": 1.704847224444089e-05, "loss": 0.5818, "step": 399800 }, { "epoch": 9.210647508519848, "grad_norm": 4.287872791290283, "learning_rate": 1.7041200680618375e-05, "loss": 0.5751, "step": 400000 }, { "epoch": 9.215252832274109, "grad_norm": 3.0986175537109375, "learning_rate": 1.703392911679586e-05, "loss": 0.5718, "step": 400200 }, { "epoch": 9.21985815602837, "grad_norm": 3.245264768600464, "learning_rate": 1.702665755297334e-05, "loss": 0.5748, "step": 400400 }, { "epoch": 9.224463479782628, "grad_norm": 3.5556859970092773, "learning_rate": 1.7019385989150826e-05, "loss": 0.5736, "step": 400600 }, { "epoch": 9.229068803536888, "grad_norm": 3.7929844856262207, "learning_rate": 1.7012114425328312e-05, "loss": 0.5812, "step": 400800 }, { "epoch": 9.233674127291149, "grad_norm": 3.4577155113220215, "learning_rate": 1.7004879219324907e-05, "loss": 0.5725, "step": 401000 }, { "epoch": 9.238279451045408, "grad_norm": 3.2667465209960938, "learning_rate": 1.6997607655502393e-05, "loss": 0.5845, "step": 401200 }, { "epoch": 9.242884774799668, "grad_norm": 3.0420265197753906, "learning_rate": 1.699033609167988e-05, "loss": 0.5881, "step": 401400 }, { "epoch": 9.247490098553929, "grad_norm": 2.9382212162017822, "learning_rate": 1.698306452785736e-05, "loss": 0.5904, "step": 401600 }, { "epoch": 9.252095422308189, "grad_norm": 3.358121395111084, "learning_rate": 1.6975792964034844e-05, "loss": 0.5678, "step": 401800 }, { "epoch": 9.256700746062448, "grad_norm": 6.984818458557129, "learning_rate": 1.696852140021233e-05, "loss": 0.5678, "step": 402000 }, { "epoch": 9.261306069816708, "grad_norm": 2.786919355392456, "learning_rate": 1.6961249836389812e-05, "loss": 0.5801, "step": 402200 }, { "epoch": 9.265911393570969, "grad_norm": 2.3352646827697754, "learning_rate": 1.69539782725673e-05, "loss": 0.5722, "step": 402400 }, { "epoch": 9.270516717325227, "grad_norm": 3.7076637744903564, "learning_rate": 1.6946706708744784e-05, "loss": 0.5773, "step": 402600 }, { "epoch": 9.275122041079488, "grad_norm": 2.3307411670684814, "learning_rate": 1.6939435144922267e-05, "loss": 0.559, "step": 402800 }, { "epoch": 9.279727364833748, "grad_norm": 3.8850185871124268, "learning_rate": 1.6932163581099753e-05, "loss": 0.5656, "step": 403000 }, { "epoch": 9.284332688588007, "grad_norm": 3.246624231338501, "learning_rate": 1.6924892017277235e-05, "loss": 0.5893, "step": 403200 }, { "epoch": 9.288938012342268, "grad_norm": 2.8703010082244873, "learning_rate": 1.691762045345472e-05, "loss": 0.5618, "step": 403400 }, { "epoch": 9.293543336096528, "grad_norm": 3.0698583126068115, "learning_rate": 1.6910385247451316e-05, "loss": 0.5576, "step": 403600 }, { "epoch": 9.298148659850787, "grad_norm": 2.5422704219818115, "learning_rate": 1.6903113683628802e-05, "loss": 0.5816, "step": 403800 }, { "epoch": 9.302753983605047, "grad_norm": 3.1674082279205322, "learning_rate": 1.6895842119806285e-05, "loss": 0.586, "step": 404000 }, { "epoch": 9.307359307359308, "grad_norm": 3.551118850708008, "learning_rate": 1.688857055598377e-05, "loss": 0.5837, "step": 404200 }, { "epoch": 9.311964631113566, "grad_norm": 2.9174249172210693, "learning_rate": 1.6881298992161257e-05, "loss": 0.5796, "step": 404400 }, { "epoch": 9.316569954867827, "grad_norm": 3.165064573287964, "learning_rate": 1.6874027428338736e-05, "loss": 0.5811, "step": 404600 }, { "epoch": 9.321175278622087, "grad_norm": 3.249115228652954, "learning_rate": 1.6866755864516222e-05, "loss": 0.5701, "step": 404800 }, { "epoch": 9.325780602376348, "grad_norm": 3.103433847427368, "learning_rate": 1.6859484300693708e-05, "loss": 0.5762, "step": 405000 }, { "epoch": 9.330385926130607, "grad_norm": 3.612250328063965, "learning_rate": 1.6852212736871194e-05, "loss": 0.5697, "step": 405200 }, { "epoch": 9.334991249884867, "grad_norm": 4.119315147399902, "learning_rate": 1.6844941173048676e-05, "loss": 0.5787, "step": 405400 }, { "epoch": 9.339596573639128, "grad_norm": 4.298466205596924, "learning_rate": 1.6837669609226162e-05, "loss": 0.563, "step": 405600 }, { "epoch": 9.344201897393386, "grad_norm": 2.9533872604370117, "learning_rate": 1.6830398045403648e-05, "loss": 0.5874, "step": 405800 }, { "epoch": 9.348807221147647, "grad_norm": 3.106743812561035, "learning_rate": 1.6823126481581127e-05, "loss": 0.5742, "step": 406000 }, { "epoch": 9.353412544901907, "grad_norm": 2.491485118865967, "learning_rate": 1.6815854917758613e-05, "loss": 0.5726, "step": 406200 }, { "epoch": 9.358017868656166, "grad_norm": 3.787768602371216, "learning_rate": 1.680861971175521e-05, "loss": 0.5682, "step": 406400 }, { "epoch": 9.362623192410426, "grad_norm": 3.0979113578796387, "learning_rate": 1.6801348147932694e-05, "loss": 0.5698, "step": 406600 }, { "epoch": 9.367228516164687, "grad_norm": 2.7929654121398926, "learning_rate": 1.679407658411018e-05, "loss": 0.5644, "step": 406800 }, { "epoch": 9.371833839918946, "grad_norm": 2.568842887878418, "learning_rate": 1.6786805020287666e-05, "loss": 0.5827, "step": 407000 }, { "epoch": 9.376439163673206, "grad_norm": 3.327233076095581, "learning_rate": 1.677953345646515e-05, "loss": 0.5885, "step": 407200 }, { "epoch": 9.381044487427467, "grad_norm": 2.4469523429870605, "learning_rate": 1.677226189264263e-05, "loss": 0.5692, "step": 407400 }, { "epoch": 9.385649811181725, "grad_norm": 2.9710397720336914, "learning_rate": 1.6764990328820117e-05, "loss": 0.5651, "step": 407600 }, { "epoch": 9.390255134935986, "grad_norm": 3.184701442718506, "learning_rate": 1.67577187649976e-05, "loss": 0.5642, "step": 407800 }, { "epoch": 9.394860458690246, "grad_norm": 2.20974063873291, "learning_rate": 1.6750447201175085e-05, "loss": 0.5758, "step": 408000 }, { "epoch": 9.399465782444507, "grad_norm": 2.655507802963257, "learning_rate": 1.674317563735257e-05, "loss": 0.5758, "step": 408200 }, { "epoch": 9.404071106198765, "grad_norm": 2.7525951862335205, "learning_rate": 1.6735904073530054e-05, "loss": 0.5656, "step": 408400 }, { "epoch": 9.408676429953026, "grad_norm": 3.151881694793701, "learning_rate": 1.6728632509707536e-05, "loss": 0.5701, "step": 408600 }, { "epoch": 9.413281753707286, "grad_norm": 2.949288845062256, "learning_rate": 1.6721360945885022e-05, "loss": 0.5764, "step": 408800 }, { "epoch": 9.417887077461545, "grad_norm": 3.4469354152679443, "learning_rate": 1.6714089382062505e-05, "loss": 0.577, "step": 409000 }, { "epoch": 9.422492401215806, "grad_norm": 3.0155844688415527, "learning_rate": 1.670681781823999e-05, "loss": 0.5771, "step": 409200 }, { "epoch": 9.427097724970066, "grad_norm": 3.070678472518921, "learning_rate": 1.669958261223659e-05, "loss": 0.5845, "step": 409400 }, { "epoch": 9.431703048724325, "grad_norm": 2.2283005714416504, "learning_rate": 1.6692311048414072e-05, "loss": 0.5636, "step": 409600 }, { "epoch": 9.436308372478585, "grad_norm": 2.9284727573394775, "learning_rate": 1.6685039484591558e-05, "loss": 0.581, "step": 409800 }, { "epoch": 9.440913696232846, "grad_norm": 3.039658308029175, "learning_rate": 1.6677767920769044e-05, "loss": 0.568, "step": 410000 }, { "epoch": 9.445519019987104, "grad_norm": 2.3291382789611816, "learning_rate": 1.6670496356946523e-05, "loss": 0.5628, "step": 410200 }, { "epoch": 9.450124343741365, "grad_norm": 2.804224967956543, "learning_rate": 1.666322479312401e-05, "loss": 0.5728, "step": 410400 }, { "epoch": 9.454729667495625, "grad_norm": 3.0607471466064453, "learning_rate": 1.6655989587120607e-05, "loss": 0.5655, "step": 410600 }, { "epoch": 9.459334991249884, "grad_norm": 2.556657075881958, "learning_rate": 1.664871802329809e-05, "loss": 0.5723, "step": 410800 }, { "epoch": 9.463940315004145, "grad_norm": 2.7533397674560547, "learning_rate": 1.6641446459475576e-05, "loss": 0.574, "step": 411000 }, { "epoch": 9.468545638758405, "grad_norm": 2.0265374183654785, "learning_rate": 1.663417489565306e-05, "loss": 0.5751, "step": 411200 }, { "epoch": 9.473150962512666, "grad_norm": 3.6705782413482666, "learning_rate": 1.6626903331830544e-05, "loss": 0.5787, "step": 411400 }, { "epoch": 9.477756286266924, "grad_norm": 2.9727859497070312, "learning_rate": 1.6619631768008027e-05, "loss": 0.5658, "step": 411600 }, { "epoch": 9.482361610021185, "grad_norm": 2.764256477355957, "learning_rate": 1.6612360204185513e-05, "loss": 0.5674, "step": 411800 }, { "epoch": 9.486966933775445, "grad_norm": 4.042858123779297, "learning_rate": 1.6605088640362995e-05, "loss": 0.5726, "step": 412000 }, { "epoch": 9.491572257529704, "grad_norm": 3.3755908012390137, "learning_rate": 1.659781707654048e-05, "loss": 0.5881, "step": 412200 }, { "epoch": 9.496177581283964, "grad_norm": 3.193547487258911, "learning_rate": 1.6590545512717967e-05, "loss": 0.5649, "step": 412400 }, { "epoch": 9.500782905038225, "grad_norm": 3.653513193130493, "learning_rate": 1.658327394889545e-05, "loss": 0.5807, "step": 412600 }, { "epoch": 9.505388228792484, "grad_norm": 2.6081292629241943, "learning_rate": 1.6576002385072935e-05, "loss": 0.5756, "step": 412800 }, { "epoch": 9.509993552546744, "grad_norm": 3.2407031059265137, "learning_rate": 1.6568730821250418e-05, "loss": 0.5617, "step": 413000 }, { "epoch": 9.514598876301005, "grad_norm": 3.620607614517212, "learning_rate": 1.6561459257427904e-05, "loss": 0.5697, "step": 413200 }, { "epoch": 9.519204200055263, "grad_norm": 3.521552085876465, "learning_rate": 1.6554187693605386e-05, "loss": 0.5804, "step": 413400 }, { "epoch": 9.523809523809524, "grad_norm": 3.074605941772461, "learning_rate": 1.6546916129782872e-05, "loss": 0.5629, "step": 413600 }, { "epoch": 9.528414847563784, "grad_norm": 3.518260955810547, "learning_rate": 1.6539644565960358e-05, "loss": 0.5772, "step": 413800 }, { "epoch": 9.533020171318043, "grad_norm": 4.271399974822998, "learning_rate": 1.653237300213784e-05, "loss": 0.5713, "step": 414000 }, { "epoch": 9.537625495072303, "grad_norm": 2.3440682888031006, "learning_rate": 1.6525101438315323e-05, "loss": 0.5715, "step": 414200 }, { "epoch": 9.542230818826564, "grad_norm": 3.0813117027282715, "learning_rate": 1.651782987449281e-05, "loss": 0.5761, "step": 414400 }, { "epoch": 9.546836142580823, "grad_norm": 3.250023126602173, "learning_rate": 1.651055831067029e-05, "loss": 0.5737, "step": 414600 }, { "epoch": 9.551441466335083, "grad_norm": 3.33170485496521, "learning_rate": 1.6503286746847778e-05, "loss": 0.5758, "step": 414800 }, { "epoch": 9.556046790089344, "grad_norm": 3.526592254638672, "learning_rate": 1.6496015183025263e-05, "loss": 0.5632, "step": 415000 }, { "epoch": 9.560652113843604, "grad_norm": 3.5739214420318604, "learning_rate": 1.6488743619202746e-05, "loss": 0.5862, "step": 415200 }, { "epoch": 9.565257437597863, "grad_norm": 3.0339114665985107, "learning_rate": 1.648147205538023e-05, "loss": 0.5733, "step": 415400 }, { "epoch": 9.569862761352123, "grad_norm": 2.7391164302825928, "learning_rate": 1.6474200491557714e-05, "loss": 0.5671, "step": 415600 }, { "epoch": 9.574468085106384, "grad_norm": 2.494318962097168, "learning_rate": 1.64669289277352e-05, "loss": 0.586, "step": 415800 }, { "epoch": 9.579073408860642, "grad_norm": 2.9275710582733154, "learning_rate": 1.6459657363912683e-05, "loss": 0.5868, "step": 416000 }, { "epoch": 9.583678732614903, "grad_norm": 2.980813980102539, "learning_rate": 1.645238580009017e-05, "loss": 0.5784, "step": 416200 }, { "epoch": 9.588284056369163, "grad_norm": 2.879387617111206, "learning_rate": 1.6445114236267655e-05, "loss": 0.5534, "step": 416400 }, { "epoch": 9.592889380123422, "grad_norm": 2.6000664234161377, "learning_rate": 1.643787903026425e-05, "loss": 0.5756, "step": 416600 }, { "epoch": 9.597494703877683, "grad_norm": 2.941392660140991, "learning_rate": 1.6430607466441736e-05, "loss": 0.5787, "step": 416800 }, { "epoch": 9.602100027631943, "grad_norm": 3.4506592750549316, "learning_rate": 1.6423335902619215e-05, "loss": 0.5694, "step": 417000 }, { "epoch": 9.606705351386202, "grad_norm": 3.014477014541626, "learning_rate": 1.64160643387967e-05, "loss": 0.5785, "step": 417200 }, { "epoch": 9.611310675140462, "grad_norm": 3.4871177673339844, "learning_rate": 1.6408792774974187e-05, "loss": 0.58, "step": 417400 }, { "epoch": 9.615915998894723, "grad_norm": 3.0921521186828613, "learning_rate": 1.6401521211151673e-05, "loss": 0.5571, "step": 417600 }, { "epoch": 9.620521322648981, "grad_norm": 3.4543838500976562, "learning_rate": 1.6394249647329155e-05, "loss": 0.5689, "step": 417800 }, { "epoch": 9.625126646403242, "grad_norm": 3.799140453338623, "learning_rate": 1.638697808350664e-05, "loss": 0.5898, "step": 418000 }, { "epoch": 9.629731970157502, "grad_norm": 2.701723575592041, "learning_rate": 1.6379706519684124e-05, "loss": 0.584, "step": 418200 }, { "epoch": 9.634337293911763, "grad_norm": 3.6430578231811523, "learning_rate": 1.6372434955861606e-05, "loss": 0.5649, "step": 418400 }, { "epoch": 9.638942617666022, "grad_norm": 3.381641387939453, "learning_rate": 1.6365163392039092e-05, "loss": 0.5755, "step": 418600 }, { "epoch": 9.643547941420282, "grad_norm": 3.1972997188568115, "learning_rate": 1.6357891828216578e-05, "loss": 0.5767, "step": 418800 }, { "epoch": 9.648153265174543, "grad_norm": 2.762204647064209, "learning_rate": 1.635062026439406e-05, "loss": 0.5659, "step": 419000 }, { "epoch": 9.652758588928801, "grad_norm": 2.9068100452423096, "learning_rate": 1.6343348700571546e-05, "loss": 0.5822, "step": 419200 }, { "epoch": 9.657363912683062, "grad_norm": 3.140155553817749, "learning_rate": 1.633607713674903e-05, "loss": 0.5617, "step": 419400 }, { "epoch": 9.661969236437322, "grad_norm": 3.393486738204956, "learning_rate": 1.632880557292651e-05, "loss": 0.59, "step": 419600 }, { "epoch": 9.666574560191581, "grad_norm": 3.3374438285827637, "learning_rate": 1.6321534009103997e-05, "loss": 0.5863, "step": 419800 }, { "epoch": 9.671179883945841, "grad_norm": 2.8088021278381348, "learning_rate": 1.6314262445281483e-05, "loss": 0.5748, "step": 420000 }, { "epoch": 9.675785207700102, "grad_norm": 3.9555068016052246, "learning_rate": 1.630702723927808e-05, "loss": 0.5809, "step": 420200 }, { "epoch": 9.68039053145436, "grad_norm": 3.5197412967681885, "learning_rate": 1.6299755675455564e-05, "loss": 0.5796, "step": 420400 }, { "epoch": 9.684995855208621, "grad_norm": 2.782156229019165, "learning_rate": 1.629248411163305e-05, "loss": 0.5738, "step": 420600 }, { "epoch": 9.689601178962882, "grad_norm": 3.482156276702881, "learning_rate": 1.6285212547810533e-05, "loss": 0.5766, "step": 420800 }, { "epoch": 9.69420650271714, "grad_norm": 2.7525062561035156, "learning_rate": 1.6277940983988015e-05, "loss": 0.5774, "step": 421000 }, { "epoch": 9.6988118264714, "grad_norm": 3.4421098232269287, "learning_rate": 1.62706694201655e-05, "loss": 0.5709, "step": 421200 }, { "epoch": 9.703417150225661, "grad_norm": 2.6279489994049072, "learning_rate": 1.6263397856342984e-05, "loss": 0.5647, "step": 421400 }, { "epoch": 9.70802247397992, "grad_norm": 2.9035749435424805, "learning_rate": 1.625612629252047e-05, "loss": 0.5715, "step": 421600 }, { "epoch": 9.71262779773418, "grad_norm": 3.324402093887329, "learning_rate": 1.6248854728697956e-05, "loss": 0.5828, "step": 421800 }, { "epoch": 9.71723312148844, "grad_norm": 3.438358783721924, "learning_rate": 1.6241583164875438e-05, "loss": 0.5718, "step": 422000 }, { "epoch": 9.721838445242701, "grad_norm": 3.8631820678710938, "learning_rate": 1.623431160105292e-05, "loss": 0.5787, "step": 422200 }, { "epoch": 9.72644376899696, "grad_norm": 3.755342483520508, "learning_rate": 1.6227040037230407e-05, "loss": 0.5799, "step": 422400 }, { "epoch": 9.73104909275122, "grad_norm": 4.419692039489746, "learning_rate": 1.6219768473407893e-05, "loss": 0.572, "step": 422600 }, { "epoch": 9.735654416505481, "grad_norm": 3.5479319095611572, "learning_rate": 1.6212496909585375e-05, "loss": 0.5864, "step": 422800 }, { "epoch": 9.74025974025974, "grad_norm": 2.9561350345611572, "learning_rate": 1.620522534576286e-05, "loss": 0.5806, "step": 423000 }, { "epoch": 9.744865064014, "grad_norm": 2.8360917568206787, "learning_rate": 1.6197953781940347e-05, "loss": 0.5824, "step": 423200 }, { "epoch": 9.74947038776826, "grad_norm": 3.090214967727661, "learning_rate": 1.619068221811783e-05, "loss": 0.5732, "step": 423400 }, { "epoch": 9.75407571152252, "grad_norm": 3.477980613708496, "learning_rate": 1.6183447012114428e-05, "loss": 0.5688, "step": 423600 }, { "epoch": 9.75868103527678, "grad_norm": 2.7006947994232178, "learning_rate": 1.617617544829191e-05, "loss": 0.5744, "step": 423800 }, { "epoch": 9.76328635903104, "grad_norm": 3.080148696899414, "learning_rate": 1.6168903884469393e-05, "loss": 0.5751, "step": 424000 }, { "epoch": 9.767891682785299, "grad_norm": 3.5952165126800537, "learning_rate": 1.616163232064688e-05, "loss": 0.5706, "step": 424200 }, { "epoch": 9.77249700653956, "grad_norm": 3.141179323196411, "learning_rate": 1.6154360756824365e-05, "loss": 0.5758, "step": 424400 }, { "epoch": 9.77710233029382, "grad_norm": 2.961484909057617, "learning_rate": 1.6147089193001847e-05, "loss": 0.5813, "step": 424600 }, { "epoch": 9.78170765404808, "grad_norm": 3.8218839168548584, "learning_rate": 1.6139817629179333e-05, "loss": 0.5677, "step": 424800 }, { "epoch": 9.78631297780234, "grad_norm": 2.3090195655822754, "learning_rate": 1.6132546065356816e-05, "loss": 0.5806, "step": 425000 }, { "epoch": 9.7909183015566, "grad_norm": 2.9039793014526367, "learning_rate": 1.61252745015343e-05, "loss": 0.5748, "step": 425200 }, { "epoch": 9.79552362531086, "grad_norm": 3.138223171234131, "learning_rate": 1.6118002937711784e-05, "loss": 0.5711, "step": 425400 }, { "epoch": 9.800128949065119, "grad_norm": 3.0506985187530518, "learning_rate": 1.611073137388927e-05, "loss": 0.5705, "step": 425600 }, { "epoch": 9.80473427281938, "grad_norm": 2.8752284049987793, "learning_rate": 1.6103496167885865e-05, "loss": 0.5763, "step": 425800 }, { "epoch": 9.80933959657364, "grad_norm": 3.2525858879089355, "learning_rate": 1.609622460406335e-05, "loss": 0.5703, "step": 426000 }, { "epoch": 9.813944920327899, "grad_norm": 3.1471686363220215, "learning_rate": 1.6088953040240837e-05, "loss": 0.5665, "step": 426200 }, { "epoch": 9.818550244082159, "grad_norm": 2.207984685897827, "learning_rate": 1.6081681476418316e-05, "loss": 0.5853, "step": 426400 }, { "epoch": 9.82315556783642, "grad_norm": 3.0319111347198486, "learning_rate": 1.6074409912595802e-05, "loss": 0.5681, "step": 426600 }, { "epoch": 9.827760891590678, "grad_norm": 2.59087872505188, "learning_rate": 1.6067138348773288e-05, "loss": 0.5699, "step": 426800 }, { "epoch": 9.832366215344939, "grad_norm": 2.8990542888641357, "learning_rate": 1.605986678495077e-05, "loss": 0.5683, "step": 427000 }, { "epoch": 9.8369715390992, "grad_norm": 2.702047109603882, "learning_rate": 1.6052595221128257e-05, "loss": 0.5793, "step": 427200 }, { "epoch": 9.841576862853458, "grad_norm": 3.4502487182617188, "learning_rate": 1.6045323657305743e-05, "loss": 0.5725, "step": 427400 }, { "epoch": 9.846182186607718, "grad_norm": 2.59299635887146, "learning_rate": 1.6038052093483225e-05, "loss": 0.5812, "step": 427600 }, { "epoch": 9.850787510361979, "grad_norm": 3.2523722648620605, "learning_rate": 1.6030780529660708e-05, "loss": 0.5743, "step": 427800 }, { "epoch": 9.855392834116238, "grad_norm": 2.8594510555267334, "learning_rate": 1.6023508965838194e-05, "loss": 0.5623, "step": 428000 }, { "epoch": 9.859998157870498, "grad_norm": 3.209397792816162, "learning_rate": 1.6016237402015676e-05, "loss": 0.5766, "step": 428200 }, { "epoch": 9.864603481624759, "grad_norm": 3.1015477180480957, "learning_rate": 1.6008965838193162e-05, "loss": 0.5793, "step": 428400 }, { "epoch": 9.869208805379017, "grad_norm": 2.951237440109253, "learning_rate": 1.600173063218976e-05, "loss": 0.5745, "step": 428600 }, { "epoch": 9.873814129133278, "grad_norm": 3.060861825942993, "learning_rate": 1.5994459068367243e-05, "loss": 0.58, "step": 428800 }, { "epoch": 9.878419452887538, "grad_norm": 2.895648956298828, "learning_rate": 1.598718750454473e-05, "loss": 0.5711, "step": 429000 }, { "epoch": 9.883024776641799, "grad_norm": 3.3986594676971436, "learning_rate": 1.597991594072221e-05, "loss": 0.5773, "step": 429200 }, { "epoch": 9.887630100396057, "grad_norm": 3.4484481811523438, "learning_rate": 1.5972644376899694e-05, "loss": 0.5774, "step": 429400 }, { "epoch": 9.892235424150318, "grad_norm": 3.4718239307403564, "learning_rate": 1.596537281307718e-05, "loss": 0.577, "step": 429600 }, { "epoch": 9.896840747904578, "grad_norm": 2.872313976287842, "learning_rate": 1.5958101249254666e-05, "loss": 0.5752, "step": 429800 }, { "epoch": 9.901446071658837, "grad_norm": 3.031860589981079, "learning_rate": 1.595082968543215e-05, "loss": 0.5822, "step": 430000 }, { "epoch": 9.906051395413098, "grad_norm": 3.6577560901641846, "learning_rate": 1.5943558121609634e-05, "loss": 0.5665, "step": 430200 }, { "epoch": 9.910656719167358, "grad_norm": 2.5794246196746826, "learning_rate": 1.593628655778712e-05, "loss": 0.584, "step": 430400 }, { "epoch": 9.915262042921617, "grad_norm": 3.3950586318969727, "learning_rate": 1.5929014993964603e-05, "loss": 0.5548, "step": 430600 }, { "epoch": 9.919867366675877, "grad_norm": 2.942765474319458, "learning_rate": 1.5921743430142085e-05, "loss": 0.5763, "step": 430800 }, { "epoch": 9.924472690430138, "grad_norm": 2.925250291824341, "learning_rate": 1.591447186631957e-05, "loss": 0.5572, "step": 431000 }, { "epoch": 9.929078014184396, "grad_norm": 2.779973030090332, "learning_rate": 1.5907200302497057e-05, "loss": 0.5651, "step": 431200 }, { "epoch": 9.933683337938657, "grad_norm": 2.9650769233703613, "learning_rate": 1.589992873867454e-05, "loss": 0.5765, "step": 431400 }, { "epoch": 9.938288661692917, "grad_norm": 2.562681198120117, "learning_rate": 1.5892657174852026e-05, "loss": 0.5653, "step": 431600 }, { "epoch": 9.942893985447178, "grad_norm": 3.5627505779266357, "learning_rate": 1.5885385611029508e-05, "loss": 0.5617, "step": 431800 }, { "epoch": 9.947499309201437, "grad_norm": 3.4282848834991455, "learning_rate": 1.5878150405026103e-05, "loss": 0.5673, "step": 432000 }, { "epoch": 9.952104632955697, "grad_norm": 3.556774139404297, "learning_rate": 1.587087884120359e-05, "loss": 0.5806, "step": 432200 }, { "epoch": 9.956709956709958, "grad_norm": 3.773305654525757, "learning_rate": 1.5863607277381075e-05, "loss": 0.5731, "step": 432400 }, { "epoch": 9.961315280464216, "grad_norm": 2.997891902923584, "learning_rate": 1.5856335713558558e-05, "loss": 0.5713, "step": 432600 }, { "epoch": 9.965920604218477, "grad_norm": 3.1296310424804688, "learning_rate": 1.5849064149736044e-05, "loss": 0.5618, "step": 432800 }, { "epoch": 9.970525927972737, "grad_norm": 3.2844841480255127, "learning_rate": 1.584179258591353e-05, "loss": 0.5774, "step": 433000 }, { "epoch": 9.975131251726996, "grad_norm": 2.3057990074157715, "learning_rate": 1.583452102209101e-05, "loss": 0.5783, "step": 433200 }, { "epoch": 9.979736575481256, "grad_norm": 2.7492430210113525, "learning_rate": 1.5827249458268495e-05, "loss": 0.5757, "step": 433400 }, { "epoch": 9.984341899235517, "grad_norm": 2.937319278717041, "learning_rate": 1.581997789444598e-05, "loss": 0.5814, "step": 433600 }, { "epoch": 9.988947222989776, "grad_norm": 2.906705141067505, "learning_rate": 1.5812706330623463e-05, "loss": 0.5783, "step": 433800 }, { "epoch": 9.993552546744036, "grad_norm": 2.5989561080932617, "learning_rate": 1.580543476680095e-05, "loss": 0.5865, "step": 434000 }, { "epoch": 9.998157870498297, "grad_norm": 3.9318482875823975, "learning_rate": 1.5798163202978435e-05, "loss": 0.5763, "step": 434200 }, { "epoch": 10.0, "eval_loss": 0.5503791570663452, "eval_runtime": 162.7661, "eval_samples_per_second": 174.244, "eval_steps_per_second": 10.893, "step": 434280 }, { "epoch": 10.002763194252555, "grad_norm": 3.2857744693756104, "learning_rate": 1.579092799697503e-05, "loss": 0.584, "step": 434400 }, { "epoch": 10.007368518006816, "grad_norm": 2.8815836906433105, "learning_rate": 1.578369279097163e-05, "loss": 0.5577, "step": 434600 }, { "epoch": 10.011973841761076, "grad_norm": 2.8259472846984863, "learning_rate": 1.577642122714911e-05, "loss": 0.5706, "step": 434800 }, { "epoch": 10.016579165515335, "grad_norm": 2.6986682415008545, "learning_rate": 1.5769149663326594e-05, "loss": 0.5662, "step": 435000 }, { "epoch": 10.021184489269595, "grad_norm": 3.565535545349121, "learning_rate": 1.576187809950408e-05, "loss": 0.5678, "step": 435200 }, { "epoch": 10.025789813023856, "grad_norm": 2.950361967086792, "learning_rate": 1.5754606535681565e-05, "loss": 0.5743, "step": 435400 }, { "epoch": 10.030395136778116, "grad_norm": 2.67488694190979, "learning_rate": 1.5747334971859048e-05, "loss": 0.5798, "step": 435600 }, { "epoch": 10.035000460532375, "grad_norm": 2.76383113861084, "learning_rate": 1.5740063408036534e-05, "loss": 0.5616, "step": 435800 }, { "epoch": 10.039605784286636, "grad_norm": 2.7756881713867188, "learning_rate": 1.573279184421402e-05, "loss": 0.582, "step": 436000 }, { "epoch": 10.044211108040896, "grad_norm": 3.5233657360076904, "learning_rate": 1.57255202803915e-05, "loss": 0.5697, "step": 436200 }, { "epoch": 10.048816431795155, "grad_norm": 2.4349958896636963, "learning_rate": 1.5718248716568985e-05, "loss": 0.565, "step": 436400 }, { "epoch": 10.053421755549415, "grad_norm": 2.8675363063812256, "learning_rate": 1.571097715274647e-05, "loss": 0.5669, "step": 436600 }, { "epoch": 10.058027079303676, "grad_norm": 2.97993540763855, "learning_rate": 1.5703705588923953e-05, "loss": 0.5759, "step": 436800 }, { "epoch": 10.062632403057934, "grad_norm": 3.144256353378296, "learning_rate": 1.569643402510144e-05, "loss": 0.5606, "step": 437000 }, { "epoch": 10.067237726812195, "grad_norm": 2.9724130630493164, "learning_rate": 1.5689162461278925e-05, "loss": 0.5811, "step": 437200 }, { "epoch": 10.071843050566455, "grad_norm": 3.0126194953918457, "learning_rate": 1.5681890897456408e-05, "loss": 0.5646, "step": 437400 }, { "epoch": 10.076448374320714, "grad_norm": 2.4599967002868652, "learning_rate": 1.567461933363389e-05, "loss": 0.582, "step": 437600 }, { "epoch": 10.081053698074975, "grad_norm": 2.899773120880127, "learning_rate": 1.5667347769811376e-05, "loss": 0.5706, "step": 437800 }, { "epoch": 10.085659021829235, "grad_norm": 2.810063600540161, "learning_rate": 1.566007620598886e-05, "loss": 0.5678, "step": 438000 }, { "epoch": 10.090264345583494, "grad_norm": 2.5231738090515137, "learning_rate": 1.5652804642166345e-05, "loss": 0.5784, "step": 438200 }, { "epoch": 10.094869669337754, "grad_norm": 2.406407594680786, "learning_rate": 1.564553307834383e-05, "loss": 0.5653, "step": 438400 }, { "epoch": 10.099474993092015, "grad_norm": 3.0736372470855713, "learning_rate": 1.5638261514521316e-05, "loss": 0.5703, "step": 438600 }, { "epoch": 10.104080316846275, "grad_norm": 2.5752131938934326, "learning_rate": 1.5630989950698796e-05, "loss": 0.5667, "step": 438800 }, { "epoch": 10.108685640600534, "grad_norm": 3.5142014026641846, "learning_rate": 1.562371838687628e-05, "loss": 0.5829, "step": 439000 }, { "epoch": 10.113290964354794, "grad_norm": 3.4527642726898193, "learning_rate": 1.5616446823053767e-05, "loss": 0.5663, "step": 439200 }, { "epoch": 10.117896288109055, "grad_norm": 3.4308338165283203, "learning_rate": 1.560917525923125e-05, "loss": 0.5725, "step": 439400 }, { "epoch": 10.122501611863314, "grad_norm": 2.8218986988067627, "learning_rate": 1.5601903695408736e-05, "loss": 0.5723, "step": 439600 }, { "epoch": 10.127106935617574, "grad_norm": 3.292224407196045, "learning_rate": 1.559463213158622e-05, "loss": 0.5753, "step": 439800 }, { "epoch": 10.131712259371835, "grad_norm": 2.748878240585327, "learning_rate": 1.5587396925582817e-05, "loss": 0.5636, "step": 440000 }, { "epoch": 10.136317583126093, "grad_norm": 3.109516143798828, "learning_rate": 1.55801253617603e-05, "loss": 0.5534, "step": 440200 }, { "epoch": 10.140922906880354, "grad_norm": 3.0865907669067383, "learning_rate": 1.5572853797937785e-05, "loss": 0.5619, "step": 440400 }, { "epoch": 10.145528230634614, "grad_norm": 3.90614914894104, "learning_rate": 1.5565582234115268e-05, "loss": 0.5706, "step": 440600 }, { "epoch": 10.150133554388873, "grad_norm": 3.342874526977539, "learning_rate": 1.5558310670292754e-05, "loss": 0.5666, "step": 440800 }, { "epoch": 10.154738878143133, "grad_norm": 3.8713204860687256, "learning_rate": 1.555103910647024e-05, "loss": 0.57, "step": 441000 }, { "epoch": 10.159344201897394, "grad_norm": 3.356518507003784, "learning_rate": 1.5543767542647722e-05, "loss": 0.5671, "step": 441200 }, { "epoch": 10.163949525651653, "grad_norm": 3.4761784076690674, "learning_rate": 1.5536495978825208e-05, "loss": 0.5756, "step": 441400 }, { "epoch": 10.168554849405913, "grad_norm": 3.0958569049835205, "learning_rate": 1.552922441500269e-05, "loss": 0.5545, "step": 441600 }, { "epoch": 10.173160173160174, "grad_norm": 3.5366578102111816, "learning_rate": 1.5521952851180173e-05, "loss": 0.5569, "step": 441800 }, { "epoch": 10.177765496914432, "grad_norm": 3.0976240634918213, "learning_rate": 1.551468128735766e-05, "loss": 0.5787, "step": 442000 }, { "epoch": 10.182370820668693, "grad_norm": 2.9487667083740234, "learning_rate": 1.5507409723535145e-05, "loss": 0.5719, "step": 442200 }, { "epoch": 10.186976144422953, "grad_norm": 3.9748101234436035, "learning_rate": 1.5500138159712628e-05, "loss": 0.5772, "step": 442400 }, { "epoch": 10.191581468177214, "grad_norm": 3.862164258956909, "learning_rate": 1.5492866595890113e-05, "loss": 0.5625, "step": 442600 }, { "epoch": 10.196186791931472, "grad_norm": 3.5025646686553955, "learning_rate": 1.5485595032067596e-05, "loss": 0.5744, "step": 442800 }, { "epoch": 10.200792115685733, "grad_norm": 2.911168336868286, "learning_rate": 1.5478323468245082e-05, "loss": 0.5606, "step": 443000 }, { "epoch": 10.205397439439993, "grad_norm": 3.7616069316864014, "learning_rate": 1.5471051904422564e-05, "loss": 0.5661, "step": 443200 }, { "epoch": 10.210002763194252, "grad_norm": 2.7809619903564453, "learning_rate": 1.546378034060005e-05, "loss": 0.5583, "step": 443400 }, { "epoch": 10.214608086948513, "grad_norm": 4.306921005249023, "learning_rate": 1.5456508776777536e-05, "loss": 0.5722, "step": 443600 }, { "epoch": 10.219213410702773, "grad_norm": 2.9962635040283203, "learning_rate": 1.544923721295502e-05, "loss": 0.5667, "step": 443800 }, { "epoch": 10.223818734457032, "grad_norm": 2.8903093338012695, "learning_rate": 1.54419656491325e-05, "loss": 0.5655, "step": 444000 }, { "epoch": 10.228424058211292, "grad_norm": 3.401038885116577, "learning_rate": 1.5434694085309987e-05, "loss": 0.5708, "step": 444200 }, { "epoch": 10.233029381965553, "grad_norm": 2.52870774269104, "learning_rate": 1.542742252148747e-05, "loss": 0.564, "step": 444400 }, { "epoch": 10.237634705719811, "grad_norm": 3.640711784362793, "learning_rate": 1.542018731548407e-05, "loss": 0.5723, "step": 444600 }, { "epoch": 10.242240029474072, "grad_norm": 2.800980806350708, "learning_rate": 1.5412915751661554e-05, "loss": 0.5613, "step": 444800 }, { "epoch": 10.246845353228332, "grad_norm": 3.3499021530151367, "learning_rate": 1.5405644187839037e-05, "loss": 0.5739, "step": 445000 }, { "epoch": 10.251450676982591, "grad_norm": 2.415259838104248, "learning_rate": 1.5398408981835635e-05, "loss": 0.566, "step": 445200 }, { "epoch": 10.256056000736852, "grad_norm": 3.0290565490722656, "learning_rate": 1.5391137418013118e-05, "loss": 0.5834, "step": 445400 }, { "epoch": 10.260661324491112, "grad_norm": 2.8429722785949707, "learning_rate": 1.5383865854190604e-05, "loss": 0.5733, "step": 445600 }, { "epoch": 10.265266648245372, "grad_norm": 2.889953136444092, "learning_rate": 1.5376594290368086e-05, "loss": 0.5635, "step": 445800 }, { "epoch": 10.269871971999631, "grad_norm": 3.5304625034332275, "learning_rate": 1.536932272654557e-05, "loss": 0.5689, "step": 446000 }, { "epoch": 10.274477295753892, "grad_norm": 3.2472240924835205, "learning_rate": 1.5362051162723055e-05, "loss": 0.5626, "step": 446200 }, { "epoch": 10.279082619508152, "grad_norm": 3.443258047103882, "learning_rate": 1.535477959890054e-05, "loss": 0.5686, "step": 446400 }, { "epoch": 10.28368794326241, "grad_norm": 3.0054471492767334, "learning_rate": 1.5347508035078027e-05, "loss": 0.563, "step": 446600 }, { "epoch": 10.288293267016671, "grad_norm": 3.115903854370117, "learning_rate": 1.534023647125551e-05, "loss": 0.5618, "step": 446800 }, { "epoch": 10.292898590770932, "grad_norm": 3.3110814094543457, "learning_rate": 1.533296490743299e-05, "loss": 0.5567, "step": 447000 }, { "epoch": 10.29750391452519, "grad_norm": 2.7418558597564697, "learning_rate": 1.5325693343610478e-05, "loss": 0.5526, "step": 447200 }, { "epoch": 10.302109238279451, "grad_norm": 3.7277021408081055, "learning_rate": 1.531842177978796e-05, "loss": 0.5549, "step": 447400 }, { "epoch": 10.306714562033712, "grad_norm": 3.4062893390655518, "learning_rate": 1.5311150215965446e-05, "loss": 0.5749, "step": 447600 }, { "epoch": 10.31131988578797, "grad_norm": 3.7285375595092773, "learning_rate": 1.5303878652142932e-05, "loss": 0.5635, "step": 447800 }, { "epoch": 10.31592520954223, "grad_norm": 3.451294183731079, "learning_rate": 1.5296643446139527e-05, "loss": 0.5615, "step": 448000 }, { "epoch": 10.320530533296491, "grad_norm": 2.3311824798583984, "learning_rate": 1.5289371882317013e-05, "loss": 0.5668, "step": 448200 }, { "epoch": 10.32513585705075, "grad_norm": 3.451138734817505, "learning_rate": 1.52821003184945e-05, "loss": 0.5659, "step": 448400 }, { "epoch": 10.32974118080501, "grad_norm": 2.754631757736206, "learning_rate": 1.5274828754671978e-05, "loss": 0.5746, "step": 448600 }, { "epoch": 10.33434650455927, "grad_norm": 3.7678756713867188, "learning_rate": 1.5267557190849464e-05, "loss": 0.5647, "step": 448800 }, { "epoch": 10.33895182831353, "grad_norm": 2.977546215057373, "learning_rate": 1.526028562702695e-05, "loss": 0.5675, "step": 449000 }, { "epoch": 10.34355715206779, "grad_norm": 2.2833847999572754, "learning_rate": 1.5253014063204434e-05, "loss": 0.5571, "step": 449200 }, { "epoch": 10.34816247582205, "grad_norm": 3.352088689804077, "learning_rate": 1.5245742499381918e-05, "loss": 0.5803, "step": 449400 }, { "epoch": 10.352767799576311, "grad_norm": 3.5555100440979004, "learning_rate": 1.5238470935559403e-05, "loss": 0.5733, "step": 449600 }, { "epoch": 10.35737312333057, "grad_norm": 2.3247013092041016, "learning_rate": 1.5231199371736885e-05, "loss": 0.5573, "step": 449800 }, { "epoch": 10.36197844708483, "grad_norm": 2.8674051761627197, "learning_rate": 1.5223964165733482e-05, "loss": 0.5689, "step": 450000 }, { "epoch": 10.36658377083909, "grad_norm": 3.7046046257019043, "learning_rate": 1.5216692601910966e-05, "loss": 0.5753, "step": 450200 }, { "epoch": 10.37118909459335, "grad_norm": 3.3615944385528564, "learning_rate": 1.520942103808845e-05, "loss": 0.5823, "step": 450400 }, { "epoch": 10.37579441834761, "grad_norm": 4.1015238761901855, "learning_rate": 1.5202149474265936e-05, "loss": 0.5746, "step": 450600 }, { "epoch": 10.38039974210187, "grad_norm": 2.8031654357910156, "learning_rate": 1.519487791044342e-05, "loss": 0.5707, "step": 450800 }, { "epoch": 10.385005065856129, "grad_norm": 3.370823860168457, "learning_rate": 1.5187606346620907e-05, "loss": 0.5652, "step": 451000 }, { "epoch": 10.38961038961039, "grad_norm": 3.259626626968384, "learning_rate": 1.5180371140617503e-05, "loss": 0.5666, "step": 451200 }, { "epoch": 10.39421571336465, "grad_norm": 4.263233184814453, "learning_rate": 1.5173099576794984e-05, "loss": 0.5697, "step": 451400 }, { "epoch": 10.398821037118909, "grad_norm": 2.861125946044922, "learning_rate": 1.5165828012972468e-05, "loss": 0.5782, "step": 451600 }, { "epoch": 10.40342636087317, "grad_norm": 3.5409388542175293, "learning_rate": 1.5158556449149954e-05, "loss": 0.5651, "step": 451800 }, { "epoch": 10.40803168462743, "grad_norm": 3.4147825241088867, "learning_rate": 1.5151284885327439e-05, "loss": 0.5646, "step": 452000 }, { "epoch": 10.412637008381688, "grad_norm": 2.7869393825531006, "learning_rate": 1.5144013321504925e-05, "loss": 0.5703, "step": 452200 }, { "epoch": 10.417242332135949, "grad_norm": 3.2392425537109375, "learning_rate": 1.5136741757682409e-05, "loss": 0.5604, "step": 452400 }, { "epoch": 10.42184765589021, "grad_norm": 3.0997209548950195, "learning_rate": 1.5129470193859893e-05, "loss": 0.5643, "step": 452600 }, { "epoch": 10.42645297964447, "grad_norm": 2.82871150970459, "learning_rate": 1.5122198630037375e-05, "loss": 0.5727, "step": 452800 }, { "epoch": 10.431058303398729, "grad_norm": 3.5015945434570312, "learning_rate": 1.511492706621486e-05, "loss": 0.566, "step": 453000 }, { "epoch": 10.435663627152989, "grad_norm": 2.741576671600342, "learning_rate": 1.5107655502392344e-05, "loss": 0.5739, "step": 453200 }, { "epoch": 10.44026895090725, "grad_norm": 3.6131269931793213, "learning_rate": 1.510038393856983e-05, "loss": 0.5577, "step": 453400 }, { "epoch": 10.444874274661508, "grad_norm": 3.369382858276367, "learning_rate": 1.5093112374747314e-05, "loss": 0.5676, "step": 453600 }, { "epoch": 10.449479598415769, "grad_norm": 2.8960883617401123, "learning_rate": 1.50858408109248e-05, "loss": 0.5564, "step": 453800 }, { "epoch": 10.45408492217003, "grad_norm": 3.1670303344726562, "learning_rate": 1.507856924710228e-05, "loss": 0.5777, "step": 454000 }, { "epoch": 10.458690245924288, "grad_norm": 2.9549036026000977, "learning_rate": 1.5071297683279765e-05, "loss": 0.5686, "step": 454200 }, { "epoch": 10.463295569678548, "grad_norm": 3.2000997066497803, "learning_rate": 1.5064026119457251e-05, "loss": 0.5699, "step": 454400 }, { "epoch": 10.467900893432809, "grad_norm": 3.4854366779327393, "learning_rate": 1.5056754555634735e-05, "loss": 0.5676, "step": 454600 }, { "epoch": 10.472506217187068, "grad_norm": 3.309793710708618, "learning_rate": 1.504948299181222e-05, "loss": 0.5569, "step": 454800 }, { "epoch": 10.477111540941328, "grad_norm": 2.8395392894744873, "learning_rate": 1.5042211427989705e-05, "loss": 0.569, "step": 455000 }, { "epoch": 10.481716864695588, "grad_norm": 3.1417343616485596, "learning_rate": 1.503493986416719e-05, "loss": 0.5696, "step": 455200 }, { "epoch": 10.486322188449847, "grad_norm": 3.1475086212158203, "learning_rate": 1.5027668300344672e-05, "loss": 0.5647, "step": 455400 }, { "epoch": 10.490927512204108, "grad_norm": 3.277684450149536, "learning_rate": 1.5020396736522156e-05, "loss": 0.58, "step": 455600 }, { "epoch": 10.495532835958368, "grad_norm": 3.4847934246063232, "learning_rate": 1.501312517269964e-05, "loss": 0.5613, "step": 455800 }, { "epoch": 10.500138159712627, "grad_norm": 2.999443292617798, "learning_rate": 1.5005853608877126e-05, "loss": 0.5617, "step": 456000 }, { "epoch": 10.504743483466887, "grad_norm": 2.7356905937194824, "learning_rate": 1.4998582045054609e-05, "loss": 0.5679, "step": 456200 }, { "epoch": 10.509348807221148, "grad_norm": 3.4266111850738525, "learning_rate": 1.4991310481232093e-05, "loss": 0.5498, "step": 456400 }, { "epoch": 10.513954130975408, "grad_norm": 3.2002484798431396, "learning_rate": 1.4984038917409579e-05, "loss": 0.5548, "step": 456600 }, { "epoch": 10.518559454729667, "grad_norm": 2.8026347160339355, "learning_rate": 1.4976767353587063e-05, "loss": 0.5744, "step": 456800 }, { "epoch": 10.523164778483928, "grad_norm": 3.1921815872192383, "learning_rate": 1.4969495789764547e-05, "loss": 0.5578, "step": 457000 }, { "epoch": 10.527770102238188, "grad_norm": 3.4548532962799072, "learning_rate": 1.4962224225942032e-05, "loss": 0.5696, "step": 457200 }, { "epoch": 10.532375425992447, "grad_norm": 2.9152286052703857, "learning_rate": 1.4954952662119516e-05, "loss": 0.5592, "step": 457400 }, { "epoch": 10.536980749746707, "grad_norm": 2.514460802078247, "learning_rate": 1.4947681098297e-05, "loss": 0.5629, "step": 457600 }, { "epoch": 10.541586073500968, "grad_norm": 3.525857448577881, "learning_rate": 1.4940409534474484e-05, "loss": 0.5682, "step": 457800 }, { "epoch": 10.546191397255226, "grad_norm": 3.7510156631469727, "learning_rate": 1.4933137970651969e-05, "loss": 0.5724, "step": 458000 }, { "epoch": 10.550796721009487, "grad_norm": 3.3571460247039795, "learning_rate": 1.4925902764648565e-05, "loss": 0.568, "step": 458200 }, { "epoch": 10.555402044763747, "grad_norm": 2.8560373783111572, "learning_rate": 1.491863120082605e-05, "loss": 0.5629, "step": 458400 }, { "epoch": 10.560007368518006, "grad_norm": 3.2522172927856445, "learning_rate": 1.4911359637003534e-05, "loss": 0.5643, "step": 458600 }, { "epoch": 10.564612692272267, "grad_norm": 3.196812152862549, "learning_rate": 1.490408807318102e-05, "loss": 0.5603, "step": 458800 }, { "epoch": 10.569218016026527, "grad_norm": 3.4622724056243896, "learning_rate": 1.4896816509358502e-05, "loss": 0.5625, "step": 459000 }, { "epoch": 10.573823339780787, "grad_norm": 2.8027701377868652, "learning_rate": 1.4889544945535987e-05, "loss": 0.5714, "step": 459200 }, { "epoch": 10.578428663535046, "grad_norm": 2.9170937538146973, "learning_rate": 1.4882273381713472e-05, "loss": 0.5723, "step": 459400 }, { "epoch": 10.583033987289307, "grad_norm": 3.9101593494415283, "learning_rate": 1.4875001817890955e-05, "loss": 0.5641, "step": 459600 }, { "epoch": 10.587639311043567, "grad_norm": 3.0541505813598633, "learning_rate": 1.4867730254068441e-05, "loss": 0.5576, "step": 459800 }, { "epoch": 10.592244634797826, "grad_norm": 2.2873919010162354, "learning_rate": 1.4860458690245925e-05, "loss": 0.5682, "step": 460000 }, { "epoch": 10.596849958552086, "grad_norm": 4.237215518951416, "learning_rate": 1.485318712642341e-05, "loss": 0.5709, "step": 460200 }, { "epoch": 10.601455282306347, "grad_norm": 3.327172040939331, "learning_rate": 1.4845915562600894e-05, "loss": 0.5712, "step": 460400 }, { "epoch": 10.606060606060606, "grad_norm": 3.6685967445373535, "learning_rate": 1.4838643998778378e-05, "loss": 0.5654, "step": 460600 }, { "epoch": 10.610665929814866, "grad_norm": 3.0738847255706787, "learning_rate": 1.4831372434955862e-05, "loss": 0.5676, "step": 460800 }, { "epoch": 10.615271253569126, "grad_norm": 2.7393765449523926, "learning_rate": 1.4824100871133346e-05, "loss": 0.5645, "step": 461000 }, { "epoch": 10.619876577323385, "grad_norm": 2.6728312969207764, "learning_rate": 1.481682930731083e-05, "loss": 0.5598, "step": 461200 }, { "epoch": 10.624481901077646, "grad_norm": 2.915443181991577, "learning_rate": 1.4809557743488316e-05, "loss": 0.5659, "step": 461400 }, { "epoch": 10.629087224831906, "grad_norm": 3.617243528366089, "learning_rate": 1.4802286179665799e-05, "loss": 0.5706, "step": 461600 }, { "epoch": 10.633692548586165, "grad_norm": 3.1753506660461426, "learning_rate": 1.4795014615843283e-05, "loss": 0.5551, "step": 461800 }, { "epoch": 10.638297872340425, "grad_norm": 3.585939407348633, "learning_rate": 1.4787743052020769e-05, "loss": 0.5634, "step": 462000 }, { "epoch": 10.642903196094686, "grad_norm": 2.8085319995880127, "learning_rate": 1.4780471488198252e-05, "loss": 0.572, "step": 462200 }, { "epoch": 10.647508519848945, "grad_norm": 3.3687798976898193, "learning_rate": 1.4773236282194848e-05, "loss": 0.5583, "step": 462400 }, { "epoch": 10.652113843603205, "grad_norm": 3.3423590660095215, "learning_rate": 1.4765964718372333e-05, "loss": 0.572, "step": 462600 }, { "epoch": 10.656719167357465, "grad_norm": 2.939920425415039, "learning_rate": 1.4758693154549819e-05, "loss": 0.5757, "step": 462800 }, { "epoch": 10.661324491111726, "grad_norm": 3.0086398124694824, "learning_rate": 1.4751457948546416e-05, "loss": 0.5608, "step": 463000 }, { "epoch": 10.665929814865985, "grad_norm": 3.477020740509033, "learning_rate": 1.4744186384723898e-05, "loss": 0.5704, "step": 463200 }, { "epoch": 10.670535138620245, "grad_norm": 3.078016996383667, "learning_rate": 1.4736914820901384e-05, "loss": 0.5596, "step": 463400 }, { "epoch": 10.675140462374506, "grad_norm": 3.607464075088501, "learning_rate": 1.4729643257078868e-05, "loss": 0.5584, "step": 463600 }, { "epoch": 10.679745786128764, "grad_norm": 2.997453212738037, "learning_rate": 1.4722371693256352e-05, "loss": 0.5566, "step": 463800 }, { "epoch": 10.684351109883025, "grad_norm": 2.907045364379883, "learning_rate": 1.4715100129433837e-05, "loss": 0.5699, "step": 464000 }, { "epoch": 10.688956433637285, "grad_norm": 2.692117929458618, "learning_rate": 1.470782856561132e-05, "loss": 0.5607, "step": 464200 }, { "epoch": 10.693561757391544, "grad_norm": 3.213514804840088, "learning_rate": 1.4700557001788805e-05, "loss": 0.5622, "step": 464400 }, { "epoch": 10.698167081145805, "grad_norm": 2.8752119541168213, "learning_rate": 1.469328543796629e-05, "loss": 0.5759, "step": 464600 }, { "epoch": 10.702772404900065, "grad_norm": 2.7570505142211914, "learning_rate": 1.4686013874143773e-05, "loss": 0.5732, "step": 464800 }, { "epoch": 10.707377728654324, "grad_norm": 2.800178050994873, "learning_rate": 1.467877866814037e-05, "loss": 0.5693, "step": 465000 }, { "epoch": 10.711983052408584, "grad_norm": 2.608539342880249, "learning_rate": 1.4671507104317856e-05, "loss": 0.5683, "step": 465200 }, { "epoch": 10.716588376162845, "grad_norm": 3.1758687496185303, "learning_rate": 1.4664235540495339e-05, "loss": 0.571, "step": 465400 }, { "epoch": 10.721193699917103, "grad_norm": 2.868126392364502, "learning_rate": 1.4656963976672823e-05, "loss": 0.5706, "step": 465600 }, { "epoch": 10.725799023671364, "grad_norm": 3.5965654850006104, "learning_rate": 1.4649692412850309e-05, "loss": 0.5794, "step": 465800 }, { "epoch": 10.730404347425624, "grad_norm": 3.169706106185913, "learning_rate": 1.4642420849027791e-05, "loss": 0.5607, "step": 466000 }, { "epoch": 10.735009671179885, "grad_norm": 3.449915647506714, "learning_rate": 1.4635149285205276e-05, "loss": 0.5568, "step": 466200 }, { "epoch": 10.739614994934144, "grad_norm": 2.98413348197937, "learning_rate": 1.4627877721382762e-05, "loss": 0.5568, "step": 466400 }, { "epoch": 10.744220318688404, "grad_norm": 3.158750534057617, "learning_rate": 1.4620606157560244e-05, "loss": 0.5663, "step": 466600 }, { "epoch": 10.748825642442664, "grad_norm": 3.3182640075683594, "learning_rate": 1.461333459373773e-05, "loss": 0.547, "step": 466800 }, { "epoch": 10.753430966196923, "grad_norm": 2.8168752193450928, "learning_rate": 1.4606063029915214e-05, "loss": 0.5632, "step": 467000 }, { "epoch": 10.758036289951184, "grad_norm": 2.551912546157837, "learning_rate": 1.4598791466092697e-05, "loss": 0.564, "step": 467200 }, { "epoch": 10.762641613705444, "grad_norm": 3.1398086547851562, "learning_rate": 1.4591519902270183e-05, "loss": 0.5656, "step": 467400 }, { "epoch": 10.767246937459703, "grad_norm": 2.806156873703003, "learning_rate": 1.4584248338447667e-05, "loss": 0.5712, "step": 467600 }, { "epoch": 10.771852261213963, "grad_norm": 4.065160751342773, "learning_rate": 1.4576976774625151e-05, "loss": 0.5717, "step": 467800 }, { "epoch": 10.776457584968224, "grad_norm": 2.9523870944976807, "learning_rate": 1.4569705210802635e-05, "loss": 0.5639, "step": 468000 }, { "epoch": 10.781062908722483, "grad_norm": 2.7149343490600586, "learning_rate": 1.456243364698012e-05, "loss": 0.5587, "step": 468200 }, { "epoch": 10.785668232476743, "grad_norm": 3.6111390590667725, "learning_rate": 1.4555162083157606e-05, "loss": 0.5648, "step": 468400 }, { "epoch": 10.790273556231003, "grad_norm": 3.606182098388672, "learning_rate": 1.4547890519335088e-05, "loss": 0.5726, "step": 468600 }, { "epoch": 10.794878879985262, "grad_norm": 2.920095443725586, "learning_rate": 1.4540618955512572e-05, "loss": 0.5633, "step": 468800 }, { "epoch": 10.799484203739523, "grad_norm": 3.206716299057007, "learning_rate": 1.4533347391690058e-05, "loss": 0.5613, "step": 469000 }, { "epoch": 10.804089527493783, "grad_norm": 2.6802709102630615, "learning_rate": 1.452607582786754e-05, "loss": 0.5627, "step": 469200 }, { "epoch": 10.808694851248042, "grad_norm": 2.581068754196167, "learning_rate": 1.4518804264045027e-05, "loss": 0.5645, "step": 469400 }, { "epoch": 10.813300175002302, "grad_norm": 2.9014952182769775, "learning_rate": 1.451153270022251e-05, "loss": 0.5717, "step": 469600 }, { "epoch": 10.817905498756563, "grad_norm": 3.2331104278564453, "learning_rate": 1.4504261136399993e-05, "loss": 0.5561, "step": 469800 }, { "epoch": 10.822510822510823, "grad_norm": 3.1601359844207764, "learning_rate": 1.449698957257748e-05, "loss": 0.5606, "step": 470000 }, { "epoch": 10.827116146265082, "grad_norm": 2.7113850116729736, "learning_rate": 1.4489718008754963e-05, "loss": 0.5678, "step": 470200 }, { "epoch": 10.831721470019342, "grad_norm": 3.2414093017578125, "learning_rate": 1.4482446444932446e-05, "loss": 0.5614, "step": 470400 }, { "epoch": 10.836326793773603, "grad_norm": 2.453719139099121, "learning_rate": 1.4475174881109932e-05, "loss": 0.5608, "step": 470600 }, { "epoch": 10.840932117527862, "grad_norm": 3.3981575965881348, "learning_rate": 1.4467903317287416e-05, "loss": 0.5644, "step": 470800 }, { "epoch": 10.845537441282122, "grad_norm": 3.9584078788757324, "learning_rate": 1.4460631753464902e-05, "loss": 0.5649, "step": 471000 }, { "epoch": 10.850142765036383, "grad_norm": 2.609250545501709, "learning_rate": 1.4453360189642385e-05, "loss": 0.5737, "step": 471200 }, { "epoch": 10.854748088790641, "grad_norm": 2.888627767562866, "learning_rate": 1.4446088625819869e-05, "loss": 0.5763, "step": 471400 }, { "epoch": 10.859353412544902, "grad_norm": 3.1995785236358643, "learning_rate": 1.4438817061997355e-05, "loss": 0.5639, "step": 471600 }, { "epoch": 10.863958736299162, "grad_norm": 3.3186826705932617, "learning_rate": 1.4431545498174837e-05, "loss": 0.5643, "step": 471800 }, { "epoch": 10.868564060053421, "grad_norm": 3.200171709060669, "learning_rate": 1.4424310292171434e-05, "loss": 0.5615, "step": 472000 }, { "epoch": 10.873169383807681, "grad_norm": 3.0748026371002197, "learning_rate": 1.4417038728348918e-05, "loss": 0.5585, "step": 472200 }, { "epoch": 10.877774707561942, "grad_norm": 2.5233187675476074, "learning_rate": 1.4409767164526404e-05, "loss": 0.5693, "step": 472400 }, { "epoch": 10.882380031316202, "grad_norm": 3.0431416034698486, "learning_rate": 1.4402531958523001e-05, "loss": 0.5627, "step": 472600 }, { "epoch": 10.886985355070461, "grad_norm": 2.8230044841766357, "learning_rate": 1.4395260394700484e-05, "loss": 0.5725, "step": 472800 }, { "epoch": 10.891590678824722, "grad_norm": 2.9966495037078857, "learning_rate": 1.438798883087797e-05, "loss": 0.566, "step": 473000 }, { "epoch": 10.896196002578982, "grad_norm": 3.4623706340789795, "learning_rate": 1.4380717267055454e-05, "loss": 0.5664, "step": 473200 }, { "epoch": 10.90080132633324, "grad_norm": 2.6742422580718994, "learning_rate": 1.4373445703232936e-05, "loss": 0.5632, "step": 473400 }, { "epoch": 10.905406650087501, "grad_norm": 3.300477981567383, "learning_rate": 1.4366174139410422e-05, "loss": 0.5797, "step": 473600 }, { "epoch": 10.910011973841762, "grad_norm": 2.9274837970733643, "learning_rate": 1.4358902575587907e-05, "loss": 0.5604, "step": 473800 }, { "epoch": 10.91461729759602, "grad_norm": 2.8111343383789062, "learning_rate": 1.4351631011765389e-05, "loss": 0.5643, "step": 474000 }, { "epoch": 10.919222621350281, "grad_norm": 2.678849458694458, "learning_rate": 1.4344359447942875e-05, "loss": 0.5646, "step": 474200 }, { "epoch": 10.923827945104541, "grad_norm": 2.9185190200805664, "learning_rate": 1.433708788412036e-05, "loss": 0.5691, "step": 474400 }, { "epoch": 10.9284332688588, "grad_norm": 3.45511794090271, "learning_rate": 1.4329852678116956e-05, "loss": 0.568, "step": 474600 }, { "epoch": 10.93303859261306, "grad_norm": 3.163696527481079, "learning_rate": 1.4322617472113553e-05, "loss": 0.551, "step": 474800 }, { "epoch": 10.937643916367321, "grad_norm": 3.591355323791504, "learning_rate": 1.4315345908291039e-05, "loss": 0.5525, "step": 475000 }, { "epoch": 10.94224924012158, "grad_norm": 3.392062187194824, "learning_rate": 1.4308074344468521e-05, "loss": 0.562, "step": 475200 }, { "epoch": 10.94685456387584, "grad_norm": 3.2929904460906982, "learning_rate": 1.4300802780646006e-05, "loss": 0.5711, "step": 475400 }, { "epoch": 10.9514598876301, "grad_norm": 3.0882444381713867, "learning_rate": 1.4293531216823492e-05, "loss": 0.5573, "step": 475600 }, { "epoch": 10.95606521138436, "grad_norm": 3.073636054992676, "learning_rate": 1.4286259653000974e-05, "loss": 0.5738, "step": 475800 }, { "epoch": 10.96067053513862, "grad_norm": 3.676265001296997, "learning_rate": 1.4278988089178458e-05, "loss": 0.5551, "step": 476000 }, { "epoch": 10.96527585889288, "grad_norm": 3.5196962356567383, "learning_rate": 1.4271716525355944e-05, "loss": 0.565, "step": 476200 }, { "epoch": 10.96988118264714, "grad_norm": 3.1859378814697266, "learning_rate": 1.4264444961533427e-05, "loss": 0.565, "step": 476400 }, { "epoch": 10.9744865064014, "grad_norm": 2.839648962020874, "learning_rate": 1.4257173397710913e-05, "loss": 0.5656, "step": 476600 }, { "epoch": 10.97909183015566, "grad_norm": 3.086836338043213, "learning_rate": 1.4249901833888397e-05, "loss": 0.5653, "step": 476800 }, { "epoch": 10.98369715390992, "grad_norm": 3.1683554649353027, "learning_rate": 1.424263027006588e-05, "loss": 0.5679, "step": 477000 }, { "epoch": 10.98830247766418, "grad_norm": 3.869631052017212, "learning_rate": 1.4235358706243365e-05, "loss": 0.5605, "step": 477200 }, { "epoch": 10.99290780141844, "grad_norm": 3.0082695484161377, "learning_rate": 1.422808714242085e-05, "loss": 0.5558, "step": 477400 }, { "epoch": 10.9975131251727, "grad_norm": 2.398848295211792, "learning_rate": 1.4220815578598334e-05, "loss": 0.5681, "step": 477600 }, { "epoch": 11.0, "eval_loss": 0.5423869490623474, "eval_runtime": 166.7579, "eval_samples_per_second": 170.073, "eval_steps_per_second": 10.632, "step": 477708 }, { "epoch": 11.002118448926959, "grad_norm": 3.4873993396759033, "learning_rate": 1.4213544014775818e-05, "loss": 0.5714, "step": 477800 }, { "epoch": 11.00672377268122, "grad_norm": 2.91593337059021, "learning_rate": 1.4206272450953302e-05, "loss": 0.558, "step": 478000 }, { "epoch": 11.01132909643548, "grad_norm": 3.278059959411621, "learning_rate": 1.4199037244949899e-05, "loss": 0.5672, "step": 478200 }, { "epoch": 11.015934420189739, "grad_norm": 2.8523497581481934, "learning_rate": 1.4191765681127385e-05, "loss": 0.5712, "step": 478400 }, { "epoch": 11.020539743944, "grad_norm": 2.9555845260620117, "learning_rate": 1.4184494117304868e-05, "loss": 0.5571, "step": 478600 }, { "epoch": 11.02514506769826, "grad_norm": 3.0521042346954346, "learning_rate": 1.4177222553482352e-05, "loss": 0.5565, "step": 478800 }, { "epoch": 11.029750391452518, "grad_norm": 3.1665375232696533, "learning_rate": 1.4169950989659838e-05, "loss": 0.5573, "step": 479000 }, { "epoch": 11.034355715206779, "grad_norm": 2.929694890975952, "learning_rate": 1.416267942583732e-05, "loss": 0.5592, "step": 479200 }, { "epoch": 11.03896103896104, "grad_norm": 3.597895860671997, "learning_rate": 1.4155407862014806e-05, "loss": 0.5658, "step": 479400 }, { "epoch": 11.043566362715298, "grad_norm": 2.8076977729797363, "learning_rate": 1.414813629819229e-05, "loss": 0.554, "step": 479600 }, { "epoch": 11.048171686469558, "grad_norm": 3.267301559448242, "learning_rate": 1.4140864734369773e-05, "loss": 0.568, "step": 479800 }, { "epoch": 11.052777010223819, "grad_norm": 3.1478617191314697, "learning_rate": 1.4133593170547259e-05, "loss": 0.5702, "step": 480000 }, { "epoch": 11.05738233397808, "grad_norm": 2.9337422847747803, "learning_rate": 1.4126321606724743e-05, "loss": 0.5565, "step": 480200 }, { "epoch": 11.061987657732338, "grad_norm": 2.9447715282440186, "learning_rate": 1.4119050042902226e-05, "loss": 0.5574, "step": 480400 }, { "epoch": 11.066592981486599, "grad_norm": 3.1373238563537598, "learning_rate": 1.4111778479079711e-05, "loss": 0.5539, "step": 480600 }, { "epoch": 11.07119830524086, "grad_norm": 3.812917947769165, "learning_rate": 1.4104506915257196e-05, "loss": 0.5531, "step": 480800 }, { "epoch": 11.075803628995118, "grad_norm": 2.356658935546875, "learning_rate": 1.409723535143468e-05, "loss": 0.5652, "step": 481000 }, { "epoch": 11.080408952749378, "grad_norm": 3.141037940979004, "learning_rate": 1.4089963787612164e-05, "loss": 0.552, "step": 481200 }, { "epoch": 11.085014276503639, "grad_norm": 3.6183485984802246, "learning_rate": 1.4082692223789648e-05, "loss": 0.5601, "step": 481400 }, { "epoch": 11.089619600257898, "grad_norm": 2.416588306427002, "learning_rate": 1.4075420659967134e-05, "loss": 0.5571, "step": 481600 }, { "epoch": 11.094224924012158, "grad_norm": 3.343844175338745, "learning_rate": 1.4068149096144617e-05, "loss": 0.5558, "step": 481800 }, { "epoch": 11.098830247766418, "grad_norm": 2.7808258533477783, "learning_rate": 1.4060877532322101e-05, "loss": 0.5705, "step": 482000 }, { "epoch": 11.103435571520677, "grad_norm": 2.9986512660980225, "learning_rate": 1.4053605968499587e-05, "loss": 0.5589, "step": 482200 }, { "epoch": 11.108040895274938, "grad_norm": 3.053314685821533, "learning_rate": 1.404633440467707e-05, "loss": 0.5586, "step": 482400 }, { "epoch": 11.112646219029198, "grad_norm": 2.5446858406066895, "learning_rate": 1.4039062840854555e-05, "loss": 0.5498, "step": 482600 }, { "epoch": 11.117251542783457, "grad_norm": 3.446873664855957, "learning_rate": 1.403179127703204e-05, "loss": 0.5492, "step": 482800 }, { "epoch": 11.121856866537717, "grad_norm": 2.8828375339508057, "learning_rate": 1.4024519713209522e-05, "loss": 0.5557, "step": 483000 }, { "epoch": 11.126462190291978, "grad_norm": 2.941978693008423, "learning_rate": 1.4017248149387008e-05, "loss": 0.5594, "step": 483200 }, { "epoch": 11.131067514046238, "grad_norm": 3.2115888595581055, "learning_rate": 1.4009976585564492e-05, "loss": 0.5628, "step": 483400 }, { "epoch": 11.135672837800497, "grad_norm": 3.323775291442871, "learning_rate": 1.4002705021741975e-05, "loss": 0.5574, "step": 483600 }, { "epoch": 11.140278161554757, "grad_norm": 3.129638195037842, "learning_rate": 1.399543345791946e-05, "loss": 0.5684, "step": 483800 }, { "epoch": 11.144883485309018, "grad_norm": 3.986924886703491, "learning_rate": 1.3988161894096945e-05, "loss": 0.5628, "step": 484000 }, { "epoch": 11.149488809063277, "grad_norm": 3.6282601356506348, "learning_rate": 1.3980890330274429e-05, "loss": 0.563, "step": 484200 }, { "epoch": 11.154094132817537, "grad_norm": 3.6448795795440674, "learning_rate": 1.3973618766451913e-05, "loss": 0.5604, "step": 484400 }, { "epoch": 11.158699456571798, "grad_norm": 4.1878662109375, "learning_rate": 1.3966347202629397e-05, "loss": 0.5612, "step": 484600 }, { "epoch": 11.163304780326056, "grad_norm": 3.1330618858337402, "learning_rate": 1.3959075638806883e-05, "loss": 0.5677, "step": 484800 }, { "epoch": 11.167910104080317, "grad_norm": 3.0046327114105225, "learning_rate": 1.3951804074984366e-05, "loss": 0.5619, "step": 485000 }, { "epoch": 11.172515427834577, "grad_norm": 2.964937925338745, "learning_rate": 1.3944532511161852e-05, "loss": 0.5613, "step": 485200 }, { "epoch": 11.177120751588836, "grad_norm": 3.570549249649048, "learning_rate": 1.3937260947339336e-05, "loss": 0.556, "step": 485400 }, { "epoch": 11.181726075343096, "grad_norm": 2.8079946041107178, "learning_rate": 1.3930025741335933e-05, "loss": 0.5513, "step": 485600 }, { "epoch": 11.186331399097357, "grad_norm": 2.9145305156707764, "learning_rate": 1.3922754177513416e-05, "loss": 0.5556, "step": 485800 }, { "epoch": 11.190936722851616, "grad_norm": 3.74453067779541, "learning_rate": 1.3915482613690901e-05, "loss": 0.5637, "step": 486000 }, { "epoch": 11.195542046605876, "grad_norm": 3.717133045196533, "learning_rate": 1.3908211049868386e-05, "loss": 0.5556, "step": 486200 }, { "epoch": 11.200147370360137, "grad_norm": 3.242644786834717, "learning_rate": 1.3900939486045868e-05, "loss": 0.5659, "step": 486400 }, { "epoch": 11.204752694114397, "grad_norm": 3.173064708709717, "learning_rate": 1.3893667922223354e-05, "loss": 0.5541, "step": 486600 }, { "epoch": 11.209358017868656, "grad_norm": 2.6633315086364746, "learning_rate": 1.3886396358400838e-05, "loss": 0.575, "step": 486800 }, { "epoch": 11.213963341622916, "grad_norm": 3.097576856613159, "learning_rate": 1.3879124794578323e-05, "loss": 0.5574, "step": 487000 }, { "epoch": 11.218568665377177, "grad_norm": 3.0718281269073486, "learning_rate": 1.3871853230755807e-05, "loss": 0.5525, "step": 487200 }, { "epoch": 11.223173989131435, "grad_norm": 4.379641056060791, "learning_rate": 1.3864581666933291e-05, "loss": 0.5663, "step": 487400 }, { "epoch": 11.227779312885696, "grad_norm": 2.682002305984497, "learning_rate": 1.3857310103110775e-05, "loss": 0.5677, "step": 487600 }, { "epoch": 11.232384636639956, "grad_norm": 3.9355664253234863, "learning_rate": 1.385003853928826e-05, "loss": 0.5645, "step": 487800 }, { "epoch": 11.236989960394215, "grad_norm": 3.0530924797058105, "learning_rate": 1.3842766975465744e-05, "loss": 0.5584, "step": 488000 }, { "epoch": 11.241595284148476, "grad_norm": 3.311056613922119, "learning_rate": 1.383549541164323e-05, "loss": 0.5569, "step": 488200 }, { "epoch": 11.246200607902736, "grad_norm": 3.7261691093444824, "learning_rate": 1.3828260205639826e-05, "loss": 0.5502, "step": 488400 }, { "epoch": 11.250805931656995, "grad_norm": 2.7717695236206055, "learning_rate": 1.3820988641817309e-05, "loss": 0.566, "step": 488600 }, { "epoch": 11.255411255411255, "grad_norm": 2.931807041168213, "learning_rate": 1.3813717077994795e-05, "loss": 0.5555, "step": 488800 }, { "epoch": 11.260016579165516, "grad_norm": 3.2359492778778076, "learning_rate": 1.3806445514172279e-05, "loss": 0.5608, "step": 489000 }, { "epoch": 11.264621902919774, "grad_norm": 2.8134891986846924, "learning_rate": 1.3799173950349762e-05, "loss": 0.5694, "step": 489200 }, { "epoch": 11.269227226674035, "grad_norm": 3.166933059692383, "learning_rate": 1.3791902386527248e-05, "loss": 0.554, "step": 489400 }, { "epoch": 11.273832550428295, "grad_norm": 3.2484281063079834, "learning_rate": 1.3784630822704732e-05, "loss": 0.5577, "step": 489600 }, { "epoch": 11.278437874182554, "grad_norm": 3.1392292976379395, "learning_rate": 1.3777359258882214e-05, "loss": 0.5552, "step": 489800 }, { "epoch": 11.283043197936815, "grad_norm": 3.2946557998657227, "learning_rate": 1.37700876950597e-05, "loss": 0.5578, "step": 490000 }, { "epoch": 11.287648521691075, "grad_norm": 3.325127124786377, "learning_rate": 1.3762816131237184e-05, "loss": 0.546, "step": 490200 }, { "epoch": 11.292253845445336, "grad_norm": 2.8645405769348145, "learning_rate": 1.3755544567414669e-05, "loss": 0.5588, "step": 490400 }, { "epoch": 11.296859169199594, "grad_norm": 4.306154727935791, "learning_rate": 1.3748273003592153e-05, "loss": 0.5669, "step": 490600 }, { "epoch": 11.301464492953855, "grad_norm": 3.181588649749756, "learning_rate": 1.3741001439769637e-05, "loss": 0.5579, "step": 490800 }, { "epoch": 11.306069816708115, "grad_norm": 2.690722942352295, "learning_rate": 1.3733729875947121e-05, "loss": 0.5648, "step": 491000 }, { "epoch": 11.310675140462374, "grad_norm": 3.0725932121276855, "learning_rate": 1.3726458312124606e-05, "loss": 0.552, "step": 491200 }, { "epoch": 11.315280464216634, "grad_norm": 3.268697500228882, "learning_rate": 1.371918674830209e-05, "loss": 0.5616, "step": 491400 }, { "epoch": 11.319885787970895, "grad_norm": 3.456531524658203, "learning_rate": 1.3711915184479574e-05, "loss": 0.5647, "step": 491600 }, { "epoch": 11.324491111725154, "grad_norm": 2.9139323234558105, "learning_rate": 1.3704643620657058e-05, "loss": 0.5547, "step": 491800 }, { "epoch": 11.329096435479414, "grad_norm": 3.0424976348876953, "learning_rate": 1.3697372056834544e-05, "loss": 0.5564, "step": 492000 }, { "epoch": 11.333701759233675, "grad_norm": 3.656421661376953, "learning_rate": 1.3690100493012028e-05, "loss": 0.5547, "step": 492200 }, { "epoch": 11.338307082987933, "grad_norm": 2.804675340652466, "learning_rate": 1.368282892918951e-05, "loss": 0.5504, "step": 492400 }, { "epoch": 11.342912406742194, "grad_norm": 2.907458543777466, "learning_rate": 1.3675557365366997e-05, "loss": 0.5557, "step": 492600 }, { "epoch": 11.347517730496454, "grad_norm": 3.368783712387085, "learning_rate": 1.3668322159363594e-05, "loss": 0.5629, "step": 492800 }, { "epoch": 11.352123054250713, "grad_norm": 2.7372817993164062, "learning_rate": 1.366108695336019e-05, "loss": 0.5667, "step": 493000 }, { "epoch": 11.356728378004973, "grad_norm": 3.26176381111145, "learning_rate": 1.3653815389537675e-05, "loss": 0.5581, "step": 493200 }, { "epoch": 11.361333701759234, "grad_norm": 3.8855667114257812, "learning_rate": 1.3646543825715157e-05, "loss": 0.5589, "step": 493400 }, { "epoch": 11.365939025513494, "grad_norm": 2.7255947589874268, "learning_rate": 1.3639272261892643e-05, "loss": 0.5483, "step": 493600 }, { "epoch": 11.370544349267753, "grad_norm": 3.3710615634918213, "learning_rate": 1.3632000698070127e-05, "loss": 0.5443, "step": 493800 }, { "epoch": 11.375149673022014, "grad_norm": 3.4504647254943848, "learning_rate": 1.3624729134247612e-05, "loss": 0.5645, "step": 494000 }, { "epoch": 11.379754996776274, "grad_norm": 2.8603811264038086, "learning_rate": 1.3617493928244209e-05, "loss": 0.5617, "step": 494200 }, { "epoch": 11.384360320530533, "grad_norm": 3.494835376739502, "learning_rate": 1.3610222364421693e-05, "loss": 0.5618, "step": 494400 }, { "epoch": 11.388965644284793, "grad_norm": 2.891660213470459, "learning_rate": 1.3602950800599177e-05, "loss": 0.5602, "step": 494600 }, { "epoch": 11.393570968039054, "grad_norm": 3.5556230545043945, "learning_rate": 1.3595679236776661e-05, "loss": 0.5636, "step": 494800 }, { "epoch": 11.398176291793312, "grad_norm": 2.6204235553741455, "learning_rate": 1.3588407672954145e-05, "loss": 0.5588, "step": 495000 }, { "epoch": 11.402781615547573, "grad_norm": 3.596064805984497, "learning_rate": 1.3581136109131631e-05, "loss": 0.5591, "step": 495200 }, { "epoch": 11.407386939301833, "grad_norm": 3.3406505584716797, "learning_rate": 1.3573864545309114e-05, "loss": 0.5631, "step": 495400 }, { "epoch": 11.411992263056092, "grad_norm": 3.357797145843506, "learning_rate": 1.3566592981486598e-05, "loss": 0.5515, "step": 495600 }, { "epoch": 11.416597586810353, "grad_norm": 3.499816656112671, "learning_rate": 1.3559321417664084e-05, "loss": 0.5618, "step": 495800 }, { "epoch": 11.421202910564613, "grad_norm": 2.6165995597839355, "learning_rate": 1.3552049853841568e-05, "loss": 0.5569, "step": 496000 }, { "epoch": 11.425808234318872, "grad_norm": 2.801710844039917, "learning_rate": 1.354477829001905e-05, "loss": 0.5481, "step": 496200 }, { "epoch": 11.430413558073132, "grad_norm": 3.2075862884521484, "learning_rate": 1.3537506726196537e-05, "loss": 0.5672, "step": 496400 }, { "epoch": 11.435018881827393, "grad_norm": 2.7603681087493896, "learning_rate": 1.3530235162374021e-05, "loss": 0.5631, "step": 496600 }, { "epoch": 11.439624205581651, "grad_norm": 2.9977965354919434, "learning_rate": 1.3522963598551505e-05, "loss": 0.5488, "step": 496800 }, { "epoch": 11.444229529335912, "grad_norm": 2.803744316101074, "learning_rate": 1.351569203472899e-05, "loss": 0.5578, "step": 497000 }, { "epoch": 11.448834853090172, "grad_norm": 3.423523187637329, "learning_rate": 1.3508420470906474e-05, "loss": 0.563, "step": 497200 }, { "epoch": 11.453440176844433, "grad_norm": 3.3576598167419434, "learning_rate": 1.3501148907083958e-05, "loss": 0.5555, "step": 497400 }, { "epoch": 11.458045500598692, "grad_norm": 3.5315699577331543, "learning_rate": 1.3493877343261442e-05, "loss": 0.564, "step": 497600 }, { "epoch": 11.462650824352952, "grad_norm": 2.96769642829895, "learning_rate": 1.3486605779438926e-05, "loss": 0.5566, "step": 497800 }, { "epoch": 11.467256148107213, "grad_norm": 2.768280267715454, "learning_rate": 1.347933421561641e-05, "loss": 0.5495, "step": 498000 }, { "epoch": 11.471861471861471, "grad_norm": 3.71108078956604, "learning_rate": 1.3472062651793895e-05, "loss": 0.561, "step": 498200 }, { "epoch": 11.476466795615732, "grad_norm": 3.0207583904266357, "learning_rate": 1.346479108797138e-05, "loss": 0.5592, "step": 498400 }, { "epoch": 11.481072119369992, "grad_norm": 3.7575089931488037, "learning_rate": 1.3457519524148863e-05, "loss": 0.556, "step": 498600 }, { "epoch": 11.485677443124251, "grad_norm": 3.050069570541382, "learning_rate": 1.3450247960326347e-05, "loss": 0.5618, "step": 498800 }, { "epoch": 11.490282766878511, "grad_norm": 3.386132001876831, "learning_rate": 1.3443012754322944e-05, "loss": 0.5526, "step": 499000 }, { "epoch": 11.494888090632772, "grad_norm": 2.874272584915161, "learning_rate": 1.343574119050043e-05, "loss": 0.561, "step": 499200 }, { "epoch": 11.49949341438703, "grad_norm": 2.706022262573242, "learning_rate": 1.3428469626677914e-05, "loss": 0.5654, "step": 499400 }, { "epoch": 11.504098738141291, "grad_norm": 3.2156574726104736, "learning_rate": 1.3421234420674511e-05, "loss": 0.561, "step": 499600 }, { "epoch": 11.508704061895552, "grad_norm": 2.3544604778289795, "learning_rate": 1.3413962856851994e-05, "loss": 0.5725, "step": 499800 }, { "epoch": 11.513309385649812, "grad_norm": 3.362711191177368, "learning_rate": 1.340669129302948e-05, "loss": 0.5669, "step": 500000 }, { "epoch": 11.51791470940407, "grad_norm": 3.2059872150421143, "learning_rate": 1.3399419729206964e-05, "loss": 0.5644, "step": 500200 }, { "epoch": 11.522520033158331, "grad_norm": 2.8236091136932373, "learning_rate": 1.3392148165384448e-05, "loss": 0.5625, "step": 500400 }, { "epoch": 11.527125356912592, "grad_norm": 3.4916694164276123, "learning_rate": 1.3384912959381045e-05, "loss": 0.5556, "step": 500600 }, { "epoch": 11.53173068066685, "grad_norm": 2.976130962371826, "learning_rate": 1.337764139555853e-05, "loss": 0.5611, "step": 500800 }, { "epoch": 11.536336004421111, "grad_norm": 3.8852763175964355, "learning_rate": 1.3370369831736013e-05, "loss": 0.5676, "step": 501000 }, { "epoch": 11.540941328175371, "grad_norm": 3.322741985321045, "learning_rate": 1.3363098267913498e-05, "loss": 0.5669, "step": 501200 }, { "epoch": 11.54554665192963, "grad_norm": 2.873216390609741, "learning_rate": 1.3355826704090982e-05, "loss": 0.5711, "step": 501400 }, { "epoch": 11.55015197568389, "grad_norm": 2.740243911743164, "learning_rate": 1.3348555140268466e-05, "loss": 0.5584, "step": 501600 }, { "epoch": 11.554757299438151, "grad_norm": 2.4768307209014893, "learning_rate": 1.334128357644595e-05, "loss": 0.5652, "step": 501800 }, { "epoch": 11.55936262319241, "grad_norm": 2.8483810424804688, "learning_rate": 1.3334012012623435e-05, "loss": 0.5571, "step": 502000 }, { "epoch": 11.56396794694667, "grad_norm": 2.515305280685425, "learning_rate": 1.332674044880092e-05, "loss": 0.5515, "step": 502200 }, { "epoch": 11.56857327070093, "grad_norm": 2.972041606903076, "learning_rate": 1.3319468884978403e-05, "loss": 0.5558, "step": 502400 }, { "epoch": 11.57317859445519, "grad_norm": 2.9375104904174805, "learning_rate": 1.3312197321155887e-05, "loss": 0.5514, "step": 502600 }, { "epoch": 11.57778391820945, "grad_norm": 2.7295291423797607, "learning_rate": 1.3304925757333373e-05, "loss": 0.5623, "step": 502800 }, { "epoch": 11.58238924196371, "grad_norm": 3.959212064743042, "learning_rate": 1.3297654193510857e-05, "loss": 0.5642, "step": 503000 }, { "epoch": 11.58699456571797, "grad_norm": 3.5699644088745117, "learning_rate": 1.329038262968834e-05, "loss": 0.553, "step": 503200 }, { "epoch": 11.59159988947223, "grad_norm": 2.7130260467529297, "learning_rate": 1.3283111065865826e-05, "loss": 0.5518, "step": 503400 }, { "epoch": 11.59620521322649, "grad_norm": 2.8701913356781006, "learning_rate": 1.327583950204331e-05, "loss": 0.5529, "step": 503600 }, { "epoch": 11.600810536980749, "grad_norm": 2.8965420722961426, "learning_rate": 1.3268567938220794e-05, "loss": 0.556, "step": 503800 }, { "epoch": 11.60541586073501, "grad_norm": 2.749356269836426, "learning_rate": 1.3261296374398278e-05, "loss": 0.5555, "step": 504000 }, { "epoch": 11.61002118448927, "grad_norm": 3.4139721393585205, "learning_rate": 1.3254024810575763e-05, "loss": 0.5527, "step": 504200 }, { "epoch": 11.61462650824353, "grad_norm": 2.8031933307647705, "learning_rate": 1.3246753246753247e-05, "loss": 0.5462, "step": 504400 }, { "epoch": 11.619231831997789, "grad_norm": 3.3376245498657227, "learning_rate": 1.3239481682930731e-05, "loss": 0.563, "step": 504600 }, { "epoch": 11.62383715575205, "grad_norm": 2.577552318572998, "learning_rate": 1.3232210119108217e-05, "loss": 0.551, "step": 504800 }, { "epoch": 11.62844247950631, "grad_norm": 2.8429970741271973, "learning_rate": 1.32249385552857e-05, "loss": 0.5543, "step": 505000 }, { "epoch": 11.633047803260569, "grad_norm": 2.562479019165039, "learning_rate": 1.3217666991463184e-05, "loss": 0.5515, "step": 505200 }, { "epoch": 11.63765312701483, "grad_norm": 3.0546891689300537, "learning_rate": 1.321039542764067e-05, "loss": 0.5623, "step": 505400 }, { "epoch": 11.64225845076909, "grad_norm": 3.4961397647857666, "learning_rate": 1.3203123863818152e-05, "loss": 0.552, "step": 505600 }, { "epoch": 11.646863774523348, "grad_norm": 2.9656572341918945, "learning_rate": 1.3195852299995636e-05, "loss": 0.5574, "step": 505800 }, { "epoch": 11.651469098277609, "grad_norm": 2.4916203022003174, "learning_rate": 1.3188617093992233e-05, "loss": 0.5648, "step": 506000 }, { "epoch": 11.65607442203187, "grad_norm": 3.366502046585083, "learning_rate": 1.318134553016972e-05, "loss": 0.5664, "step": 506200 }, { "epoch": 11.660679745786128, "grad_norm": 2.6675634384155273, "learning_rate": 1.3174073966347203e-05, "loss": 0.561, "step": 506400 }, { "epoch": 11.665285069540388, "grad_norm": 2.798330068588257, "learning_rate": 1.3166802402524688e-05, "loss": 0.5609, "step": 506600 }, { "epoch": 11.669890393294649, "grad_norm": 3.1210689544677734, "learning_rate": 1.3159530838702172e-05, "loss": 0.5702, "step": 506800 }, { "epoch": 11.67449571704891, "grad_norm": 4.939085006713867, "learning_rate": 1.3152259274879656e-05, "loss": 0.559, "step": 507000 }, { "epoch": 11.679101040803168, "grad_norm": 3.4388821125030518, "learning_rate": 1.314498771105714e-05, "loss": 0.5616, "step": 507200 }, { "epoch": 11.683706364557429, "grad_norm": 2.974487066268921, "learning_rate": 1.3137716147234625e-05, "loss": 0.5539, "step": 507400 }, { "epoch": 11.688311688311689, "grad_norm": 2.9296836853027344, "learning_rate": 1.3130444583412109e-05, "loss": 0.5657, "step": 507600 }, { "epoch": 11.692917012065948, "grad_norm": 3.680119276046753, "learning_rate": 1.3123173019589593e-05, "loss": 0.5624, "step": 507800 }, { "epoch": 11.697522335820208, "grad_norm": 2.4661529064178467, "learning_rate": 1.3115901455767077e-05, "loss": 0.5654, "step": 508000 }, { "epoch": 11.702127659574469, "grad_norm": 3.046780586242676, "learning_rate": 1.3108629891944563e-05, "loss": 0.5738, "step": 508200 }, { "epoch": 11.706732983328727, "grad_norm": 2.5584285259246826, "learning_rate": 1.3101358328122046e-05, "loss": 0.554, "step": 508400 }, { "epoch": 11.711338307082988, "grad_norm": 3.345762014389038, "learning_rate": 1.309408676429953e-05, "loss": 0.5675, "step": 508600 }, { "epoch": 11.715943630837248, "grad_norm": 2.661505937576294, "learning_rate": 1.3086815200477016e-05, "loss": 0.5649, "step": 508800 }, { "epoch": 11.720548954591507, "grad_norm": 3.2517101764678955, "learning_rate": 1.3079543636654498e-05, "loss": 0.5518, "step": 509000 }, { "epoch": 11.725154278345768, "grad_norm": 2.780080556869507, "learning_rate": 1.3072272072831983e-05, "loss": 0.5524, "step": 509200 }, { "epoch": 11.729759602100028, "grad_norm": 3.5682613849639893, "learning_rate": 1.3065000509009468e-05, "loss": 0.5642, "step": 509400 }, { "epoch": 11.734364925854287, "grad_norm": 2.8819103240966797, "learning_rate": 1.3057728945186953e-05, "loss": 0.5553, "step": 509600 }, { "epoch": 11.738970249608547, "grad_norm": 3.0837063789367676, "learning_rate": 1.3050457381364437e-05, "loss": 0.5659, "step": 509800 }, { "epoch": 11.743575573362808, "grad_norm": 2.94468355178833, "learning_rate": 1.3043185817541921e-05, "loss": 0.5447, "step": 510000 }, { "epoch": 11.748180897117066, "grad_norm": 2.4969544410705566, "learning_rate": 1.3035914253719405e-05, "loss": 0.5503, "step": 510200 }, { "epoch": 11.752786220871327, "grad_norm": 2.9021387100219727, "learning_rate": 1.302864268989689e-05, "loss": 0.5498, "step": 510400 }, { "epoch": 11.757391544625587, "grad_norm": 3.434443473815918, "learning_rate": 1.3021371126074374e-05, "loss": 0.5611, "step": 510600 }, { "epoch": 11.761996868379848, "grad_norm": 2.887711763381958, "learning_rate": 1.3014099562251858e-05, "loss": 0.5348, "step": 510800 }, { "epoch": 11.766602192134107, "grad_norm": 3.3442883491516113, "learning_rate": 1.3006827998429342e-05, "loss": 0.5537, "step": 511000 }, { "epoch": 11.771207515888367, "grad_norm": 2.8919873237609863, "learning_rate": 1.2999592792425939e-05, "loss": 0.5731, "step": 511200 }, { "epoch": 11.775812839642628, "grad_norm": 3.013439178466797, "learning_rate": 1.2992321228603423e-05, "loss": 0.5496, "step": 511400 }, { "epoch": 11.780418163396886, "grad_norm": 2.499289035797119, "learning_rate": 1.298508602260002e-05, "loss": 0.5678, "step": 511600 }, { "epoch": 11.785023487151147, "grad_norm": 3.30153226852417, "learning_rate": 1.2977814458777506e-05, "loss": 0.557, "step": 511800 }, { "epoch": 11.789628810905407, "grad_norm": 3.1825666427612305, "learning_rate": 1.2970542894954989e-05, "loss": 0.5488, "step": 512000 }, { "epoch": 11.794234134659666, "grad_norm": 3.229902744293213, "learning_rate": 1.2963271331132473e-05, "loss": 0.567, "step": 512200 }, { "epoch": 11.798839458413926, "grad_norm": 2.68745493888855, "learning_rate": 1.2955999767309959e-05, "loss": 0.5674, "step": 512400 }, { "epoch": 11.803444782168187, "grad_norm": 3.270493507385254, "learning_rate": 1.2948728203487441e-05, "loss": 0.5487, "step": 512600 }, { "epoch": 11.808050105922446, "grad_norm": 3.161329507827759, "learning_rate": 1.2941456639664927e-05, "loss": 0.5625, "step": 512800 }, { "epoch": 11.812655429676706, "grad_norm": 2.8435397148132324, "learning_rate": 1.2934185075842411e-05, "loss": 0.5453, "step": 513000 }, { "epoch": 11.817260753430967, "grad_norm": 2.739443778991699, "learning_rate": 1.2926913512019896e-05, "loss": 0.5546, "step": 513200 }, { "epoch": 11.821866077185225, "grad_norm": 2.649695634841919, "learning_rate": 1.291964194819738e-05, "loss": 0.5519, "step": 513400 }, { "epoch": 11.826471400939486, "grad_norm": 3.265084981918335, "learning_rate": 1.2912370384374864e-05, "loss": 0.5485, "step": 513600 }, { "epoch": 11.831076724693746, "grad_norm": 3.4886581897735596, "learning_rate": 1.2905098820552348e-05, "loss": 0.5591, "step": 513800 }, { "epoch": 11.835682048448007, "grad_norm": 3.9588351249694824, "learning_rate": 1.2897827256729833e-05, "loss": 0.5473, "step": 514000 }, { "epoch": 11.840287372202265, "grad_norm": 3.0174105167388916, "learning_rate": 1.2890555692907317e-05, "loss": 0.5637, "step": 514200 }, { "epoch": 11.844892695956526, "grad_norm": 2.9916231632232666, "learning_rate": 1.2883284129084803e-05, "loss": 0.5631, "step": 514400 }, { "epoch": 11.849498019710786, "grad_norm": 3.1581456661224365, "learning_rate": 1.2876012565262285e-05, "loss": 0.5553, "step": 514600 }, { "epoch": 11.854103343465045, "grad_norm": 2.6253857612609863, "learning_rate": 1.286874100143977e-05, "loss": 0.5628, "step": 514800 }, { "epoch": 11.858708667219306, "grad_norm": 3.4651215076446533, "learning_rate": 1.2861469437617255e-05, "loss": 0.555, "step": 515000 }, { "epoch": 11.863313990973566, "grad_norm": 5.004696369171143, "learning_rate": 1.2854197873794738e-05, "loss": 0.5577, "step": 515200 }, { "epoch": 11.867919314727825, "grad_norm": 3.6759226322174072, "learning_rate": 1.2846926309972222e-05, "loss": 0.5608, "step": 515400 }, { "epoch": 11.872524638482085, "grad_norm": 4.682463645935059, "learning_rate": 1.2839654746149708e-05, "loss": 0.5639, "step": 515600 }, { "epoch": 11.877129962236346, "grad_norm": 3.1118075847625732, "learning_rate": 1.2832419540146305e-05, "loss": 0.5593, "step": 515800 }, { "epoch": 11.881735285990604, "grad_norm": 3.632077693939209, "learning_rate": 1.2825147976323787e-05, "loss": 0.5623, "step": 516000 }, { "epoch": 11.886340609744865, "grad_norm": 3.2938430309295654, "learning_rate": 1.2817876412501273e-05, "loss": 0.5567, "step": 516200 }, { "epoch": 11.890945933499125, "grad_norm": 3.3671703338623047, "learning_rate": 1.2810604848678758e-05, "loss": 0.558, "step": 516400 }, { "epoch": 11.895551257253384, "grad_norm": 2.796440839767456, "learning_rate": 1.2803333284856242e-05, "loss": 0.5565, "step": 516600 }, { "epoch": 11.900156581007645, "grad_norm": 4.450211524963379, "learning_rate": 1.2796061721033726e-05, "loss": 0.551, "step": 516800 }, { "epoch": 11.904761904761905, "grad_norm": 2.8839023113250732, "learning_rate": 1.278879015721121e-05, "loss": 0.5542, "step": 517000 }, { "epoch": 11.909367228516164, "grad_norm": 3.0486433506011963, "learning_rate": 1.2781518593388694e-05, "loss": 0.5539, "step": 517200 }, { "epoch": 11.913972552270424, "grad_norm": 3.802643299102783, "learning_rate": 1.2774247029566179e-05, "loss": 0.552, "step": 517400 }, { "epoch": 11.918577876024685, "grad_norm": 3.0999131202697754, "learning_rate": 1.2766975465743663e-05, "loss": 0.548, "step": 517600 }, { "epoch": 11.923183199778945, "grad_norm": 2.016890525817871, "learning_rate": 1.2759703901921149e-05, "loss": 0.5554, "step": 517800 }, { "epoch": 11.927788523533204, "grad_norm": 3.3377721309661865, "learning_rate": 1.2752432338098631e-05, "loss": 0.5529, "step": 518000 }, { "epoch": 11.932393847287464, "grad_norm": 3.55546498298645, "learning_rate": 1.2745197132095228e-05, "loss": 0.5649, "step": 518200 }, { "epoch": 11.936999171041725, "grad_norm": 2.818472146987915, "learning_rate": 1.2737961926091825e-05, "loss": 0.5533, "step": 518400 }, { "epoch": 11.941604494795984, "grad_norm": 3.24600887298584, "learning_rate": 1.273069036226931e-05, "loss": 0.5611, "step": 518600 }, { "epoch": 11.946209818550244, "grad_norm": 2.4776716232299805, "learning_rate": 1.2723418798446795e-05, "loss": 0.5607, "step": 518800 }, { "epoch": 11.950815142304505, "grad_norm": 3.1430325508117676, "learning_rate": 1.2716147234624278e-05, "loss": 0.5527, "step": 519000 }, { "epoch": 11.955420466058763, "grad_norm": 3.204774856567383, "learning_rate": 1.2708875670801762e-05, "loss": 0.5664, "step": 519200 }, { "epoch": 11.960025789813024, "grad_norm": 4.431719779968262, "learning_rate": 1.2701604106979248e-05, "loss": 0.5497, "step": 519400 }, { "epoch": 11.964631113567284, "grad_norm": 3.592142343521118, "learning_rate": 1.269433254315673e-05, "loss": 0.5578, "step": 519600 }, { "epoch": 11.969236437321543, "grad_norm": 3.99924373626709, "learning_rate": 1.2687060979334216e-05, "loss": 0.5628, "step": 519800 }, { "epoch": 11.973841761075803, "grad_norm": 3.2276337146759033, "learning_rate": 1.26797894155117e-05, "loss": 0.5689, "step": 520000 }, { "epoch": 11.978447084830064, "grad_norm": 3.4805285930633545, "learning_rate": 1.2672517851689185e-05, "loss": 0.5545, "step": 520200 }, { "epoch": 11.983052408584324, "grad_norm": 3.0665435791015625, "learning_rate": 1.2665246287866669e-05, "loss": 0.5564, "step": 520400 }, { "epoch": 11.987657732338583, "grad_norm": 3.2864396572113037, "learning_rate": 1.2657974724044153e-05, "loss": 0.5689, "step": 520600 }, { "epoch": 11.992263056092844, "grad_norm": 4.101680278778076, "learning_rate": 1.2650703160221637e-05, "loss": 0.5586, "step": 520800 }, { "epoch": 11.996868379847104, "grad_norm": 3.1322414875030518, "learning_rate": 1.2643431596399122e-05, "loss": 0.5665, "step": 521000 }, { "epoch": 12.0, "eval_loss": 0.5357550382614136, "eval_runtime": 168.6634, "eval_samples_per_second": 168.152, "eval_steps_per_second": 10.512, "step": 521136 }, { "epoch": 12.001473703601363, "grad_norm": 3.8854141235351562, "learning_rate": 1.2636160032576606e-05, "loss": 0.5461, "step": 521200 }, { "epoch": 12.006079027355623, "grad_norm": 3.1888136863708496, "learning_rate": 1.2628888468754092e-05, "loss": 0.5459, "step": 521400 }, { "epoch": 12.010684351109884, "grad_norm": 2.910918951034546, "learning_rate": 1.2621653262750689e-05, "loss": 0.5549, "step": 521600 }, { "epoch": 12.015289674864142, "grad_norm": 3.348071813583374, "learning_rate": 1.2614381698928171e-05, "loss": 0.5604, "step": 521800 }, { "epoch": 12.019894998618403, "grad_norm": 2.9339218139648438, "learning_rate": 1.2607110135105655e-05, "loss": 0.5526, "step": 522000 }, { "epoch": 12.024500322372663, "grad_norm": 2.8683624267578125, "learning_rate": 1.2599838571283141e-05, "loss": 0.5467, "step": 522200 }, { "epoch": 12.029105646126922, "grad_norm": 3.5013625621795654, "learning_rate": 1.2592567007460624e-05, "loss": 0.5531, "step": 522400 }, { "epoch": 12.033710969881183, "grad_norm": 2.941629409790039, "learning_rate": 1.258529544363811e-05, "loss": 0.553, "step": 522600 }, { "epoch": 12.038316293635443, "grad_norm": 4.11761999130249, "learning_rate": 1.2578023879815594e-05, "loss": 0.5439, "step": 522800 }, { "epoch": 12.042921617389702, "grad_norm": 2.471980094909668, "learning_rate": 1.2570752315993077e-05, "loss": 0.5569, "step": 523000 }, { "epoch": 12.047526941143962, "grad_norm": 3.2188100814819336, "learning_rate": 1.2563480752170562e-05, "loss": 0.5494, "step": 523200 }, { "epoch": 12.052132264898223, "grad_norm": 3.172586679458618, "learning_rate": 1.2556209188348047e-05, "loss": 0.5461, "step": 523400 }, { "epoch": 12.056737588652481, "grad_norm": 2.9670534133911133, "learning_rate": 1.254893762452553e-05, "loss": 0.5479, "step": 523600 }, { "epoch": 12.061342912406742, "grad_norm": 2.948775053024292, "learning_rate": 1.2541666060703015e-05, "loss": 0.5428, "step": 523800 }, { "epoch": 12.065948236161002, "grad_norm": 2.3510968685150146, "learning_rate": 1.25343944968805e-05, "loss": 0.5526, "step": 524000 }, { "epoch": 12.070553559915263, "grad_norm": 4.567888259887695, "learning_rate": 1.2527122933057985e-05, "loss": 0.5531, "step": 524200 }, { "epoch": 12.075158883669522, "grad_norm": 3.3408639430999756, "learning_rate": 1.2519851369235468e-05, "loss": 0.545, "step": 524400 }, { "epoch": 12.079764207423782, "grad_norm": 3.8344717025756836, "learning_rate": 1.2512579805412952e-05, "loss": 0.5609, "step": 524600 }, { "epoch": 12.084369531178043, "grad_norm": 2.317721366882324, "learning_rate": 1.2505308241590438e-05, "loss": 0.5464, "step": 524800 }, { "epoch": 12.088974854932301, "grad_norm": 2.980634927749634, "learning_rate": 1.249803667776792e-05, "loss": 0.5394, "step": 525000 }, { "epoch": 12.093580178686562, "grad_norm": 3.3348686695098877, "learning_rate": 1.2490765113945405e-05, "loss": 0.5551, "step": 525200 }, { "epoch": 12.098185502440822, "grad_norm": 3.0347678661346436, "learning_rate": 1.248349355012289e-05, "loss": 0.551, "step": 525400 }, { "epoch": 12.102790826195081, "grad_norm": 3.286358118057251, "learning_rate": 1.2476221986300373e-05, "loss": 0.5652, "step": 525600 }, { "epoch": 12.107396149949341, "grad_norm": 3.339553117752075, "learning_rate": 1.2468950422477859e-05, "loss": 0.5548, "step": 525800 }, { "epoch": 12.112001473703602, "grad_norm": 2.7706658840179443, "learning_rate": 1.2461715216474456e-05, "loss": 0.5553, "step": 526000 }, { "epoch": 12.11660679745786, "grad_norm": 2.956782817840576, "learning_rate": 1.245444365265194e-05, "loss": 0.5499, "step": 526200 }, { "epoch": 12.121212121212121, "grad_norm": 3.1089818477630615, "learning_rate": 1.2447172088829423e-05, "loss": 0.5438, "step": 526400 }, { "epoch": 12.125817444966382, "grad_norm": 2.3799941539764404, "learning_rate": 1.2439900525006909e-05, "loss": 0.5455, "step": 526600 }, { "epoch": 12.13042276872064, "grad_norm": 4.066889762878418, "learning_rate": 1.2432628961184393e-05, "loss": 0.554, "step": 526800 }, { "epoch": 12.1350280924749, "grad_norm": 3.080200672149658, "learning_rate": 1.2425357397361875e-05, "loss": 0.5558, "step": 527000 }, { "epoch": 12.139633416229161, "grad_norm": 2.6959545612335205, "learning_rate": 1.2418085833539361e-05, "loss": 0.5523, "step": 527200 }, { "epoch": 12.14423873998342, "grad_norm": 2.727076292037964, "learning_rate": 1.2410814269716845e-05, "loss": 0.5532, "step": 527400 }, { "epoch": 12.14884406373768, "grad_norm": 4.091317653656006, "learning_rate": 1.2403579063713442e-05, "loss": 0.5506, "step": 527600 }, { "epoch": 12.153449387491941, "grad_norm": 3.140193462371826, "learning_rate": 1.2396307499890928e-05, "loss": 0.5585, "step": 527800 }, { "epoch": 12.158054711246201, "grad_norm": 3.9469101428985596, "learning_rate": 1.238903593606841e-05, "loss": 0.5526, "step": 528000 }, { "epoch": 12.16266003500046, "grad_norm": 3.1913647651672363, "learning_rate": 1.2381764372245895e-05, "loss": 0.5434, "step": 528200 }, { "epoch": 12.16726535875472, "grad_norm": 2.5850043296813965, "learning_rate": 1.2374492808423381e-05, "loss": 0.5574, "step": 528400 }, { "epoch": 12.171870682508981, "grad_norm": 2.807478904724121, "learning_rate": 1.2367221244600863e-05, "loss": 0.5513, "step": 528600 }, { "epoch": 12.17647600626324, "grad_norm": 3.168511390686035, "learning_rate": 1.2359949680778348e-05, "loss": 0.5539, "step": 528800 }, { "epoch": 12.1810813300175, "grad_norm": 3.547654867172241, "learning_rate": 1.2352678116955834e-05, "loss": 0.5485, "step": 529000 }, { "epoch": 12.18568665377176, "grad_norm": 2.912477493286133, "learning_rate": 1.2345406553133316e-05, "loss": 0.5397, "step": 529200 }, { "epoch": 12.19029197752602, "grad_norm": 3.6767055988311768, "learning_rate": 1.2338134989310802e-05, "loss": 0.5581, "step": 529400 }, { "epoch": 12.19489730128028, "grad_norm": 3.366831064224243, "learning_rate": 1.2330863425488286e-05, "loss": 0.5556, "step": 529600 }, { "epoch": 12.19950262503454, "grad_norm": 3.2023255825042725, "learning_rate": 1.2323628219484883e-05, "loss": 0.5535, "step": 529800 }, { "epoch": 12.2041079487888, "grad_norm": 2.8660030364990234, "learning_rate": 1.2316356655662366e-05, "loss": 0.5565, "step": 530000 }, { "epoch": 12.20871327254306, "grad_norm": 2.7351365089416504, "learning_rate": 1.2309085091839852e-05, "loss": 0.5524, "step": 530200 }, { "epoch": 12.21331859629732, "grad_norm": 3.0572710037231445, "learning_rate": 1.2301813528017336e-05, "loss": 0.5538, "step": 530400 }, { "epoch": 12.217923920051579, "grad_norm": 3.4272561073303223, "learning_rate": 1.229454196419482e-05, "loss": 0.5479, "step": 530600 }, { "epoch": 12.22252924380584, "grad_norm": 2.7951786518096924, "learning_rate": 1.2287270400372304e-05, "loss": 0.5439, "step": 530800 }, { "epoch": 12.2271345675601, "grad_norm": 3.222205638885498, "learning_rate": 1.2279998836549789e-05, "loss": 0.5448, "step": 531000 }, { "epoch": 12.23173989131436, "grad_norm": 2.7026097774505615, "learning_rate": 1.2272727272727274e-05, "loss": 0.5442, "step": 531200 }, { "epoch": 12.236345215068619, "grad_norm": 3.955737590789795, "learning_rate": 1.2265455708904757e-05, "loss": 0.5558, "step": 531400 }, { "epoch": 12.24095053882288, "grad_norm": 3.2188639640808105, "learning_rate": 1.2258184145082241e-05, "loss": 0.5434, "step": 531600 }, { "epoch": 12.24555586257714, "grad_norm": 2.410057783126831, "learning_rate": 1.2250912581259727e-05, "loss": 0.5655, "step": 531800 }, { "epoch": 12.250161186331399, "grad_norm": 2.9907710552215576, "learning_rate": 1.224364101743721e-05, "loss": 0.5488, "step": 532000 }, { "epoch": 12.254766510085659, "grad_norm": 3.5246376991271973, "learning_rate": 1.2236369453614696e-05, "loss": 0.5616, "step": 532200 }, { "epoch": 12.25937183383992, "grad_norm": 3.297806978225708, "learning_rate": 1.222909788979218e-05, "loss": 0.5688, "step": 532400 }, { "epoch": 12.263977157594178, "grad_norm": 2.848536729812622, "learning_rate": 1.2221826325969662e-05, "loss": 0.5417, "step": 532600 }, { "epoch": 12.268582481348439, "grad_norm": 2.740184783935547, "learning_rate": 1.2214554762147148e-05, "loss": 0.558, "step": 532800 }, { "epoch": 12.2731878051027, "grad_norm": 3.5397231578826904, "learning_rate": 1.2207283198324632e-05, "loss": 0.5495, "step": 533000 }, { "epoch": 12.277793128856958, "grad_norm": 2.4408740997314453, "learning_rate": 1.220004799232123e-05, "loss": 0.5511, "step": 533200 }, { "epoch": 12.282398452611218, "grad_norm": 3.476637363433838, "learning_rate": 1.2192776428498712e-05, "loss": 0.5498, "step": 533400 }, { "epoch": 12.287003776365479, "grad_norm": 3.668010711669922, "learning_rate": 1.2185504864676198e-05, "loss": 0.549, "step": 533600 }, { "epoch": 12.291609100119738, "grad_norm": 3.334040641784668, "learning_rate": 1.2178233300853682e-05, "loss": 0.5352, "step": 533800 }, { "epoch": 12.296214423873998, "grad_norm": 2.8002822399139404, "learning_rate": 1.2170961737031166e-05, "loss": 0.5501, "step": 534000 }, { "epoch": 12.300819747628259, "grad_norm": 2.5833489894866943, "learning_rate": 1.216369017320865e-05, "loss": 0.5483, "step": 534200 }, { "epoch": 12.305425071382519, "grad_norm": 3.02528977394104, "learning_rate": 1.2156418609386135e-05, "loss": 0.5439, "step": 534400 }, { "epoch": 12.310030395136778, "grad_norm": 3.139838933944702, "learning_rate": 1.214914704556362e-05, "loss": 0.5504, "step": 534600 }, { "epoch": 12.314635718891038, "grad_norm": 2.3408186435699463, "learning_rate": 1.2141875481741103e-05, "loss": 0.5431, "step": 534800 }, { "epoch": 12.319241042645299, "grad_norm": 3.896301746368408, "learning_rate": 1.2134603917918587e-05, "loss": 0.5497, "step": 535000 }, { "epoch": 12.323846366399557, "grad_norm": 3.0924932956695557, "learning_rate": 1.2127332354096073e-05, "loss": 0.5636, "step": 535200 }, { "epoch": 12.328451690153818, "grad_norm": 3.347902297973633, "learning_rate": 1.2120060790273556e-05, "loss": 0.5617, "step": 535400 }, { "epoch": 12.333057013908078, "grad_norm": 3.351632833480835, "learning_rate": 1.2112789226451042e-05, "loss": 0.5495, "step": 535600 }, { "epoch": 12.337662337662337, "grad_norm": 3.4726133346557617, "learning_rate": 1.2105517662628526e-05, "loss": 0.563, "step": 535800 }, { "epoch": 12.342267661416598, "grad_norm": 3.684715509414673, "learning_rate": 1.2098246098806008e-05, "loss": 0.548, "step": 536000 }, { "epoch": 12.346872985170858, "grad_norm": 2.476365327835083, "learning_rate": 1.2091010892802605e-05, "loss": 0.5503, "step": 536200 }, { "epoch": 12.351478308925117, "grad_norm": 3.3055906295776367, "learning_rate": 1.2083739328980091e-05, "loss": 0.5412, "step": 536400 }, { "epoch": 12.356083632679377, "grad_norm": 3.4834065437316895, "learning_rate": 1.2076467765157575e-05, "loss": 0.5416, "step": 536600 }, { "epoch": 12.360688956433638, "grad_norm": 3.004270553588867, "learning_rate": 1.2069196201335058e-05, "loss": 0.5557, "step": 536800 }, { "epoch": 12.365294280187896, "grad_norm": 2.7978525161743164, "learning_rate": 1.2061924637512544e-05, "loss": 0.5507, "step": 537000 }, { "epoch": 12.369899603942157, "grad_norm": 3.2909445762634277, "learning_rate": 1.2054653073690028e-05, "loss": 0.5573, "step": 537200 }, { "epoch": 12.374504927696417, "grad_norm": 3.068385124206543, "learning_rate": 1.2047381509867512e-05, "loss": 0.561, "step": 537400 }, { "epoch": 12.379110251450676, "grad_norm": 3.6887574195861816, "learning_rate": 1.2040109946044997e-05, "loss": 0.5581, "step": 537600 }, { "epoch": 12.383715575204937, "grad_norm": 3.1257615089416504, "learning_rate": 1.203283838222248e-05, "loss": 0.5471, "step": 537800 }, { "epoch": 12.388320898959197, "grad_norm": 2.885606527328491, "learning_rate": 1.2025566818399967e-05, "loss": 0.558, "step": 538000 }, { "epoch": 12.392926222713458, "grad_norm": 4.5534138679504395, "learning_rate": 1.201829525457745e-05, "loss": 0.5434, "step": 538200 }, { "epoch": 12.397531546467716, "grad_norm": 4.204901695251465, "learning_rate": 1.2011023690754933e-05, "loss": 0.5512, "step": 538400 }, { "epoch": 12.402136870221977, "grad_norm": 3.7075207233428955, "learning_rate": 1.200375212693242e-05, "loss": 0.5446, "step": 538600 }, { "epoch": 12.406742193976237, "grad_norm": 2.795241117477417, "learning_rate": 1.1996480563109902e-05, "loss": 0.5551, "step": 538800 }, { "epoch": 12.411347517730496, "grad_norm": 3.3998024463653564, "learning_rate": 1.1989208999287388e-05, "loss": 0.5498, "step": 539000 }, { "epoch": 12.415952841484756, "grad_norm": 2.8881683349609375, "learning_rate": 1.1981937435464872e-05, "loss": 0.5478, "step": 539200 }, { "epoch": 12.420558165239017, "grad_norm": 3.143094062805176, "learning_rate": 1.1974665871642354e-05, "loss": 0.5445, "step": 539400 }, { "epoch": 12.425163488993276, "grad_norm": 3.2156903743743896, "learning_rate": 1.196739430781984e-05, "loss": 0.5572, "step": 539600 }, { "epoch": 12.429768812747536, "grad_norm": 2.0139708518981934, "learning_rate": 1.1960122743997325e-05, "loss": 0.5578, "step": 539800 }, { "epoch": 12.434374136501797, "grad_norm": 2.9531643390655518, "learning_rate": 1.1952851180174809e-05, "loss": 0.5555, "step": 540000 }, { "epoch": 12.438979460256055, "grad_norm": 2.786808490753174, "learning_rate": 1.1945579616352293e-05, "loss": 0.5339, "step": 540200 }, { "epoch": 12.443584784010316, "grad_norm": 3.243690013885498, "learning_rate": 1.1938308052529777e-05, "loss": 0.5459, "step": 540400 }, { "epoch": 12.448190107764576, "grad_norm": 3.303994655609131, "learning_rate": 1.1931036488707262e-05, "loss": 0.5634, "step": 540600 }, { "epoch": 12.452795431518835, "grad_norm": 4.073940277099609, "learning_rate": 1.1923764924884746e-05, "loss": 0.5491, "step": 540800 }, { "epoch": 12.457400755273095, "grad_norm": 2.980377435684204, "learning_rate": 1.191649336106223e-05, "loss": 0.5501, "step": 541000 }, { "epoch": 12.462006079027356, "grad_norm": 3.772876501083374, "learning_rate": 1.1909221797239716e-05, "loss": 0.5487, "step": 541200 }, { "epoch": 12.466611402781616, "grad_norm": 2.804576873779297, "learning_rate": 1.1901950233417198e-05, "loss": 0.5479, "step": 541400 }, { "epoch": 12.471216726535875, "grad_norm": 3.1938371658325195, "learning_rate": 1.1894678669594684e-05, "loss": 0.5541, "step": 541600 }, { "epoch": 12.475822050290136, "grad_norm": 2.5504708290100098, "learning_rate": 1.1887407105772169e-05, "loss": 0.5465, "step": 541800 }, { "epoch": 12.480427374044396, "grad_norm": 2.7534172534942627, "learning_rate": 1.1880135541949651e-05, "loss": 0.5542, "step": 542000 }, { "epoch": 12.485032697798655, "grad_norm": 3.2549421787261963, "learning_rate": 1.1872863978127137e-05, "loss": 0.5521, "step": 542200 }, { "epoch": 12.489638021552915, "grad_norm": 3.0159831047058105, "learning_rate": 1.1865592414304621e-05, "loss": 0.5503, "step": 542400 }, { "epoch": 12.494243345307176, "grad_norm": 2.7797956466674805, "learning_rate": 1.1858357208301218e-05, "loss": 0.5564, "step": 542600 }, { "epoch": 12.498848669061434, "grad_norm": 3.071063995361328, "learning_rate": 1.18510856444787e-05, "loss": 0.5411, "step": 542800 }, { "epoch": 12.503453992815695, "grad_norm": 4.068755626678467, "learning_rate": 1.1843814080656187e-05, "loss": 0.5572, "step": 543000 }, { "epoch": 12.508059316569955, "grad_norm": 3.225297689437866, "learning_rate": 1.183654251683367e-05, "loss": 0.5508, "step": 543200 }, { "epoch": 12.512664640324214, "grad_norm": 2.821578025817871, "learning_rate": 1.1829307310830268e-05, "loss": 0.5474, "step": 543400 }, { "epoch": 12.517269964078475, "grad_norm": 5.399388313293457, "learning_rate": 1.1822035747007752e-05, "loss": 0.5504, "step": 543600 }, { "epoch": 12.521875287832735, "grad_norm": 3.5580897331237793, "learning_rate": 1.1814764183185236e-05, "loss": 0.5641, "step": 543800 }, { "epoch": 12.526480611586994, "grad_norm": 3.240264892578125, "learning_rate": 1.180749261936272e-05, "loss": 0.5511, "step": 544000 }, { "epoch": 12.531085935341254, "grad_norm": 3.2633004188537598, "learning_rate": 1.1800221055540205e-05, "loss": 0.5504, "step": 544200 }, { "epoch": 12.535691259095515, "grad_norm": 3.4117608070373535, "learning_rate": 1.1792949491717689e-05, "loss": 0.5429, "step": 544400 }, { "epoch": 12.540296582849773, "grad_norm": 3.1878907680511475, "learning_rate": 1.1785677927895173e-05, "loss": 0.5544, "step": 544600 }, { "epoch": 12.544901906604034, "grad_norm": 3.49251389503479, "learning_rate": 1.1778406364072659e-05, "loss": 0.5402, "step": 544800 }, { "epoch": 12.549507230358294, "grad_norm": 3.136068820953369, "learning_rate": 1.1771134800250141e-05, "loss": 0.5617, "step": 545000 }, { "epoch": 12.554112554112555, "grad_norm": 3.5267179012298584, "learning_rate": 1.1763863236427627e-05, "loss": 0.5583, "step": 545200 }, { "epoch": 12.558717877866814, "grad_norm": 2.639244794845581, "learning_rate": 1.1756591672605112e-05, "loss": 0.5556, "step": 545400 }, { "epoch": 12.563323201621074, "grad_norm": 2.4130160808563232, "learning_rate": 1.1749356466601708e-05, "loss": 0.5378, "step": 545600 }, { "epoch": 12.567928525375335, "grad_norm": 3.0723562240600586, "learning_rate": 1.1742084902779191e-05, "loss": 0.5493, "step": 545800 }, { "epoch": 12.572533849129593, "grad_norm": 3.0826873779296875, "learning_rate": 1.1734813338956677e-05, "loss": 0.5478, "step": 546000 }, { "epoch": 12.577139172883854, "grad_norm": 3.6943721771240234, "learning_rate": 1.1727541775134161e-05, "loss": 0.552, "step": 546200 }, { "epoch": 12.581744496638114, "grad_norm": 3.1983044147491455, "learning_rate": 1.1720270211311644e-05, "loss": 0.5533, "step": 546400 }, { "epoch": 12.586349820392373, "grad_norm": 2.8163883686065674, "learning_rate": 1.171299864748913e-05, "loss": 0.5551, "step": 546600 }, { "epoch": 12.590955144146633, "grad_norm": 3.1819756031036377, "learning_rate": 1.1705727083666614e-05, "loss": 0.5637, "step": 546800 }, { "epoch": 12.595560467900894, "grad_norm": 3.5395870208740234, "learning_rate": 1.1698455519844098e-05, "loss": 0.5517, "step": 547000 }, { "epoch": 12.600165791655153, "grad_norm": 2.773688793182373, "learning_rate": 1.1691183956021582e-05, "loss": 0.5692, "step": 547200 }, { "epoch": 12.604771115409413, "grad_norm": 3.0060763359069824, "learning_rate": 1.1683912392199066e-05, "loss": 0.5571, "step": 547400 }, { "epoch": 12.609376439163674, "grad_norm": 3.25728702545166, "learning_rate": 1.167664082837655e-05, "loss": 0.5608, "step": 547600 }, { "epoch": 12.613981762917934, "grad_norm": 3.356100559234619, "learning_rate": 1.1669369264554035e-05, "loss": 0.5368, "step": 547800 }, { "epoch": 12.618587086672193, "grad_norm": 3.625145435333252, "learning_rate": 1.166209770073152e-05, "loss": 0.538, "step": 548000 }, { "epoch": 12.623192410426453, "grad_norm": 2.9204094409942627, "learning_rate": 1.1654826136909003e-05, "loss": 0.546, "step": 548200 }, { "epoch": 12.627797734180714, "grad_norm": 3.6132349967956543, "learning_rate": 1.1647590930905602e-05, "loss": 0.5592, "step": 548400 }, { "epoch": 12.632403057934972, "grad_norm": 3.495785713195801, "learning_rate": 1.1640319367083084e-05, "loss": 0.539, "step": 548600 }, { "epoch": 12.637008381689233, "grad_norm": 3.632181167602539, "learning_rate": 1.163304780326057e-05, "loss": 0.5363, "step": 548800 }, { "epoch": 12.641613705443493, "grad_norm": 3.3912816047668457, "learning_rate": 1.1625776239438055e-05, "loss": 0.554, "step": 549000 }, { "epoch": 12.646219029197752, "grad_norm": 3.4610326290130615, "learning_rate": 1.1618504675615537e-05, "loss": 0.5537, "step": 549200 }, { "epoch": 12.650824352952013, "grad_norm": 3.4539434909820557, "learning_rate": 1.1611233111793023e-05, "loss": 0.5621, "step": 549400 }, { "epoch": 12.655429676706273, "grad_norm": 3.00600528717041, "learning_rate": 1.160399790578962e-05, "loss": 0.5558, "step": 549600 }, { "epoch": 12.660035000460532, "grad_norm": 3.060939311981201, "learning_rate": 1.1596726341967104e-05, "loss": 0.561, "step": 549800 }, { "epoch": 12.664640324214792, "grad_norm": 3.2902770042419434, "learning_rate": 1.1589454778144588e-05, "loss": 0.5428, "step": 550000 }, { "epoch": 12.669245647969053, "grad_norm": 3.1374452114105225, "learning_rate": 1.1582183214322073e-05, "loss": 0.5523, "step": 550200 }, { "epoch": 12.673850971723311, "grad_norm": 2.6760544776916504, "learning_rate": 1.1574911650499557e-05, "loss": 0.555, "step": 550400 }, { "epoch": 12.678456295477572, "grad_norm": 3.863650321960449, "learning_rate": 1.1567640086677041e-05, "loss": 0.5525, "step": 550600 }, { "epoch": 12.683061619231832, "grad_norm": 2.675981044769287, "learning_rate": 1.1560368522854525e-05, "loss": 0.5486, "step": 550800 }, { "epoch": 12.687666942986091, "grad_norm": 3.3591291904449463, "learning_rate": 1.155309695903201e-05, "loss": 0.5461, "step": 551000 }, { "epoch": 12.692272266740352, "grad_norm": 2.852849006652832, "learning_rate": 1.1545825395209494e-05, "loss": 0.5527, "step": 551200 }, { "epoch": 12.696877590494612, "grad_norm": 2.850482940673828, "learning_rate": 1.1538553831386978e-05, "loss": 0.5474, "step": 551400 }, { "epoch": 12.70148291424887, "grad_norm": 3.2086989879608154, "learning_rate": 1.1531282267564464e-05, "loss": 0.549, "step": 551600 }, { "epoch": 12.706088238003131, "grad_norm": 3.074154853820801, "learning_rate": 1.152404706156106e-05, "loss": 0.559, "step": 551800 }, { "epoch": 12.710693561757392, "grad_norm": 3.1534781455993652, "learning_rate": 1.1516775497738545e-05, "loss": 0.5647, "step": 552000 }, { "epoch": 12.715298885511652, "grad_norm": 3.174724817276001, "learning_rate": 1.1509503933916027e-05, "loss": 0.5527, "step": 552200 }, { "epoch": 12.719904209265911, "grad_norm": 2.7721035480499268, "learning_rate": 1.1502232370093513e-05, "loss": 0.5488, "step": 552400 }, { "epoch": 12.724509533020171, "grad_norm": 3.360368251800537, "learning_rate": 1.1494960806270998e-05, "loss": 0.5643, "step": 552600 }, { "epoch": 12.729114856774432, "grad_norm": 3.283878803253174, "learning_rate": 1.148768924244848e-05, "loss": 0.5432, "step": 552800 }, { "epoch": 12.73372018052869, "grad_norm": 3.110537052154541, "learning_rate": 1.1480417678625966e-05, "loss": 0.555, "step": 553000 }, { "epoch": 12.738325504282951, "grad_norm": 4.010375499725342, "learning_rate": 1.147314611480345e-05, "loss": 0.5433, "step": 553200 }, { "epoch": 12.742930828037212, "grad_norm": 3.154646873474121, "learning_rate": 1.1465874550980934e-05, "loss": 0.5519, "step": 553400 }, { "epoch": 12.74753615179147, "grad_norm": 3.806518077850342, "learning_rate": 1.1458602987158419e-05, "loss": 0.5567, "step": 553600 }, { "epoch": 12.75214147554573, "grad_norm": 3.772310972213745, "learning_rate": 1.1451331423335903e-05, "loss": 0.5504, "step": 553800 }, { "epoch": 12.756746799299991, "grad_norm": 3.2461366653442383, "learning_rate": 1.1444059859513387e-05, "loss": 0.5575, "step": 554000 }, { "epoch": 12.76135212305425, "grad_norm": 3.1284291744232178, "learning_rate": 1.1436788295690871e-05, "loss": 0.5541, "step": 554200 }, { "epoch": 12.76595744680851, "grad_norm": 3.069737434387207, "learning_rate": 1.1429516731868356e-05, "loss": 0.5529, "step": 554400 }, { "epoch": 12.770562770562771, "grad_norm": 3.285787582397461, "learning_rate": 1.142224516804584e-05, "loss": 0.5531, "step": 554600 }, { "epoch": 12.775168094317031, "grad_norm": 2.772050142288208, "learning_rate": 1.1415009962042437e-05, "loss": 0.5438, "step": 554800 }, { "epoch": 12.77977341807129, "grad_norm": 2.2192680835723877, "learning_rate": 1.1407738398219921e-05, "loss": 0.5319, "step": 555000 }, { "epoch": 12.78437874182555, "grad_norm": 3.2888576984405518, "learning_rate": 1.1400466834397407e-05, "loss": 0.5462, "step": 555200 }, { "epoch": 12.788984065579811, "grad_norm": 3.075023889541626, "learning_rate": 1.139319527057489e-05, "loss": 0.5449, "step": 555400 }, { "epoch": 12.79358938933407, "grad_norm": 3.2394826412200928, "learning_rate": 1.1385923706752374e-05, "loss": 0.5461, "step": 555600 }, { "epoch": 12.79819471308833, "grad_norm": 3.1171531677246094, "learning_rate": 1.137865214292986e-05, "loss": 0.5433, "step": 555800 }, { "epoch": 12.80280003684259, "grad_norm": 2.5984578132629395, "learning_rate": 1.1371380579107344e-05, "loss": 0.5531, "step": 556000 }, { "epoch": 12.80740536059685, "grad_norm": 4.881665229797363, "learning_rate": 1.1364109015284826e-05, "loss": 0.5432, "step": 556200 }, { "epoch": 12.81201068435111, "grad_norm": 2.8773512840270996, "learning_rate": 1.1356837451462312e-05, "loss": 0.5667, "step": 556400 }, { "epoch": 12.81661600810537, "grad_norm": 3.3617618083953857, "learning_rate": 1.1349565887639796e-05, "loss": 0.5393, "step": 556600 }, { "epoch": 12.821221331859629, "grad_norm": 2.5510413646698, "learning_rate": 1.134229432381728e-05, "loss": 0.5449, "step": 556800 }, { "epoch": 12.82582665561389, "grad_norm": 3.1804873943328857, "learning_rate": 1.1335022759994765e-05, "loss": 0.5515, "step": 557000 }, { "epoch": 12.83043197936815, "grad_norm": 3.242882490158081, "learning_rate": 1.1327751196172249e-05, "loss": 0.5713, "step": 557200 }, { "epoch": 12.835037303122409, "grad_norm": 2.9455573558807373, "learning_rate": 1.1320479632349733e-05, "loss": 0.5588, "step": 557400 }, { "epoch": 12.83964262687667, "grad_norm": 3.158345937728882, "learning_rate": 1.131324442634633e-05, "loss": 0.5454, "step": 557600 }, { "epoch": 12.84424795063093, "grad_norm": 3.215794563293457, "learning_rate": 1.1305972862523814e-05, "loss": 0.543, "step": 557800 }, { "epoch": 12.848853274385188, "grad_norm": 3.158463954925537, "learning_rate": 1.12987012987013e-05, "loss": 0.5543, "step": 558000 }, { "epoch": 12.853458598139449, "grad_norm": 2.367264747619629, "learning_rate": 1.1291429734878783e-05, "loss": 0.5444, "step": 558200 }, { "epoch": 12.85806392189371, "grad_norm": 2.9616682529449463, "learning_rate": 1.1284158171056267e-05, "loss": 0.5505, "step": 558400 }, { "epoch": 12.86266924564797, "grad_norm": 4.787532329559326, "learning_rate": 1.1276886607233753e-05, "loss": 0.542, "step": 558600 }, { "epoch": 12.867274569402229, "grad_norm": 2.9759278297424316, "learning_rate": 1.1269615043411235e-05, "loss": 0.5531, "step": 558800 }, { "epoch": 12.871879893156489, "grad_norm": 3.0211524963378906, "learning_rate": 1.126234347958872e-05, "loss": 0.5526, "step": 559000 }, { "epoch": 12.87648521691075, "grad_norm": 3.0512332916259766, "learning_rate": 1.1255071915766206e-05, "loss": 0.5651, "step": 559200 }, { "epoch": 12.881090540665008, "grad_norm": 3.1640422344207764, "learning_rate": 1.124780035194369e-05, "loss": 0.5473, "step": 559400 }, { "epoch": 12.885695864419269, "grad_norm": 2.9496026039123535, "learning_rate": 1.1240528788121174e-05, "loss": 0.5571, "step": 559600 }, { "epoch": 12.89030118817353, "grad_norm": 2.9672327041625977, "learning_rate": 1.1233293582117771e-05, "loss": 0.5536, "step": 559800 }, { "epoch": 12.894906511927788, "grad_norm": 3.0200912952423096, "learning_rate": 1.1226022018295255e-05, "loss": 0.5478, "step": 560000 }, { "epoch": 12.899511835682048, "grad_norm": 3.494211196899414, "learning_rate": 1.121875045447274e-05, "loss": 0.5472, "step": 560200 }, { "epoch": 12.904117159436309, "grad_norm": 3.695265293121338, "learning_rate": 1.1211478890650224e-05, "loss": 0.5525, "step": 560400 }, { "epoch": 12.908722483190568, "grad_norm": 2.6711409091949463, "learning_rate": 1.1204207326827708e-05, "loss": 0.5473, "step": 560600 }, { "epoch": 12.913327806944828, "grad_norm": 3.254509925842285, "learning_rate": 1.1196935763005192e-05, "loss": 0.5604, "step": 560800 }, { "epoch": 12.917933130699089, "grad_norm": 3.699026107788086, "learning_rate": 1.1189664199182676e-05, "loss": 0.5427, "step": 561000 }, { "epoch": 12.922538454453347, "grad_norm": 3.1557929515838623, "learning_rate": 1.118239263536016e-05, "loss": 0.5558, "step": 561200 }, { "epoch": 12.927143778207608, "grad_norm": 3.200070381164551, "learning_rate": 1.1175121071537646e-05, "loss": 0.5462, "step": 561400 }, { "epoch": 12.931749101961868, "grad_norm": 2.912868022918701, "learning_rate": 1.1167849507715129e-05, "loss": 0.5445, "step": 561600 }, { "epoch": 12.936354425716129, "grad_norm": 3.3618550300598145, "learning_rate": 1.1160577943892613e-05, "loss": 0.5478, "step": 561800 }, { "epoch": 12.940959749470387, "grad_norm": 2.8849663734436035, "learning_rate": 1.1153306380070099e-05, "loss": 0.5437, "step": 562000 }, { "epoch": 12.945565073224648, "grad_norm": 2.9502649307250977, "learning_rate": 1.1146034816247582e-05, "loss": 0.5529, "step": 562200 }, { "epoch": 12.950170396978908, "grad_norm": 3.4335813522338867, "learning_rate": 1.1138763252425066e-05, "loss": 0.5609, "step": 562400 }, { "epoch": 12.954775720733167, "grad_norm": 2.878819227218628, "learning_rate": 1.1131491688602552e-05, "loss": 0.5495, "step": 562600 }, { "epoch": 12.959381044487428, "grad_norm": 3.1835694313049316, "learning_rate": 1.1124220124780036e-05, "loss": 0.5511, "step": 562800 }, { "epoch": 12.963986368241688, "grad_norm": 3.6071760654449463, "learning_rate": 1.111694856095752e-05, "loss": 0.5501, "step": 563000 }, { "epoch": 12.968591691995947, "grad_norm": 2.51938533782959, "learning_rate": 1.1109676997135004e-05, "loss": 0.5579, "step": 563200 }, { "epoch": 12.973197015750207, "grad_norm": 3.228456497192383, "learning_rate": 1.1102405433312489e-05, "loss": 0.5607, "step": 563400 }, { "epoch": 12.977802339504468, "grad_norm": 3.919980764389038, "learning_rate": 1.1095133869489973e-05, "loss": 0.5575, "step": 563600 }, { "epoch": 12.982407663258726, "grad_norm": 3.511781930923462, "learning_rate": 1.1087862305667457e-05, "loss": 0.5527, "step": 563800 }, { "epoch": 12.987012987012987, "grad_norm": 3.6616077423095703, "learning_rate": 1.1080590741844941e-05, "loss": 0.5486, "step": 564000 }, { "epoch": 12.991618310767247, "grad_norm": 3.4475338459014893, "learning_rate": 1.1073355535841538e-05, "loss": 0.5548, "step": 564200 }, { "epoch": 12.996223634521506, "grad_norm": 3.6094648838043213, "learning_rate": 1.1066083972019022e-05, "loss": 0.5447, "step": 564400 }, { "epoch": 13.0, "eval_loss": 0.535080075263977, "eval_runtime": 174.1646, "eval_samples_per_second": 162.84, "eval_steps_per_second": 10.18, "step": 564564 }, { "epoch": 13.000828958275767, "grad_norm": 3.2227447032928467, "learning_rate": 1.1058812408196507e-05, "loss": 0.5408, "step": 564600 }, { "epoch": 13.005434282030027, "grad_norm": 2.8370840549468994, "learning_rate": 1.1051540844373992e-05, "loss": 0.5382, "step": 564800 }, { "epoch": 13.010039605784286, "grad_norm": 3.57401442527771, "learning_rate": 1.1044269280551475e-05, "loss": 0.5435, "step": 565000 }, { "epoch": 13.014644929538546, "grad_norm": 3.077897310256958, "learning_rate": 1.1037034074548072e-05, "loss": 0.5478, "step": 565200 }, { "epoch": 13.019250253292807, "grad_norm": 3.1777496337890625, "learning_rate": 1.1029762510725556e-05, "loss": 0.5281, "step": 565400 }, { "epoch": 13.023855577047067, "grad_norm": 3.8741352558135986, "learning_rate": 1.1022490946903042e-05, "loss": 0.5499, "step": 565600 }, { "epoch": 13.028460900801326, "grad_norm": 2.772387742996216, "learning_rate": 1.1015219383080525e-05, "loss": 0.5341, "step": 565800 }, { "epoch": 13.033066224555586, "grad_norm": 2.9487922191619873, "learning_rate": 1.1007947819258009e-05, "loss": 0.5334, "step": 566000 }, { "epoch": 13.037671548309847, "grad_norm": 2.795191764831543, "learning_rate": 1.1000676255435495e-05, "loss": 0.55, "step": 566200 }, { "epoch": 13.042276872064106, "grad_norm": 2.9274868965148926, "learning_rate": 1.0993404691612979e-05, "loss": 0.5449, "step": 566400 }, { "epoch": 13.046882195818366, "grad_norm": 2.8535702228546143, "learning_rate": 1.0986133127790463e-05, "loss": 0.5465, "step": 566600 }, { "epoch": 13.051487519572627, "grad_norm": 2.9377224445343018, "learning_rate": 1.0978861563967947e-05, "loss": 0.5542, "step": 566800 }, { "epoch": 13.056092843326885, "grad_norm": 3.308588743209839, "learning_rate": 1.0971590000145432e-05, "loss": 0.5451, "step": 567000 }, { "epoch": 13.060698167081146, "grad_norm": 3.0085132122039795, "learning_rate": 1.0964354794142028e-05, "loss": 0.5511, "step": 567200 }, { "epoch": 13.065303490835406, "grad_norm": 2.9090094566345215, "learning_rate": 1.0957083230319513e-05, "loss": 0.5345, "step": 567400 }, { "epoch": 13.069908814589665, "grad_norm": 2.8537399768829346, "learning_rate": 1.0949811666496997e-05, "loss": 0.5553, "step": 567600 }, { "epoch": 13.074514138343925, "grad_norm": 2.471475124359131, "learning_rate": 1.0942540102674483e-05, "loss": 0.5526, "step": 567800 }, { "epoch": 13.079119462098186, "grad_norm": 3.00661301612854, "learning_rate": 1.0935268538851965e-05, "loss": 0.5402, "step": 568000 }, { "epoch": 13.083724785852445, "grad_norm": 3.291968822479248, "learning_rate": 1.092799697502945e-05, "loss": 0.5389, "step": 568200 }, { "epoch": 13.088330109606705, "grad_norm": 5.072547912597656, "learning_rate": 1.0920725411206936e-05, "loss": 0.5422, "step": 568400 }, { "epoch": 13.092935433360966, "grad_norm": 2.9961328506469727, "learning_rate": 1.0913453847384418e-05, "loss": 0.5345, "step": 568600 }, { "epoch": 13.097540757115226, "grad_norm": 2.3409829139709473, "learning_rate": 1.0906182283561902e-05, "loss": 0.5516, "step": 568800 }, { "epoch": 13.102146080869485, "grad_norm": 2.485668897628784, "learning_rate": 1.0898910719739388e-05, "loss": 0.563, "step": 569000 }, { "epoch": 13.106751404623745, "grad_norm": 2.9506261348724365, "learning_rate": 1.089163915591687e-05, "loss": 0.5452, "step": 569200 }, { "epoch": 13.111356728378006, "grad_norm": 3.2650866508483887, "learning_rate": 1.0884367592094357e-05, "loss": 0.547, "step": 569400 }, { "epoch": 13.115962052132264, "grad_norm": 4.075747489929199, "learning_rate": 1.087709602827184e-05, "loss": 0.5352, "step": 569600 }, { "epoch": 13.120567375886525, "grad_norm": 3.905189275741577, "learning_rate": 1.0869824464449325e-05, "loss": 0.5458, "step": 569800 }, { "epoch": 13.125172699640785, "grad_norm": 3.0449321269989014, "learning_rate": 1.086255290062681e-05, "loss": 0.5476, "step": 570000 }, { "epoch": 13.129778023395044, "grad_norm": 5.308364391326904, "learning_rate": 1.0855281336804293e-05, "loss": 0.5482, "step": 570200 }, { "epoch": 13.134383347149305, "grad_norm": 3.4296975135803223, "learning_rate": 1.0848009772981778e-05, "loss": 0.5381, "step": 570400 }, { "epoch": 13.138988670903565, "grad_norm": 3.1632046699523926, "learning_rate": 1.0840738209159262e-05, "loss": 0.5482, "step": 570600 }, { "epoch": 13.143593994657824, "grad_norm": 2.9227309226989746, "learning_rate": 1.0833466645336746e-05, "loss": 0.5479, "step": 570800 }, { "epoch": 13.148199318412084, "grad_norm": 2.676102638244629, "learning_rate": 1.0826195081514232e-05, "loss": 0.5539, "step": 571000 }, { "epoch": 13.152804642166345, "grad_norm": 3.266191244125366, "learning_rate": 1.0818923517691715e-05, "loss": 0.5409, "step": 571200 }, { "epoch": 13.157409965920603, "grad_norm": 2.1656315326690674, "learning_rate": 1.0811651953869199e-05, "loss": 0.5465, "step": 571400 }, { "epoch": 13.162015289674864, "grad_norm": 2.710066318511963, "learning_rate": 1.0804380390046685e-05, "loss": 0.5401, "step": 571600 }, { "epoch": 13.166620613429124, "grad_norm": 2.9499149322509766, "learning_rate": 1.0797108826224167e-05, "loss": 0.5369, "step": 571800 }, { "epoch": 13.171225937183385, "grad_norm": 3.0565083026885986, "learning_rate": 1.0789837262401651e-05, "loss": 0.5431, "step": 572000 }, { "epoch": 13.175831260937644, "grad_norm": 2.4222073554992676, "learning_rate": 1.0782602056398248e-05, "loss": 0.5354, "step": 572200 }, { "epoch": 13.180436584691904, "grad_norm": 3.2243409156799316, "learning_rate": 1.0775330492575734e-05, "loss": 0.5388, "step": 572400 }, { "epoch": 13.185041908446165, "grad_norm": 3.4836127758026123, "learning_rate": 1.0768058928753217e-05, "loss": 0.5449, "step": 572600 }, { "epoch": 13.189647232200423, "grad_norm": 2.60473370552063, "learning_rate": 1.0760787364930703e-05, "loss": 0.5426, "step": 572800 }, { "epoch": 13.194252555954684, "grad_norm": 2.716947317123413, "learning_rate": 1.0753515801108187e-05, "loss": 0.5478, "step": 573000 }, { "epoch": 13.198857879708944, "grad_norm": 3.650529623031616, "learning_rate": 1.0746244237285671e-05, "loss": 0.5549, "step": 573200 }, { "epoch": 13.203463203463203, "grad_norm": 2.8756980895996094, "learning_rate": 1.0738972673463155e-05, "loss": 0.5316, "step": 573400 }, { "epoch": 13.208068527217463, "grad_norm": 2.6897635459899902, "learning_rate": 1.0731737467459752e-05, "loss": 0.5562, "step": 573600 }, { "epoch": 13.212673850971724, "grad_norm": 3.219444751739502, "learning_rate": 1.0724465903637237e-05, "loss": 0.5417, "step": 573800 }, { "epoch": 13.217279174725983, "grad_norm": 3.117840051651001, "learning_rate": 1.071719433981472e-05, "loss": 0.5398, "step": 574000 }, { "epoch": 13.221884498480243, "grad_norm": 4.030891418457031, "learning_rate": 1.0709922775992205e-05, "loss": 0.5512, "step": 574200 }, { "epoch": 13.226489822234504, "grad_norm": 3.657562494277954, "learning_rate": 1.070265121216969e-05, "loss": 0.5485, "step": 574400 }, { "epoch": 13.231095145988762, "grad_norm": 2.986097574234009, "learning_rate": 1.0695379648347175e-05, "loss": 0.5443, "step": 574600 }, { "epoch": 13.235700469743023, "grad_norm": 3.578052043914795, "learning_rate": 1.0688108084524658e-05, "loss": 0.5537, "step": 574800 }, { "epoch": 13.240305793497283, "grad_norm": 3.427138328552246, "learning_rate": 1.0680836520702142e-05, "loss": 0.5402, "step": 575000 }, { "epoch": 13.244911117251542, "grad_norm": 3.066641330718994, "learning_rate": 1.0673564956879628e-05, "loss": 0.5465, "step": 575200 }, { "epoch": 13.249516441005802, "grad_norm": 2.710906505584717, "learning_rate": 1.066629339305711e-05, "loss": 0.5569, "step": 575400 }, { "epoch": 13.254121764760063, "grad_norm": 2.721404790878296, "learning_rate": 1.0659021829234596e-05, "loss": 0.5504, "step": 575600 }, { "epoch": 13.258727088514323, "grad_norm": 4.378937244415283, "learning_rate": 1.065175026541208e-05, "loss": 0.5419, "step": 575800 }, { "epoch": 13.263332412268582, "grad_norm": 2.9746248722076416, "learning_rate": 1.0644478701589563e-05, "loss": 0.5399, "step": 576000 }, { "epoch": 13.267937736022843, "grad_norm": 3.093886613845825, "learning_rate": 1.0637207137767049e-05, "loss": 0.5506, "step": 576200 }, { "epoch": 13.272543059777103, "grad_norm": 3.421165943145752, "learning_rate": 1.0629935573944533e-05, "loss": 0.5569, "step": 576400 }, { "epoch": 13.277148383531362, "grad_norm": 2.879319906234741, "learning_rate": 1.0622664010122017e-05, "loss": 0.5379, "step": 576600 }, { "epoch": 13.281753707285622, "grad_norm": 4.0503363609313965, "learning_rate": 1.0615392446299501e-05, "loss": 0.5518, "step": 576800 }, { "epoch": 13.286359031039883, "grad_norm": 3.7506306171417236, "learning_rate": 1.0608120882476986e-05, "loss": 0.5454, "step": 577000 }, { "epoch": 13.290964354794141, "grad_norm": 3.531977891921997, "learning_rate": 1.0600849318654472e-05, "loss": 0.5499, "step": 577200 }, { "epoch": 13.295569678548402, "grad_norm": 3.0157346725463867, "learning_rate": 1.0593614112651069e-05, "loss": 0.5529, "step": 577400 }, { "epoch": 13.300175002302662, "grad_norm": 2.793652057647705, "learning_rate": 1.0586342548828551e-05, "loss": 0.5458, "step": 577600 }, { "epoch": 13.304780326056921, "grad_norm": 2.5756351947784424, "learning_rate": 1.0579070985006035e-05, "loss": 0.5509, "step": 577800 }, { "epoch": 13.309385649811182, "grad_norm": 2.810288906097412, "learning_rate": 1.0571799421183521e-05, "loss": 0.5426, "step": 578000 }, { "epoch": 13.313990973565442, "grad_norm": 2.495398759841919, "learning_rate": 1.0564527857361004e-05, "loss": 0.5526, "step": 578200 }, { "epoch": 13.3185962973197, "grad_norm": 4.127913475036621, "learning_rate": 1.0557256293538488e-05, "loss": 0.5498, "step": 578400 }, { "epoch": 13.323201621073961, "grad_norm": 3.074737071990967, "learning_rate": 1.0549984729715974e-05, "loss": 0.543, "step": 578600 }, { "epoch": 13.327806944828222, "grad_norm": 2.928070306777954, "learning_rate": 1.0542713165893456e-05, "loss": 0.5449, "step": 578800 }, { "epoch": 13.332412268582482, "grad_norm": 3.301532745361328, "learning_rate": 1.0535441602070942e-05, "loss": 0.5471, "step": 579000 }, { "epoch": 13.337017592336741, "grad_norm": 3.355616331100464, "learning_rate": 1.0528170038248427e-05, "loss": 0.5447, "step": 579200 }, { "epoch": 13.341622916091001, "grad_norm": 2.8991310596466064, "learning_rate": 1.0520898474425909e-05, "loss": 0.5405, "step": 579400 }, { "epoch": 13.346228239845262, "grad_norm": 3.3360657691955566, "learning_rate": 1.0513626910603395e-05, "loss": 0.5585, "step": 579600 }, { "epoch": 13.35083356359952, "grad_norm": 4.818231105804443, "learning_rate": 1.050635534678088e-05, "loss": 0.5512, "step": 579800 }, { "epoch": 13.355438887353781, "grad_norm": 4.0637078285217285, "learning_rate": 1.0499083782958362e-05, "loss": 0.5499, "step": 580000 }, { "epoch": 13.360044211108042, "grad_norm": 4.462865352630615, "learning_rate": 1.0491812219135848e-05, "loss": 0.5425, "step": 580200 }, { "epoch": 13.3646495348623, "grad_norm": 2.5351686477661133, "learning_rate": 1.0484540655313332e-05, "loss": 0.5363, "step": 580400 }, { "epoch": 13.36925485861656, "grad_norm": 3.533302068710327, "learning_rate": 1.0477269091490818e-05, "loss": 0.5424, "step": 580600 }, { "epoch": 13.373860182370821, "grad_norm": 3.593005418777466, "learning_rate": 1.04699975276683e-05, "loss": 0.5491, "step": 580800 }, { "epoch": 13.37846550612508, "grad_norm": 3.0207858085632324, "learning_rate": 1.0462725963845784e-05, "loss": 0.5451, "step": 581000 }, { "epoch": 13.38307082987934, "grad_norm": 3.600573778152466, "learning_rate": 1.045545440002327e-05, "loss": 0.5409, "step": 581200 }, { "epoch": 13.3876761536336, "grad_norm": 3.4477622509002686, "learning_rate": 1.0448182836200753e-05, "loss": 0.5515, "step": 581400 }, { "epoch": 13.39228147738786, "grad_norm": 4.029882431030273, "learning_rate": 1.044094763019735e-05, "loss": 0.557, "step": 581600 }, { "epoch": 13.39688680114212, "grad_norm": 3.970107316970825, "learning_rate": 1.0433676066374834e-05, "loss": 0.545, "step": 581800 }, { "epoch": 13.40149212489638, "grad_norm": 3.111429214477539, "learning_rate": 1.042640450255232e-05, "loss": 0.5443, "step": 582000 }, { "epoch": 13.406097448650641, "grad_norm": 3.304234027862549, "learning_rate": 1.0419132938729802e-05, "loss": 0.5425, "step": 582200 }, { "epoch": 13.4107027724049, "grad_norm": 3.118661642074585, "learning_rate": 1.0411861374907288e-05, "loss": 0.5426, "step": 582400 }, { "epoch": 13.41530809615916, "grad_norm": 3.274920701980591, "learning_rate": 1.0404589811084773e-05, "loss": 0.5526, "step": 582600 }, { "epoch": 13.41991341991342, "grad_norm": 2.8108534812927246, "learning_rate": 1.0397318247262255e-05, "loss": 0.5496, "step": 582800 }, { "epoch": 13.42451874366768, "grad_norm": 3.452916383743286, "learning_rate": 1.0390046683439741e-05, "loss": 0.5546, "step": 583000 }, { "epoch": 13.42912406742194, "grad_norm": 2.889641284942627, "learning_rate": 1.0382775119617225e-05, "loss": 0.5399, "step": 583200 }, { "epoch": 13.4337293911762, "grad_norm": 2.9781227111816406, "learning_rate": 1.037550355579471e-05, "loss": 0.5442, "step": 583400 }, { "epoch": 13.438334714930459, "grad_norm": 2.3209664821624756, "learning_rate": 1.0368231991972194e-05, "loss": 0.5563, "step": 583600 }, { "epoch": 13.44294003868472, "grad_norm": 3.272317409515381, "learning_rate": 1.036099678596879e-05, "loss": 0.5424, "step": 583800 }, { "epoch": 13.44754536243898, "grad_norm": 3.984229803085327, "learning_rate": 1.0353725222146275e-05, "loss": 0.5377, "step": 584000 }, { "epoch": 13.452150686193239, "grad_norm": 2.723288059234619, "learning_rate": 1.034645365832376e-05, "loss": 0.5353, "step": 584200 }, { "epoch": 13.4567560099475, "grad_norm": 2.800933361053467, "learning_rate": 1.0339218452320358e-05, "loss": 0.5375, "step": 584400 }, { "epoch": 13.46136133370176, "grad_norm": 8.197662353515625, "learning_rate": 1.033194688849784e-05, "loss": 0.5404, "step": 584600 }, { "epoch": 13.465966657456018, "grad_norm": 2.963289737701416, "learning_rate": 1.0324675324675324e-05, "loss": 0.5413, "step": 584800 }, { "epoch": 13.470571981210279, "grad_norm": 3.9458508491516113, "learning_rate": 1.031740376085281e-05, "loss": 0.5431, "step": 585000 }, { "epoch": 13.47517730496454, "grad_norm": 2.880087375640869, "learning_rate": 1.0310132197030293e-05, "loss": 0.5559, "step": 585200 }, { "epoch": 13.479782628718798, "grad_norm": 2.817814826965332, "learning_rate": 1.0302860633207779e-05, "loss": 0.5403, "step": 585400 }, { "epoch": 13.484387952473059, "grad_norm": 3.2126080989837646, "learning_rate": 1.0295589069385263e-05, "loss": 0.536, "step": 585600 }, { "epoch": 13.488993276227319, "grad_norm": 2.991262674331665, "learning_rate": 1.0288317505562746e-05, "loss": 0.5428, "step": 585800 }, { "epoch": 13.49359859998158, "grad_norm": 2.65507435798645, "learning_rate": 1.0281045941740231e-05, "loss": 0.5384, "step": 586000 }, { "epoch": 13.498203923735838, "grad_norm": 3.388256788253784, "learning_rate": 1.0273774377917716e-05, "loss": 0.5431, "step": 586200 }, { "epoch": 13.502809247490099, "grad_norm": 3.2963063716888428, "learning_rate": 1.0266502814095198e-05, "loss": 0.5506, "step": 586400 }, { "epoch": 13.50741457124436, "grad_norm": 3.947801351547241, "learning_rate": 1.0259231250272684e-05, "loss": 0.5419, "step": 586600 }, { "epoch": 13.512019894998618, "grad_norm": 3.5557472705841064, "learning_rate": 1.0251959686450168e-05, "loss": 0.5538, "step": 586800 }, { "epoch": 13.516625218752878, "grad_norm": 3.8408093452453613, "learning_rate": 1.0244688122627653e-05, "loss": 0.5394, "step": 587000 }, { "epoch": 13.521230542507139, "grad_norm": 3.790393352508545, "learning_rate": 1.0237416558805137e-05, "loss": 0.5567, "step": 587200 }, { "epoch": 13.525835866261398, "grad_norm": 2.450249195098877, "learning_rate": 1.0230144994982621e-05, "loss": 0.5475, "step": 587400 }, { "epoch": 13.530441190015658, "grad_norm": 3.272376537322998, "learning_rate": 1.0222873431160107e-05, "loss": 0.5481, "step": 587600 }, { "epoch": 13.535046513769919, "grad_norm": 3.2694225311279297, "learning_rate": 1.021560186733759e-05, "loss": 0.5467, "step": 587800 }, { "epoch": 13.539651837524177, "grad_norm": 2.739555835723877, "learning_rate": 1.0208330303515074e-05, "loss": 0.541, "step": 588000 }, { "epoch": 13.544257161278438, "grad_norm": 3.357004404067993, "learning_rate": 1.020105873969256e-05, "loss": 0.5549, "step": 588200 }, { "epoch": 13.548862485032698, "grad_norm": 3.012406349182129, "learning_rate": 1.0193823533689156e-05, "loss": 0.5503, "step": 588400 }, { "epoch": 13.553467808786957, "grad_norm": 2.7790162563323975, "learning_rate": 1.0186551969866639e-05, "loss": 0.5447, "step": 588600 }, { "epoch": 13.558073132541217, "grad_norm": 3.0401885509490967, "learning_rate": 1.0179280406044125e-05, "loss": 0.5352, "step": 588800 }, { "epoch": 13.562678456295478, "grad_norm": 3.2460315227508545, "learning_rate": 1.0172008842221609e-05, "loss": 0.5387, "step": 589000 }, { "epoch": 13.567283780049738, "grad_norm": 3.198957681655884, "learning_rate": 1.0164737278399092e-05, "loss": 0.5492, "step": 589200 }, { "epoch": 13.571889103803997, "grad_norm": 3.1791601181030273, "learning_rate": 1.0157465714576578e-05, "loss": 0.5488, "step": 589400 }, { "epoch": 13.576494427558258, "grad_norm": 3.927855968475342, "learning_rate": 1.0150194150754062e-05, "loss": 0.5459, "step": 589600 }, { "epoch": 13.581099751312518, "grad_norm": 3.2088983058929443, "learning_rate": 1.0142922586931544e-05, "loss": 0.5425, "step": 589800 }, { "epoch": 13.585705075066777, "grad_norm": 3.1823244094848633, "learning_rate": 1.013565102310903e-05, "loss": 0.5521, "step": 590000 }, { "epoch": 13.590310398821037, "grad_norm": 3.0992443561553955, "learning_rate": 1.0128379459286514e-05, "loss": 0.5455, "step": 590200 }, { "epoch": 13.594915722575298, "grad_norm": 3.4294822216033936, "learning_rate": 1.0121107895463999e-05, "loss": 0.54, "step": 590400 }, { "epoch": 13.599521046329556, "grad_norm": 3.575033187866211, "learning_rate": 1.0113836331641483e-05, "loss": 0.553, "step": 590600 }, { "epoch": 13.604126370083817, "grad_norm": 2.823061466217041, "learning_rate": 1.0106564767818967e-05, "loss": 0.5422, "step": 590800 }, { "epoch": 13.608731693838077, "grad_norm": 3.334573268890381, "learning_rate": 1.0099293203996453e-05, "loss": 0.5412, "step": 591000 }, { "epoch": 13.613337017592336, "grad_norm": 3.7110838890075684, "learning_rate": 1.0092021640173936e-05, "loss": 0.5439, "step": 591200 }, { "epoch": 13.617942341346597, "grad_norm": 2.9446306228637695, "learning_rate": 1.008475007635142e-05, "loss": 0.551, "step": 591400 }, { "epoch": 13.622547665100857, "grad_norm": 3.2592220306396484, "learning_rate": 1.0077478512528906e-05, "loss": 0.546, "step": 591600 }, { "epoch": 13.627152988855116, "grad_norm": 2.9190750122070312, "learning_rate": 1.0070206948706388e-05, "loss": 0.5515, "step": 591800 }, { "epoch": 13.631758312609376, "grad_norm": 3.803069829940796, "learning_rate": 1.0062935384883874e-05, "loss": 0.5438, "step": 592000 }, { "epoch": 13.636363636363637, "grad_norm": 3.252239942550659, "learning_rate": 1.0055663821061358e-05, "loss": 0.547, "step": 592200 }, { "epoch": 13.640968960117895, "grad_norm": 3.070131778717041, "learning_rate": 1.004839225723884e-05, "loss": 0.5504, "step": 592400 }, { "epoch": 13.645574283872156, "grad_norm": 3.4407927989959717, "learning_rate": 1.0041120693416327e-05, "loss": 0.5491, "step": 592600 }, { "epoch": 13.650179607626416, "grad_norm": 3.2702760696411133, "learning_rate": 1.0033849129593811e-05, "loss": 0.5437, "step": 592800 }, { "epoch": 13.654784931380677, "grad_norm": 3.0138094425201416, "learning_rate": 1.0026577565771295e-05, "loss": 0.5406, "step": 593000 }, { "epoch": 13.659390255134936, "grad_norm": 3.4657397270202637, "learning_rate": 1.0019342359767892e-05, "loss": 0.5364, "step": 593200 }, { "epoch": 13.663995578889196, "grad_norm": 2.7210536003112793, "learning_rate": 1.0012070795945376e-05, "loss": 0.5348, "step": 593400 }, { "epoch": 13.668600902643457, "grad_norm": 2.796694278717041, "learning_rate": 1.000479923212286e-05, "loss": 0.5356, "step": 593600 }, { "epoch": 13.673206226397715, "grad_norm": 3.0048344135284424, "learning_rate": 9.997527668300345e-06, "loss": 0.547, "step": 593800 }, { "epoch": 13.677811550151976, "grad_norm": 3.452331781387329, "learning_rate": 9.990292462296942e-06, "loss": 0.5483, "step": 594000 }, { "epoch": 13.682416873906236, "grad_norm": 2.8550798892974854, "learning_rate": 9.983020898474426e-06, "loss": 0.5466, "step": 594200 }, { "epoch": 13.687022197660495, "grad_norm": 3.329484701156616, "learning_rate": 9.97574933465191e-06, "loss": 0.5404, "step": 594400 }, { "epoch": 13.691627521414755, "grad_norm": 2.8781020641326904, "learning_rate": 9.968477770829396e-06, "loss": 0.5518, "step": 594600 }, { "epoch": 13.696232845169016, "grad_norm": 3.1248257160186768, "learning_rate": 9.961206207006879e-06, "loss": 0.5421, "step": 594800 }, { "epoch": 13.700838168923275, "grad_norm": 3.2857155799865723, "learning_rate": 9.953934643184364e-06, "loss": 0.5479, "step": 595000 }, { "epoch": 13.705443492677535, "grad_norm": 3.693159818649292, "learning_rate": 9.946663079361849e-06, "loss": 0.5427, "step": 595200 }, { "epoch": 13.710048816431796, "grad_norm": 3.3811700344085693, "learning_rate": 9.939391515539331e-06, "loss": 0.5495, "step": 595400 }, { "epoch": 13.714654140186056, "grad_norm": 3.110826253890991, "learning_rate": 9.932119951716817e-06, "loss": 0.5327, "step": 595600 }, { "epoch": 13.719259463940315, "grad_norm": 3.164827346801758, "learning_rate": 9.924848387894301e-06, "loss": 0.5373, "step": 595800 }, { "epoch": 13.723864787694575, "grad_norm": 3.971219062805176, "learning_rate": 9.917576824071784e-06, "loss": 0.5416, "step": 596000 }, { "epoch": 13.728470111448836, "grad_norm": 3.429321527481079, "learning_rate": 9.91030526024927e-06, "loss": 0.5397, "step": 596200 }, { "epoch": 13.733075435203094, "grad_norm": 2.479889392852783, "learning_rate": 9.903033696426754e-06, "loss": 0.5494, "step": 596400 }, { "epoch": 13.737680758957355, "grad_norm": 2.7341201305389404, "learning_rate": 9.895762132604238e-06, "loss": 0.5354, "step": 596600 }, { "epoch": 13.742286082711615, "grad_norm": 3.743123769760132, "learning_rate": 9.888490568781722e-06, "loss": 0.5502, "step": 596800 }, { "epoch": 13.746891406465874, "grad_norm": 2.8174068927764893, "learning_rate": 9.88125536277832e-06, "loss": 0.5447, "step": 597000 }, { "epoch": 13.751496730220135, "grad_norm": 2.5518455505371094, "learning_rate": 9.873983798955804e-06, "loss": 0.5406, "step": 597200 }, { "epoch": 13.756102053974395, "grad_norm": 2.6613545417785645, "learning_rate": 9.866712235133288e-06, "loss": 0.5436, "step": 597400 }, { "epoch": 13.760707377728654, "grad_norm": 3.2584588527679443, "learning_rate": 9.859440671310772e-06, "loss": 0.5454, "step": 597600 }, { "epoch": 13.765312701482914, "grad_norm": 3.6804134845733643, "learning_rate": 9.852169107488256e-06, "loss": 0.5415, "step": 597800 }, { "epoch": 13.769918025237175, "grad_norm": 3.93707013130188, "learning_rate": 9.844933901484853e-06, "loss": 0.5405, "step": 598000 }, { "epoch": 13.774523348991433, "grad_norm": 2.835728645324707, "learning_rate": 9.837662337662339e-06, "loss": 0.5355, "step": 598200 }, { "epoch": 13.779128672745694, "grad_norm": 2.6848623752593994, "learning_rate": 9.830390773839822e-06, "loss": 0.5337, "step": 598400 }, { "epoch": 13.783733996499954, "grad_norm": 3.4841365814208984, "learning_rate": 9.823119210017307e-06, "loss": 0.531, "step": 598600 }, { "epoch": 13.788339320254213, "grad_norm": 2.816138982772827, "learning_rate": 9.815847646194792e-06, "loss": 0.5315, "step": 598800 }, { "epoch": 13.792944644008474, "grad_norm": 2.922630786895752, "learning_rate": 9.808576082372274e-06, "loss": 0.5528, "step": 599000 }, { "epoch": 13.797549967762734, "grad_norm": 3.093204975128174, "learning_rate": 9.80130451854976e-06, "loss": 0.5525, "step": 599200 }, { "epoch": 13.802155291516993, "grad_norm": 3.1738221645355225, "learning_rate": 9.794032954727244e-06, "loss": 0.5269, "step": 599400 }, { "epoch": 13.806760615271253, "grad_norm": 2.908632755279541, "learning_rate": 9.786761390904727e-06, "loss": 0.5307, "step": 599600 }, { "epoch": 13.811365939025514, "grad_norm": 3.4265527725219727, "learning_rate": 9.779489827082213e-06, "loss": 0.5327, "step": 599800 }, { "epoch": 13.815971262779774, "grad_norm": 3.252572774887085, "learning_rate": 9.772218263259697e-06, "loss": 0.5453, "step": 600000 }, { "epoch": 13.820576586534033, "grad_norm": 3.13742733001709, "learning_rate": 9.764946699437181e-06, "loss": 0.546, "step": 600200 }, { "epoch": 13.825181910288293, "grad_norm": 2.9839015007019043, "learning_rate": 9.757675135614665e-06, "loss": 0.5444, "step": 600400 }, { "epoch": 13.829787234042554, "grad_norm": 2.5981061458587646, "learning_rate": 9.75040357179215e-06, "loss": 0.5294, "step": 600600 }, { "epoch": 13.834392557796813, "grad_norm": 2.8224525451660156, "learning_rate": 9.743132007969634e-06, "loss": 0.5433, "step": 600800 }, { "epoch": 13.838997881551073, "grad_norm": 2.841012477874756, "learning_rate": 9.735860444147118e-06, "loss": 0.5346, "step": 601000 }, { "epoch": 13.843603205305334, "grad_norm": 5.748844146728516, "learning_rate": 9.728588880324602e-06, "loss": 0.5509, "step": 601200 }, { "epoch": 13.848208529059592, "grad_norm": 2.9748475551605225, "learning_rate": 9.721317316502088e-06, "loss": 0.5386, "step": 601400 }, { "epoch": 13.852813852813853, "grad_norm": 3.0845344066619873, "learning_rate": 9.71404575267957e-06, "loss": 0.5288, "step": 601600 }, { "epoch": 13.857419176568113, "grad_norm": 2.9533259868621826, "learning_rate": 9.706774188857057e-06, "loss": 0.5563, "step": 601800 }, { "epoch": 13.862024500322372, "grad_norm": 4.201800346374512, "learning_rate": 9.699502625034541e-06, "loss": 0.5445, "step": 602000 }, { "epoch": 13.866629824076632, "grad_norm": 2.9285476207733154, "learning_rate": 9.692231061212023e-06, "loss": 0.5451, "step": 602200 }, { "epoch": 13.871235147830893, "grad_norm": 4.119074821472168, "learning_rate": 9.68495949738951e-06, "loss": 0.5433, "step": 602400 }, { "epoch": 13.875840471585153, "grad_norm": 4.315847396850586, "learning_rate": 9.677687933566994e-06, "loss": 0.5429, "step": 602600 }, { "epoch": 13.880445795339412, "grad_norm": 3.370650053024292, "learning_rate": 9.670416369744478e-06, "loss": 0.538, "step": 602800 }, { "epoch": 13.885051119093673, "grad_norm": 3.967122793197632, "learning_rate": 9.663144805921962e-06, "loss": 0.5346, "step": 603000 }, { "epoch": 13.889656442847933, "grad_norm": 3.766850233078003, "learning_rate": 9.655873242099446e-06, "loss": 0.5345, "step": 603200 }, { "epoch": 13.894261766602192, "grad_norm": 2.5367722511291504, "learning_rate": 9.64860167827693e-06, "loss": 0.5518, "step": 603400 }, { "epoch": 13.898867090356452, "grad_norm": 3.3279225826263428, "learning_rate": 9.641330114454415e-06, "loss": 0.5508, "step": 603600 }, { "epoch": 13.903472414110713, "grad_norm": 2.8402557373046875, "learning_rate": 9.634058550631899e-06, "loss": 0.5419, "step": 603800 }, { "epoch": 13.908077737864971, "grad_norm": 2.7564351558685303, "learning_rate": 9.626786986809383e-06, "loss": 0.545, "step": 604000 }, { "epoch": 13.912683061619232, "grad_norm": 2.57075572013855, "learning_rate": 9.61955178080598e-06, "loss": 0.5444, "step": 604200 }, { "epoch": 13.917288385373492, "grad_norm": 3.027113676071167, "learning_rate": 9.612280216983464e-06, "loss": 0.5438, "step": 604400 }, { "epoch": 13.921893709127751, "grad_norm": 3.521167516708374, "learning_rate": 9.60500865316095e-06, "loss": 0.5461, "step": 604600 }, { "epoch": 13.926499032882012, "grad_norm": 3.0881049633026123, "learning_rate": 9.597737089338434e-06, "loss": 0.5481, "step": 604800 }, { "epoch": 13.931104356636272, "grad_norm": 2.756943941116333, "learning_rate": 9.590465525515917e-06, "loss": 0.551, "step": 605000 }, { "epoch": 13.93570968039053, "grad_norm": 2.7377870082855225, "learning_rate": 9.583193961693403e-06, "loss": 0.5483, "step": 605200 }, { "epoch": 13.940315004144791, "grad_norm": 3.1983232498168945, "learning_rate": 9.575922397870887e-06, "loss": 0.5511, "step": 605400 }, { "epoch": 13.944920327899052, "grad_norm": 2.8796114921569824, "learning_rate": 9.56865083404837e-06, "loss": 0.5355, "step": 605600 }, { "epoch": 13.94952565165331, "grad_norm": 3.955488443374634, "learning_rate": 9.561379270225855e-06, "loss": 0.5489, "step": 605800 }, { "epoch": 13.95413097540757, "grad_norm": 3.366757869720459, "learning_rate": 9.55410770640334e-06, "loss": 0.5402, "step": 606000 }, { "epoch": 13.958736299161831, "grad_norm": 3.5042965412139893, "learning_rate": 9.546836142580824e-06, "loss": 0.5414, "step": 606200 }, { "epoch": 13.963341622916092, "grad_norm": 4.029479026794434, "learning_rate": 9.539564578758308e-06, "loss": 0.5524, "step": 606400 }, { "epoch": 13.96794694667035, "grad_norm": 3.883596897125244, "learning_rate": 9.532293014935792e-06, "loss": 0.5432, "step": 606600 }, { "epoch": 13.972552270424611, "grad_norm": 3.188150644302368, "learning_rate": 9.52505780893239e-06, "loss": 0.5396, "step": 606800 }, { "epoch": 13.977157594178871, "grad_norm": 2.3923470973968506, "learning_rate": 9.517786245109873e-06, "loss": 0.5358, "step": 607000 }, { "epoch": 13.98176291793313, "grad_norm": 2.5199859142303467, "learning_rate": 9.510514681287358e-06, "loss": 0.5378, "step": 607200 }, { "epoch": 13.98636824168739, "grad_norm": 3.514549970626831, "learning_rate": 9.503279475283955e-06, "loss": 0.5471, "step": 607400 }, { "epoch": 13.990973565441651, "grad_norm": 3.0256316661834717, "learning_rate": 9.496007911461439e-06, "loss": 0.5447, "step": 607600 }, { "epoch": 13.99557888919591, "grad_norm": 2.8945305347442627, "learning_rate": 9.488736347638923e-06, "loss": 0.5516, "step": 607800 }, { "epoch": 14.0, "eval_loss": 0.5243151783943176, "eval_runtime": 171.7907, "eval_samples_per_second": 165.09, "eval_steps_per_second": 10.321, "step": 607992 }, { "epoch": 14.00018421295017, "grad_norm": 3.234140634536743, "learning_rate": 9.481464783816407e-06, "loss": 0.5465, "step": 608000 }, { "epoch": 14.00478953670443, "grad_norm": 3.9970383644104004, "learning_rate": 9.474193219993893e-06, "loss": 0.5352, "step": 608200 }, { "epoch": 14.00939486045869, "grad_norm": 2.708507537841797, "learning_rate": 9.466921656171377e-06, "loss": 0.5386, "step": 608400 }, { "epoch": 14.01400018421295, "grad_norm": 3.3150854110717773, "learning_rate": 9.45965009234886e-06, "loss": 0.5356, "step": 608600 }, { "epoch": 14.01860550796721, "grad_norm": 2.4977152347564697, "learning_rate": 9.452378528526346e-06, "loss": 0.5303, "step": 608800 }, { "epoch": 14.02321083172147, "grad_norm": 2.6058592796325684, "learning_rate": 9.44510696470383e-06, "loss": 0.5452, "step": 609000 }, { "epoch": 14.02781615547573, "grad_norm": 3.491718292236328, "learning_rate": 9.437835400881313e-06, "loss": 0.5431, "step": 609200 }, { "epoch": 14.03242147922999, "grad_norm": 3.100562810897827, "learning_rate": 9.430563837058798e-06, "loss": 0.534, "step": 609400 }, { "epoch": 14.03702680298425, "grad_norm": 3.014887809753418, "learning_rate": 9.423292273236283e-06, "loss": 0.5268, "step": 609600 }, { "epoch": 14.04163212673851, "grad_norm": 4.120047569274902, "learning_rate": 9.416020709413767e-06, "loss": 0.5492, "step": 609800 }, { "epoch": 14.04623745049277, "grad_norm": 3.292614698410034, "learning_rate": 9.408749145591251e-06, "loss": 0.5406, "step": 610000 }, { "epoch": 14.05084277424703, "grad_norm": 2.325287342071533, "learning_rate": 9.401477581768735e-06, "loss": 0.5308, "step": 610200 }, { "epoch": 14.055448098001289, "grad_norm": 2.72371244430542, "learning_rate": 9.394242375765332e-06, "loss": 0.5306, "step": 610400 }, { "epoch": 14.06005342175555, "grad_norm": 2.5140206813812256, "learning_rate": 9.386970811942816e-06, "loss": 0.5412, "step": 610600 }, { "epoch": 14.06465874550981, "grad_norm": 2.95316481590271, "learning_rate": 9.3796992481203e-06, "loss": 0.5408, "step": 610800 }, { "epoch": 14.069264069264069, "grad_norm": 3.183243989944458, "learning_rate": 9.372427684297785e-06, "loss": 0.5371, "step": 611000 }, { "epoch": 14.07386939301833, "grad_norm": 2.708509683609009, "learning_rate": 9.365156120475269e-06, "loss": 0.5479, "step": 611200 }, { "epoch": 14.07847471677259, "grad_norm": 3.455313205718994, "learning_rate": 9.357884556652753e-06, "loss": 0.5429, "step": 611400 }, { "epoch": 14.083080040526848, "grad_norm": 3.839599609375, "learning_rate": 9.35061299283024e-06, "loss": 0.5455, "step": 611600 }, { "epoch": 14.087685364281109, "grad_norm": 3.0706214904785156, "learning_rate": 9.343341429007722e-06, "loss": 0.5382, "step": 611800 }, { "epoch": 14.09229068803537, "grad_norm": 2.855938673019409, "learning_rate": 9.336069865185206e-06, "loss": 0.5346, "step": 612000 }, { "epoch": 14.096896011789628, "grad_norm": 2.6702816486358643, "learning_rate": 9.328798301362692e-06, "loss": 0.5304, "step": 612200 }, { "epoch": 14.101501335543889, "grad_norm": 2.650740146636963, "learning_rate": 9.321526737540176e-06, "loss": 0.549, "step": 612400 }, { "epoch": 14.106106659298149, "grad_norm": 4.2644124031066895, "learning_rate": 9.31425517371766e-06, "loss": 0.5448, "step": 612600 }, { "epoch": 14.110711983052408, "grad_norm": 4.463499546051025, "learning_rate": 9.306983609895145e-06, "loss": 0.5472, "step": 612800 }, { "epoch": 14.115317306806668, "grad_norm": 3.099343776702881, "learning_rate": 9.299712046072629e-06, "loss": 0.5316, "step": 613000 }, { "epoch": 14.119922630560929, "grad_norm": 3.3200466632843018, "learning_rate": 9.292440482250113e-06, "loss": 0.5424, "step": 613200 }, { "epoch": 14.12452795431519, "grad_norm": 4.582034587860107, "learning_rate": 9.285168918427597e-06, "loss": 0.5322, "step": 613400 }, { "epoch": 14.129133278069448, "grad_norm": 2.7270848751068115, "learning_rate": 9.277897354605081e-06, "loss": 0.5405, "step": 613600 }, { "epoch": 14.133738601823708, "grad_norm": 3.561002731323242, "learning_rate": 9.270625790782566e-06, "loss": 0.5403, "step": 613800 }, { "epoch": 14.138343925577969, "grad_norm": 2.916008710861206, "learning_rate": 9.26335422696005e-06, "loss": 0.5363, "step": 614000 }, { "epoch": 14.142949249332228, "grad_norm": 3.4459245204925537, "learning_rate": 9.256082663137536e-06, "loss": 0.5467, "step": 614200 }, { "epoch": 14.147554573086488, "grad_norm": 4.279280662536621, "learning_rate": 9.248811099315018e-06, "loss": 0.5397, "step": 614400 }, { "epoch": 14.152159896840748, "grad_norm": 3.383220672607422, "learning_rate": 9.241539535492503e-06, "loss": 0.5442, "step": 614600 }, { "epoch": 14.156765220595007, "grad_norm": 2.799022912979126, "learning_rate": 9.234267971669988e-06, "loss": 0.5455, "step": 614800 }, { "epoch": 14.161370544349268, "grad_norm": 3.1966896057128906, "learning_rate": 9.226996407847471e-06, "loss": 0.5399, "step": 615000 }, { "epoch": 14.165975868103528, "grad_norm": 3.2406864166259766, "learning_rate": 9.219724844024955e-06, "loss": 0.543, "step": 615200 }, { "epoch": 14.170581191857787, "grad_norm": 2.572612762451172, "learning_rate": 9.212453280202441e-06, "loss": 0.5397, "step": 615400 }, { "epoch": 14.175186515612047, "grad_norm": 2.9783430099487305, "learning_rate": 9.205181716379925e-06, "loss": 0.5372, "step": 615600 }, { "epoch": 14.179791839366308, "grad_norm": 3.033306837081909, "learning_rate": 9.19791015255741e-06, "loss": 0.5424, "step": 615800 }, { "epoch": 14.184397163120567, "grad_norm": 3.275662660598755, "learning_rate": 9.190638588734894e-06, "loss": 0.5333, "step": 616000 }, { "epoch": 14.189002486874827, "grad_norm": 2.4978132247924805, "learning_rate": 9.183367024912378e-06, "loss": 0.5341, "step": 616200 }, { "epoch": 14.193607810629087, "grad_norm": 2.4442851543426514, "learning_rate": 9.176095461089862e-06, "loss": 0.5327, "step": 616400 }, { "epoch": 14.198213134383348, "grad_norm": 3.3033714294433594, "learning_rate": 9.168823897267346e-06, "loss": 0.527, "step": 616600 }, { "epoch": 14.202818458137607, "grad_norm": 3.127436399459839, "learning_rate": 9.16155233344483e-06, "loss": 0.5371, "step": 616800 }, { "epoch": 14.207423781891867, "grad_norm": 3.2301642894744873, "learning_rate": 9.154280769622315e-06, "loss": 0.537, "step": 617000 }, { "epoch": 14.212029105646128, "grad_norm": 3.059586763381958, "learning_rate": 9.147009205799799e-06, "loss": 0.5396, "step": 617200 }, { "epoch": 14.216634429400386, "grad_norm": 3.4187586307525635, "learning_rate": 9.139773999796396e-06, "loss": 0.5429, "step": 617400 }, { "epoch": 14.221239753154647, "grad_norm": 2.5766658782958984, "learning_rate": 9.132502435973882e-06, "loss": 0.5241, "step": 617600 }, { "epoch": 14.225845076908907, "grad_norm": 3.2574641704559326, "learning_rate": 9.125230872151364e-06, "loss": 0.5498, "step": 617800 }, { "epoch": 14.230450400663166, "grad_norm": 3.0240838527679443, "learning_rate": 9.117995666147961e-06, "loss": 0.5397, "step": 618000 }, { "epoch": 14.235055724417427, "grad_norm": 3.250147819519043, "learning_rate": 9.110724102325446e-06, "loss": 0.5319, "step": 618200 }, { "epoch": 14.239661048171687, "grad_norm": 3.770249128341675, "learning_rate": 9.103488896322042e-06, "loss": 0.5388, "step": 618400 }, { "epoch": 14.244266371925946, "grad_norm": 3.1490142345428467, "learning_rate": 9.096217332499528e-06, "loss": 0.5409, "step": 618600 }, { "epoch": 14.248871695680206, "grad_norm": 3.5395240783691406, "learning_rate": 9.088945768677011e-06, "loss": 0.5395, "step": 618800 }, { "epoch": 14.253477019434467, "grad_norm": 3.340034246444702, "learning_rate": 9.081674204854495e-06, "loss": 0.5388, "step": 619000 }, { "epoch": 14.258082343188725, "grad_norm": 2.908026933670044, "learning_rate": 9.074402641031981e-06, "loss": 0.5424, "step": 619200 }, { "epoch": 14.262687666942986, "grad_norm": 2.7248940467834473, "learning_rate": 9.067131077209465e-06, "loss": 0.5236, "step": 619400 }, { "epoch": 14.267292990697246, "grad_norm": 3.4366588592529297, "learning_rate": 9.05985951338695e-06, "loss": 0.5526, "step": 619600 }, { "epoch": 14.271898314451505, "grad_norm": 3.0255722999572754, "learning_rate": 9.052587949564434e-06, "loss": 0.5282, "step": 619800 }, { "epoch": 14.276503638205766, "grad_norm": 3.2476325035095215, "learning_rate": 9.045316385741918e-06, "loss": 0.5384, "step": 620000 }, { "epoch": 14.281108961960026, "grad_norm": 3.1371731758117676, "learning_rate": 9.038044821919402e-06, "loss": 0.539, "step": 620200 }, { "epoch": 14.285714285714286, "grad_norm": 2.5684289932250977, "learning_rate": 9.030773258096886e-06, "loss": 0.5288, "step": 620400 }, { "epoch": 14.290319609468545, "grad_norm": 3.3922548294067383, "learning_rate": 9.023501694274372e-06, "loss": 0.5344, "step": 620600 }, { "epoch": 14.294924933222806, "grad_norm": 2.9498867988586426, "learning_rate": 9.016230130451855e-06, "loss": 0.5327, "step": 620800 }, { "epoch": 14.299530256977066, "grad_norm": 3.503101110458374, "learning_rate": 9.008958566629339e-06, "loss": 0.5408, "step": 621000 }, { "epoch": 14.304135580731325, "grad_norm": 2.9323580265045166, "learning_rate": 9.001687002806825e-06, "loss": 0.5407, "step": 621200 }, { "epoch": 14.308740904485585, "grad_norm": 3.0948574542999268, "learning_rate": 8.994415438984307e-06, "loss": 0.543, "step": 621400 }, { "epoch": 14.313346228239846, "grad_norm": 2.787778854370117, "learning_rate": 8.987180232980904e-06, "loss": 0.5334, "step": 621600 }, { "epoch": 14.317951551994105, "grad_norm": 2.2373883724212646, "learning_rate": 8.979908669158389e-06, "loss": 0.5355, "step": 621800 }, { "epoch": 14.322556875748365, "grad_norm": 3.0317771434783936, "learning_rate": 8.972637105335874e-06, "loss": 0.5372, "step": 622000 }, { "epoch": 14.327162199502625, "grad_norm": 3.486945152282715, "learning_rate": 8.965365541513357e-06, "loss": 0.5294, "step": 622200 }, { "epoch": 14.331767523256884, "grad_norm": 3.084419012069702, "learning_rate": 8.958093977690843e-06, "loss": 0.545, "step": 622400 }, { "epoch": 14.336372847011145, "grad_norm": 3.271340847015381, "learning_rate": 8.950822413868327e-06, "loss": 0.5436, "step": 622600 }, { "epoch": 14.340978170765405, "grad_norm": 2.878502607345581, "learning_rate": 8.943550850045811e-06, "loss": 0.5316, "step": 622800 }, { "epoch": 14.345583494519666, "grad_norm": 3.714315414428711, "learning_rate": 8.936279286223296e-06, "loss": 0.5538, "step": 623000 }, { "epoch": 14.350188818273924, "grad_norm": 3.77945876121521, "learning_rate": 8.92900772240078e-06, "loss": 0.5426, "step": 623200 }, { "epoch": 14.354794142028185, "grad_norm": 3.177495241165161, "learning_rate": 8.921736158578264e-06, "loss": 0.5407, "step": 623400 }, { "epoch": 14.359399465782445, "grad_norm": 2.679957151412964, "learning_rate": 8.914464594755748e-06, "loss": 0.5439, "step": 623600 }, { "epoch": 14.364004789536704, "grad_norm": 3.1106204986572266, "learning_rate": 8.907193030933232e-06, "loss": 0.5319, "step": 623800 }, { "epoch": 14.368610113290964, "grad_norm": 4.358025550842285, "learning_rate": 8.899921467110718e-06, "loss": 0.5272, "step": 624000 }, { "epoch": 14.373215437045225, "grad_norm": 3.5645134449005127, "learning_rate": 8.892649903288201e-06, "loss": 0.5265, "step": 624200 }, { "epoch": 14.377820760799484, "grad_norm": 3.1196436882019043, "learning_rate": 8.885378339465685e-06, "loss": 0.5261, "step": 624400 }, { "epoch": 14.382426084553744, "grad_norm": 3.206526517868042, "learning_rate": 8.878106775643171e-06, "loss": 0.5461, "step": 624600 }, { "epoch": 14.387031408308005, "grad_norm": 3.2251393795013428, "learning_rate": 8.870835211820654e-06, "loss": 0.5388, "step": 624800 }, { "epoch": 14.391636732062263, "grad_norm": 2.812065362930298, "learning_rate": 8.86360000581725e-06, "loss": 0.5345, "step": 625000 }, { "epoch": 14.396242055816524, "grad_norm": 3.157151460647583, "learning_rate": 8.856328441994735e-06, "loss": 0.5436, "step": 625200 }, { "epoch": 14.400847379570784, "grad_norm": 3.392270088195801, "learning_rate": 8.84905687817222e-06, "loss": 0.527, "step": 625400 }, { "epoch": 14.405452703325043, "grad_norm": 2.6218199729919434, "learning_rate": 8.841785314349703e-06, "loss": 0.5309, "step": 625600 }, { "epoch": 14.410058027079303, "grad_norm": 2.878702402114868, "learning_rate": 8.834513750527189e-06, "loss": 0.5339, "step": 625800 }, { "epoch": 14.414663350833564, "grad_norm": 3.8115878105163574, "learning_rate": 8.827242186704673e-06, "loss": 0.5313, "step": 626000 }, { "epoch": 14.419268674587823, "grad_norm": 3.6281392574310303, "learning_rate": 8.819970622882157e-06, "loss": 0.5352, "step": 626200 }, { "epoch": 14.423873998342083, "grad_norm": 3.010791301727295, "learning_rate": 8.812699059059642e-06, "loss": 0.5449, "step": 626400 }, { "epoch": 14.428479322096344, "grad_norm": 3.1626334190368652, "learning_rate": 8.805427495237126e-06, "loss": 0.5427, "step": 626600 }, { "epoch": 14.433084645850604, "grad_norm": 4.617162704467773, "learning_rate": 8.79815593141461e-06, "loss": 0.5296, "step": 626800 }, { "epoch": 14.437689969604863, "grad_norm": 2.7026705741882324, "learning_rate": 8.790884367592094e-06, "loss": 0.5386, "step": 627000 }, { "epoch": 14.442295293359123, "grad_norm": 2.913951873779297, "learning_rate": 8.783612803769579e-06, "loss": 0.5436, "step": 627200 }, { "epoch": 14.446900617113384, "grad_norm": 3.2576162815093994, "learning_rate": 8.776341239947064e-06, "loss": 0.5406, "step": 627400 }, { "epoch": 14.451505940867643, "grad_norm": 2.7531659603118896, "learning_rate": 8.769069676124547e-06, "loss": 0.5552, "step": 627600 }, { "epoch": 14.456111264621903, "grad_norm": 3.288334608078003, "learning_rate": 8.761798112302031e-06, "loss": 0.5285, "step": 627800 }, { "epoch": 14.460716588376163, "grad_norm": 2.9675936698913574, "learning_rate": 8.754526548479517e-06, "loss": 0.5303, "step": 628000 }, { "epoch": 14.465321912130422, "grad_norm": 3.810732364654541, "learning_rate": 8.747254984657e-06, "loss": 0.5309, "step": 628200 }, { "epoch": 14.469927235884683, "grad_norm": 3.084245204925537, "learning_rate": 8.739983420834486e-06, "loss": 0.53, "step": 628400 }, { "epoch": 14.474532559638943, "grad_norm": 3.4375557899475098, "learning_rate": 8.73274821483108e-06, "loss": 0.5417, "step": 628600 }, { "epoch": 14.479137883393202, "grad_norm": 4.032174587249756, "learning_rate": 8.725476651008567e-06, "loss": 0.5392, "step": 628800 }, { "epoch": 14.483743207147462, "grad_norm": 3.3805575370788574, "learning_rate": 8.71820508718605e-06, "loss": 0.5307, "step": 629000 }, { "epoch": 14.488348530901723, "grad_norm": 3.0205650329589844, "learning_rate": 8.710933523363535e-06, "loss": 0.5383, "step": 629200 }, { "epoch": 14.492953854655982, "grad_norm": 2.994614601135254, "learning_rate": 8.70366195954102e-06, "loss": 0.543, "step": 629400 }, { "epoch": 14.497559178410242, "grad_norm": 3.1769533157348633, "learning_rate": 8.696390395718504e-06, "loss": 0.5316, "step": 629600 }, { "epoch": 14.502164502164502, "grad_norm": 2.6771903038024902, "learning_rate": 8.689118831895988e-06, "loss": 0.5426, "step": 629800 }, { "epoch": 14.506769825918763, "grad_norm": 2.680912494659424, "learning_rate": 8.681847268073472e-06, "loss": 0.5232, "step": 630000 }, { "epoch": 14.511375149673022, "grad_norm": 2.9476630687713623, "learning_rate": 8.674575704250958e-06, "loss": 0.5455, "step": 630200 }, { "epoch": 14.515980473427282, "grad_norm": 4.284652233123779, "learning_rate": 8.66730414042844e-06, "loss": 0.5425, "step": 630400 }, { "epoch": 14.520585797181543, "grad_norm": 3.3897085189819336, "learning_rate": 8.660032576605925e-06, "loss": 0.5245, "step": 630600 }, { "epoch": 14.525191120935801, "grad_norm": 2.838503122329712, "learning_rate": 8.65276101278341e-06, "loss": 0.5334, "step": 630800 }, { "epoch": 14.529796444690062, "grad_norm": 2.7507781982421875, "learning_rate": 8.645489448960893e-06, "loss": 0.5292, "step": 631000 }, { "epoch": 14.534401768444322, "grad_norm": 4.586523532867432, "learning_rate": 8.638217885138377e-06, "loss": 0.543, "step": 631200 }, { "epoch": 14.539007092198581, "grad_norm": 2.6941652297973633, "learning_rate": 8.630946321315863e-06, "loss": 0.5372, "step": 631400 }, { "epoch": 14.543612415952841, "grad_norm": 2.9810523986816406, "learning_rate": 8.62371111531246e-06, "loss": 0.5382, "step": 631600 }, { "epoch": 14.548217739707102, "grad_norm": 2.956925868988037, "learning_rate": 8.616439551489943e-06, "loss": 0.5463, "step": 631800 }, { "epoch": 14.55282306346136, "grad_norm": 2.96683669090271, "learning_rate": 8.609167987667429e-06, "loss": 0.5295, "step": 632000 }, { "epoch": 14.557428387215621, "grad_norm": 3.31773042678833, "learning_rate": 8.601896423844913e-06, "loss": 0.5294, "step": 632200 }, { "epoch": 14.562033710969882, "grad_norm": 3.2458245754241943, "learning_rate": 8.594624860022395e-06, "loss": 0.5335, "step": 632400 }, { "epoch": 14.56663903472414, "grad_norm": 3.0680081844329834, "learning_rate": 8.587353296199881e-06, "loss": 0.547, "step": 632600 }, { "epoch": 14.5712443584784, "grad_norm": 3.1158411502838135, "learning_rate": 8.580081732377365e-06, "loss": 0.555, "step": 632800 }, { "epoch": 14.575849682232661, "grad_norm": 4.015247821807861, "learning_rate": 8.57281016855485e-06, "loss": 0.5307, "step": 633000 }, { "epoch": 14.58045500598692, "grad_norm": 3.309248924255371, "learning_rate": 8.565538604732334e-06, "loss": 0.5387, "step": 633200 }, { "epoch": 14.58506032974118, "grad_norm": 3.6629464626312256, "learning_rate": 8.558267040909818e-06, "loss": 0.5384, "step": 633400 }, { "epoch": 14.589665653495441, "grad_norm": 3.6036899089813232, "learning_rate": 8.550995477087304e-06, "loss": 0.5417, "step": 633600 }, { "epoch": 14.594270977249701, "grad_norm": 2.716386556625366, "learning_rate": 8.543723913264787e-06, "loss": 0.5361, "step": 633800 }, { "epoch": 14.59887630100396, "grad_norm": 2.9629690647125244, "learning_rate": 8.53645234944227e-06, "loss": 0.5406, "step": 634000 }, { "epoch": 14.60348162475822, "grad_norm": 3.4902899265289307, "learning_rate": 8.529180785619757e-06, "loss": 0.5288, "step": 634200 }, { "epoch": 14.608086948512481, "grad_norm": 3.307722568511963, "learning_rate": 8.52190922179724e-06, "loss": 0.54, "step": 634400 }, { "epoch": 14.61269227226674, "grad_norm": 2.7474825382232666, "learning_rate": 8.514637657974723e-06, "loss": 0.5514, "step": 634600 }, { "epoch": 14.617297596021, "grad_norm": 3.100318431854248, "learning_rate": 8.50736609415221e-06, "loss": 0.5373, "step": 634800 }, { "epoch": 14.62190291977526, "grad_norm": 3.0019752979278564, "learning_rate": 8.500094530329692e-06, "loss": 0.5312, "step": 635000 }, { "epoch": 14.62650824352952, "grad_norm": 3.607322931289673, "learning_rate": 8.492822966507178e-06, "loss": 0.5222, "step": 635200 }, { "epoch": 14.63111356728378, "grad_norm": 2.8540308475494385, "learning_rate": 8.485551402684662e-06, "loss": 0.5392, "step": 635400 }, { "epoch": 14.63571889103804, "grad_norm": 2.7987473011016846, "learning_rate": 8.478279838862145e-06, "loss": 0.5433, "step": 635600 }, { "epoch": 14.6403242147923, "grad_norm": 3.4120616912841797, "learning_rate": 8.471044632858741e-06, "loss": 0.5306, "step": 635800 }, { "epoch": 14.64492953854656, "grad_norm": 2.813546657562256, "learning_rate": 8.463773069036227e-06, "loss": 0.5319, "step": 636000 }, { "epoch": 14.64953486230082, "grad_norm": 3.185399293899536, "learning_rate": 8.456501505213712e-06, "loss": 0.5474, "step": 636200 }, { "epoch": 14.654140186055079, "grad_norm": 3.0868401527404785, "learning_rate": 8.449229941391194e-06, "loss": 0.5404, "step": 636400 }, { "epoch": 14.65874550980934, "grad_norm": 3.2732975482940674, "learning_rate": 8.44195837756868e-06, "loss": 0.536, "step": 636600 }, { "epoch": 14.6633508335636, "grad_norm": 3.1363329887390137, "learning_rate": 8.434686813746164e-06, "loss": 0.5395, "step": 636800 }, { "epoch": 14.66795615731786, "grad_norm": 3.2439472675323486, "learning_rate": 8.42741524992365e-06, "loss": 0.5405, "step": 637000 }, { "epoch": 14.672561481072119, "grad_norm": 2.4419994354248047, "learning_rate": 8.420180043920247e-06, "loss": 0.5228, "step": 637200 }, { "epoch": 14.67716680482638, "grad_norm": 4.0651044845581055, "learning_rate": 8.41290848009773e-06, "loss": 0.5441, "step": 637400 }, { "epoch": 14.68177212858064, "grad_norm": 3.520622730255127, "learning_rate": 8.405636916275214e-06, "loss": 0.5334, "step": 637600 }, { "epoch": 14.686377452334899, "grad_norm": 2.7496581077575684, "learning_rate": 8.3983653524527e-06, "loss": 0.5408, "step": 637800 }, { "epoch": 14.69098277608916, "grad_norm": 2.8176567554473877, "learning_rate": 8.391093788630182e-06, "loss": 0.5329, "step": 638000 }, { "epoch": 14.69558809984342, "grad_norm": 3.117527484893799, "learning_rate": 8.383822224807668e-06, "loss": 0.5327, "step": 638200 }, { "epoch": 14.700193423597678, "grad_norm": 3.2691383361816406, "learning_rate": 8.376550660985152e-06, "loss": 0.5375, "step": 638400 }, { "epoch": 14.704798747351939, "grad_norm": 3.023637533187866, "learning_rate": 8.369279097162635e-06, "loss": 0.535, "step": 638600 }, { "epoch": 14.7094040711062, "grad_norm": 4.642063140869141, "learning_rate": 8.36200753334012e-06, "loss": 0.5324, "step": 638800 }, { "epoch": 14.714009394860458, "grad_norm": 3.2362873554229736, "learning_rate": 8.354735969517605e-06, "loss": 0.5345, "step": 639000 }, { "epoch": 14.718614718614718, "grad_norm": 3.0052409172058105, "learning_rate": 8.347464405695088e-06, "loss": 0.5431, "step": 639200 }, { "epoch": 14.723220042368979, "grad_norm": 2.905585765838623, "learning_rate": 8.340229199691684e-06, "loss": 0.5339, "step": 639400 }, { "epoch": 14.727825366123238, "grad_norm": 3.1975319385528564, "learning_rate": 8.33295763586917e-06, "loss": 0.5351, "step": 639600 }, { "epoch": 14.732430689877498, "grad_norm": 3.070645809173584, "learning_rate": 8.325686072046655e-06, "loss": 0.5395, "step": 639800 }, { "epoch": 14.737036013631759, "grad_norm": 4.0564351081848145, "learning_rate": 8.318414508224139e-06, "loss": 0.542, "step": 640000 }, { "epoch": 14.741641337386017, "grad_norm": 3.2501471042633057, "learning_rate": 8.311142944401623e-06, "loss": 0.5381, "step": 640200 }, { "epoch": 14.746246661140278, "grad_norm": 2.8298182487487793, "learning_rate": 8.303871380579107e-06, "loss": 0.5309, "step": 640400 }, { "epoch": 14.750851984894538, "grad_norm": 2.915964365005493, "learning_rate": 8.296599816756593e-06, "loss": 0.5458, "step": 640600 }, { "epoch": 14.755457308648799, "grad_norm": 3.227104425430298, "learning_rate": 8.289328252934076e-06, "loss": 0.5375, "step": 640800 }, { "epoch": 14.760062632403057, "grad_norm": 3.489304304122925, "learning_rate": 8.28205668911156e-06, "loss": 0.5386, "step": 641000 }, { "epoch": 14.764667956157318, "grad_norm": 2.84167218208313, "learning_rate": 8.274785125289046e-06, "loss": 0.5288, "step": 641200 }, { "epoch": 14.769273279911578, "grad_norm": 3.9559543132781982, "learning_rate": 8.267513561466528e-06, "loss": 0.5479, "step": 641400 }, { "epoch": 14.773878603665837, "grad_norm": 3.0262818336486816, "learning_rate": 8.260241997644014e-06, "loss": 0.5233, "step": 641600 }, { "epoch": 14.778483927420098, "grad_norm": 2.9113521575927734, "learning_rate": 8.252970433821499e-06, "loss": 0.545, "step": 641800 }, { "epoch": 14.783089251174358, "grad_norm": 3.577045440673828, "learning_rate": 8.245698869998981e-06, "loss": 0.5376, "step": 642000 }, { "epoch": 14.787694574928617, "grad_norm": 2.9530036449432373, "learning_rate": 8.238427306176467e-06, "loss": 0.5402, "step": 642200 }, { "epoch": 14.792299898682877, "grad_norm": 2.98638916015625, "learning_rate": 8.231155742353951e-06, "loss": 0.5326, "step": 642400 }, { "epoch": 14.796905222437138, "grad_norm": 3.20383620262146, "learning_rate": 8.223884178531434e-06, "loss": 0.527, "step": 642600 }, { "epoch": 14.801510546191397, "grad_norm": 3.5670669078826904, "learning_rate": 8.21661261470892e-06, "loss": 0.54, "step": 642800 }, { "epoch": 14.806115869945657, "grad_norm": 3.084299087524414, "learning_rate": 8.209377408705517e-06, "loss": 0.5414, "step": 643000 }, { "epoch": 14.810721193699917, "grad_norm": 3.610342264175415, "learning_rate": 8.202105844883e-06, "loss": 0.547, "step": 643200 }, { "epoch": 14.815326517454178, "grad_norm": 3.762600898742676, "learning_rate": 8.194834281060485e-06, "loss": 0.5448, "step": 643400 }, { "epoch": 14.819931841208437, "grad_norm": 2.416067600250244, "learning_rate": 8.18756271723797e-06, "loss": 0.5423, "step": 643600 }, { "epoch": 14.824537164962697, "grad_norm": 3.5616204738616943, "learning_rate": 8.180291153415453e-06, "loss": 0.5233, "step": 643800 }, { "epoch": 14.829142488716958, "grad_norm": 2.5570333003997803, "learning_rate": 8.17301958959294e-06, "loss": 0.537, "step": 644000 }, { "epoch": 14.833747812471216, "grad_norm": 3.0077576637268066, "learning_rate": 8.165784383589536e-06, "loss": 0.546, "step": 644200 }, { "epoch": 14.838353136225477, "grad_norm": 2.393003463745117, "learning_rate": 8.158512819767019e-06, "loss": 0.5329, "step": 644400 }, { "epoch": 14.842958459979737, "grad_norm": 3.575730085372925, "learning_rate": 8.151241255944503e-06, "loss": 0.5481, "step": 644600 }, { "epoch": 14.847563783733996, "grad_norm": 3.534426689147949, "learning_rate": 8.143969692121989e-06, "loss": 0.5373, "step": 644800 }, { "epoch": 14.852169107488256, "grad_norm": 2.990290880203247, "learning_rate": 8.136698128299471e-06, "loss": 0.5358, "step": 645000 }, { "epoch": 14.856774431242517, "grad_norm": 2.6674997806549072, "learning_rate": 8.129426564476957e-06, "loss": 0.545, "step": 645200 }, { "epoch": 14.861379754996776, "grad_norm": 2.798583745956421, "learning_rate": 8.122155000654442e-06, "loss": 0.5366, "step": 645400 }, { "epoch": 14.865985078751036, "grad_norm": 2.496399164199829, "learning_rate": 8.114883436831924e-06, "loss": 0.5419, "step": 645600 }, { "epoch": 14.870590402505297, "grad_norm": 2.99106502532959, "learning_rate": 8.10761187300941e-06, "loss": 0.5414, "step": 645800 }, { "epoch": 14.875195726259555, "grad_norm": 3.370936393737793, "learning_rate": 8.100340309186894e-06, "loss": 0.5246, "step": 646000 }, { "epoch": 14.879801050013816, "grad_norm": 2.571732521057129, "learning_rate": 8.093068745364378e-06, "loss": 0.5323, "step": 646200 }, { "epoch": 14.884406373768076, "grad_norm": 4.192697525024414, "learning_rate": 8.085797181541863e-06, "loss": 0.5265, "step": 646400 }, { "epoch": 14.889011697522335, "grad_norm": 3.2478342056274414, "learning_rate": 8.078525617719347e-06, "loss": 0.5266, "step": 646600 }, { "epoch": 14.893617021276595, "grad_norm": 3.7629292011260986, "learning_rate": 8.071254053896831e-06, "loss": 0.5455, "step": 646800 }, { "epoch": 14.898222345030856, "grad_norm": 3.1777334213256836, "learning_rate": 8.063982490074315e-06, "loss": 0.5255, "step": 647000 }, { "epoch": 14.902827668785115, "grad_norm": 3.3058884143829346, "learning_rate": 8.0567109262518e-06, "loss": 0.5519, "step": 647200 }, { "epoch": 14.907432992539375, "grad_norm": 3.7439780235290527, "learning_rate": 8.049439362429285e-06, "loss": 0.545, "step": 647400 }, { "epoch": 14.912038316293636, "grad_norm": 3.925713062286377, "learning_rate": 8.042167798606768e-06, "loss": 0.5236, "step": 647600 }, { "epoch": 14.916643640047896, "grad_norm": 3.196169376373291, "learning_rate": 8.034896234784254e-06, "loss": 0.541, "step": 647800 }, { "epoch": 14.921248963802155, "grad_norm": 2.940185308456421, "learning_rate": 8.027624670961738e-06, "loss": 0.5409, "step": 648000 }, { "epoch": 14.925854287556415, "grad_norm": 3.2344326972961426, "learning_rate": 8.02035310713922e-06, "loss": 0.5228, "step": 648200 }, { "epoch": 14.930459611310676, "grad_norm": 2.691751718521118, "learning_rate": 8.013081543316707e-06, "loss": 0.5243, "step": 648400 }, { "epoch": 14.935064935064934, "grad_norm": 2.5507559776306152, "learning_rate": 8.00580997949419e-06, "loss": 0.5326, "step": 648600 }, { "epoch": 14.939670258819195, "grad_norm": 3.069321870803833, "learning_rate": 7.998538415671673e-06, "loss": 0.5364, "step": 648800 }, { "epoch": 14.944275582573455, "grad_norm": 2.938601016998291, "learning_rate": 7.99126685184916e-06, "loss": 0.5257, "step": 649000 }, { "epoch": 14.948880906327714, "grad_norm": 2.886845588684082, "learning_rate": 7.983995288026643e-06, "loss": 0.5415, "step": 649200 }, { "epoch": 14.953486230081975, "grad_norm": 2.869224786758423, "learning_rate": 7.976723724204128e-06, "loss": 0.5408, "step": 649400 }, { "epoch": 14.958091553836235, "grad_norm": 2.6893749237060547, "learning_rate": 7.969452160381612e-06, "loss": 0.5284, "step": 649600 }, { "epoch": 14.962696877590494, "grad_norm": 3.181957244873047, "learning_rate": 7.962180596559096e-06, "loss": 0.5446, "step": 649800 }, { "epoch": 14.967302201344754, "grad_norm": 2.8158302307128906, "learning_rate": 7.95490903273658e-06, "loss": 0.5428, "step": 650000 }, { "epoch": 14.971907525099015, "grad_norm": 3.031609296798706, "learning_rate": 7.947673826733177e-06, "loss": 0.5318, "step": 650200 }, { "epoch": 14.976512848853275, "grad_norm": 2.578882932662964, "learning_rate": 7.940402262910661e-06, "loss": 0.5252, "step": 650400 }, { "epoch": 14.981118172607534, "grad_norm": 3.891084671020508, "learning_rate": 7.933130699088146e-06, "loss": 0.5412, "step": 650600 }, { "epoch": 14.985723496361794, "grad_norm": 4.118905544281006, "learning_rate": 7.925859135265632e-06, "loss": 0.5389, "step": 650800 }, { "epoch": 14.990328820116055, "grad_norm": 3.9632790088653564, "learning_rate": 7.918587571443114e-06, "loss": 0.527, "step": 651000 }, { "epoch": 14.994934143870314, "grad_norm": 3.387319564819336, "learning_rate": 7.9113160076206e-06, "loss": 0.5273, "step": 651200 }, { "epoch": 14.999539467624574, "grad_norm": 2.79263973236084, "learning_rate": 7.904044443798084e-06, "loss": 0.539, "step": 651400 }, { "epoch": 15.0, "eval_loss": 0.5169693231582642, "eval_runtime": 168.9237, "eval_samples_per_second": 167.892, "eval_steps_per_second": 10.496, "step": 651420 }, { "epoch": 15.004144791378835, "grad_norm": 3.222738742828369, "learning_rate": 7.896772879975567e-06, "loss": 0.5368, "step": 651600 }, { "epoch": 15.008750115133093, "grad_norm": 3.698350191116333, "learning_rate": 7.889501316153053e-06, "loss": 0.5365, "step": 651800 }, { "epoch": 15.013355438887354, "grad_norm": 3.877946376800537, "learning_rate": 7.882229752330537e-06, "loss": 0.5331, "step": 652000 }, { "epoch": 15.017960762641614, "grad_norm": 3.619192600250244, "learning_rate": 7.87495818850802e-06, "loss": 0.5337, "step": 652200 }, { "epoch": 15.022566086395873, "grad_norm": 2.9322800636291504, "learning_rate": 7.867686624685505e-06, "loss": 0.5217, "step": 652400 }, { "epoch": 15.027171410150133, "grad_norm": 2.9945201873779297, "learning_rate": 7.86041506086299e-06, "loss": 0.5297, "step": 652600 }, { "epoch": 15.031776733904394, "grad_norm": 2.359719753265381, "learning_rate": 7.853143497040474e-06, "loss": 0.5316, "step": 652800 }, { "epoch": 15.036382057658653, "grad_norm": 3.135897636413574, "learning_rate": 7.845871933217958e-06, "loss": 0.534, "step": 653000 }, { "epoch": 15.040987381412913, "grad_norm": 3.2203567028045654, "learning_rate": 7.838600369395442e-06, "loss": 0.5371, "step": 653200 }, { "epoch": 15.045592705167174, "grad_norm": 3.355985164642334, "learning_rate": 7.831328805572926e-06, "loss": 0.5294, "step": 653400 }, { "epoch": 15.050198028921432, "grad_norm": 3.7770304679870605, "learning_rate": 7.82405724175041e-06, "loss": 0.5409, "step": 653600 }, { "epoch": 15.054803352675693, "grad_norm": 9.291865348815918, "learning_rate": 7.816785677927895e-06, "loss": 0.5363, "step": 653800 }, { "epoch": 15.059408676429953, "grad_norm": 2.6774072647094727, "learning_rate": 7.80951411410538e-06, "loss": 0.5297, "step": 654000 }, { "epoch": 15.064014000184214, "grad_norm": 3.5901434421539307, "learning_rate": 7.802242550282863e-06, "loss": 0.527, "step": 654200 }, { "epoch": 15.068619323938472, "grad_norm": 3.092885971069336, "learning_rate": 7.79497098646035e-06, "loss": 0.5316, "step": 654400 }, { "epoch": 15.073224647692733, "grad_norm": 2.6505744457244873, "learning_rate": 7.787735780456946e-06, "loss": 0.523, "step": 654600 }, { "epoch": 15.077829971446993, "grad_norm": 2.4430651664733887, "learning_rate": 7.78046421663443e-06, "loss": 0.5332, "step": 654800 }, { "epoch": 15.082435295201252, "grad_norm": 3.4239096641540527, "learning_rate": 7.773192652811913e-06, "loss": 0.5329, "step": 655000 }, { "epoch": 15.087040618955513, "grad_norm": 3.358081340789795, "learning_rate": 7.765921088989399e-06, "loss": 0.5242, "step": 655200 }, { "epoch": 15.091645942709773, "grad_norm": 3.774402141571045, "learning_rate": 7.758685882985996e-06, "loss": 0.5316, "step": 655400 }, { "epoch": 15.096251266464032, "grad_norm": 3.171030282974243, "learning_rate": 7.75141431916348e-06, "loss": 0.545, "step": 655600 }, { "epoch": 15.100856590218292, "grad_norm": 2.9989264011383057, "learning_rate": 7.744142755340964e-06, "loss": 0.5374, "step": 655800 }, { "epoch": 15.105461913972553, "grad_norm": 2.5736443996429443, "learning_rate": 7.736871191518448e-06, "loss": 0.5254, "step": 656000 }, { "epoch": 15.110067237726811, "grad_norm": 3.9181089401245117, "learning_rate": 7.729599627695933e-06, "loss": 0.5276, "step": 656200 }, { "epoch": 15.114672561481072, "grad_norm": 2.8905580043792725, "learning_rate": 7.722328063873417e-06, "loss": 0.5338, "step": 656400 }, { "epoch": 15.119277885235332, "grad_norm": 3.0359580516815186, "learning_rate": 7.715056500050901e-06, "loss": 0.5302, "step": 656600 }, { "epoch": 15.123883208989591, "grad_norm": 3.0906269550323486, "learning_rate": 7.707784936228385e-06, "loss": 0.5233, "step": 656800 }, { "epoch": 15.128488532743852, "grad_norm": 2.517125129699707, "learning_rate": 7.70051337240587e-06, "loss": 0.5382, "step": 657000 }, { "epoch": 15.133093856498112, "grad_norm": 3.945773124694824, "learning_rate": 7.693241808583354e-06, "loss": 0.5329, "step": 657200 }, { "epoch": 15.137699180252373, "grad_norm": 3.2435407638549805, "learning_rate": 7.68597024476084e-06, "loss": 0.5289, "step": 657400 }, { "epoch": 15.142304504006631, "grad_norm": 2.7934975624084473, "learning_rate": 7.678698680938324e-06, "loss": 0.5373, "step": 657600 }, { "epoch": 15.146909827760892, "grad_norm": 2.8237783908843994, "learning_rate": 7.671427117115806e-06, "loss": 0.5276, "step": 657800 }, { "epoch": 15.151515151515152, "grad_norm": 4.111272811889648, "learning_rate": 7.664155553293292e-06, "loss": 0.5264, "step": 658000 }, { "epoch": 15.156120475269411, "grad_norm": 3.4282681941986084, "learning_rate": 7.656883989470776e-06, "loss": 0.5343, "step": 658200 }, { "epoch": 15.160725799023671, "grad_norm": 2.7267274856567383, "learning_rate": 7.649612425648259e-06, "loss": 0.5311, "step": 658400 }, { "epoch": 15.165331122777932, "grad_norm": 3.3111186027526855, "learning_rate": 7.642340861825745e-06, "loss": 0.5329, "step": 658600 }, { "epoch": 15.16993644653219, "grad_norm": 2.9832489490509033, "learning_rate": 7.635069298003229e-06, "loss": 0.5277, "step": 658800 }, { "epoch": 15.174541770286451, "grad_norm": 3.5788393020629883, "learning_rate": 7.6277977341807125e-06, "loss": 0.5254, "step": 659000 }, { "epoch": 15.179147094040712, "grad_norm": 2.7909393310546875, "learning_rate": 7.6205261703581975e-06, "loss": 0.5332, "step": 659200 }, { "epoch": 15.18375241779497, "grad_norm": 3.0156285762786865, "learning_rate": 7.613254606535683e-06, "loss": 0.5396, "step": 659400 }, { "epoch": 15.18835774154923, "grad_norm": 3.3758177757263184, "learning_rate": 7.605983042713165e-06, "loss": 0.5257, "step": 659600 }, { "epoch": 15.192963065303491, "grad_norm": 3.161837100982666, "learning_rate": 7.59871147889065e-06, "loss": 0.5314, "step": 659800 }, { "epoch": 15.19756838905775, "grad_norm": 4.073904037475586, "learning_rate": 7.591476272887247e-06, "loss": 0.5275, "step": 660000 }, { "epoch": 15.20217371281201, "grad_norm": 3.1027348041534424, "learning_rate": 7.584204709064732e-06, "loss": 0.5367, "step": 660200 }, { "epoch": 15.206779036566271, "grad_norm": 2.5127763748168945, "learning_rate": 7.5769331452422155e-06, "loss": 0.5321, "step": 660400 }, { "epoch": 15.21138436032053, "grad_norm": 3.317880153656006, "learning_rate": 7.5696615814197e-06, "loss": 0.535, "step": 660600 }, { "epoch": 15.21598968407479, "grad_norm": 3.18585467338562, "learning_rate": 7.562390017597185e-06, "loss": 0.5364, "step": 660800 }, { "epoch": 15.22059500782905, "grad_norm": 3.1549441814422607, "learning_rate": 7.555118453774668e-06, "loss": 0.5288, "step": 661000 }, { "epoch": 15.225200331583311, "grad_norm": 3.0933032035827637, "learning_rate": 7.547846889952153e-06, "loss": 0.5315, "step": 661200 }, { "epoch": 15.22980565533757, "grad_norm": 3.4483559131622314, "learning_rate": 7.5405753261296375e-06, "loss": 0.5372, "step": 661400 }, { "epoch": 15.23441097909183, "grad_norm": 3.425278663635254, "learning_rate": 7.5333037623071225e-06, "loss": 0.5268, "step": 661600 }, { "epoch": 15.23901630284609, "grad_norm": 2.674301862716675, "learning_rate": 7.526032198484606e-06, "loss": 0.523, "step": 661800 }, { "epoch": 15.24362162660035, "grad_norm": 4.0348358154296875, "learning_rate": 7.518760634662091e-06, "loss": 0.5256, "step": 662000 }, { "epoch": 15.24822695035461, "grad_norm": 2.689831018447876, "learning_rate": 7.511489070839575e-06, "loss": 0.5198, "step": 662200 }, { "epoch": 15.25283227410887, "grad_norm": 3.238309860229492, "learning_rate": 7.5042175070170586e-06, "loss": 0.5295, "step": 662400 }, { "epoch": 15.25743759786313, "grad_norm": 2.907236337661743, "learning_rate": 7.496945943194544e-06, "loss": 0.5316, "step": 662600 }, { "epoch": 15.26204292161739, "grad_norm": 2.706756114959717, "learning_rate": 7.4897107371911405e-06, "loss": 0.522, "step": 662800 }, { "epoch": 15.26664824537165, "grad_norm": 3.2358510494232178, "learning_rate": 7.482439173368625e-06, "loss": 0.5382, "step": 663000 }, { "epoch": 15.271253569125909, "grad_norm": 2.505164861679077, "learning_rate": 7.475167609546109e-06, "loss": 0.5304, "step": 663200 }, { "epoch": 15.27585889288017, "grad_norm": 2.7011849880218506, "learning_rate": 7.467896045723593e-06, "loss": 0.5381, "step": 663400 }, { "epoch": 15.28046421663443, "grad_norm": 4.1736159324646, "learning_rate": 7.460624481901077e-06, "loss": 0.53, "step": 663600 }, { "epoch": 15.285069540388688, "grad_norm": 2.8597121238708496, "learning_rate": 7.4533529180785625e-06, "loss": 0.5273, "step": 663800 }, { "epoch": 15.289674864142949, "grad_norm": 4.301202297210693, "learning_rate": 7.446081354256047e-06, "loss": 0.5306, "step": 664000 }, { "epoch": 15.29428018789721, "grad_norm": 3.257692575454712, "learning_rate": 7.438809790433531e-06, "loss": 0.5249, "step": 664200 }, { "epoch": 15.29888551165147, "grad_norm": 4.112732887268066, "learning_rate": 7.431538226611015e-06, "loss": 0.5303, "step": 664400 }, { "epoch": 15.303490835405729, "grad_norm": 3.026270627975464, "learning_rate": 7.424266662788499e-06, "loss": 0.5309, "step": 664600 }, { "epoch": 15.30809615915999, "grad_norm": 3.302058219909668, "learning_rate": 7.417031456785096e-06, "loss": 0.5272, "step": 664800 }, { "epoch": 15.31270148291425, "grad_norm": 3.1343607902526855, "learning_rate": 7.409759892962581e-06, "loss": 0.543, "step": 665000 }, { "epoch": 15.317306806668508, "grad_norm": 3.4846322536468506, "learning_rate": 7.402488329140065e-06, "loss": 0.5345, "step": 665200 }, { "epoch": 15.321912130422769, "grad_norm": 2.883300304412842, "learning_rate": 7.395216765317549e-06, "loss": 0.5179, "step": 665400 }, { "epoch": 15.32651745417703, "grad_norm": 3.417581081390381, "learning_rate": 7.387945201495034e-06, "loss": 0.5259, "step": 665600 }, { "epoch": 15.331122777931288, "grad_norm": 2.906959056854248, "learning_rate": 7.380673637672518e-06, "loss": 0.5253, "step": 665800 }, { "epoch": 15.335728101685548, "grad_norm": 2.635720729827881, "learning_rate": 7.3734020738500024e-06, "loss": 0.5321, "step": 666000 }, { "epoch": 15.340333425439809, "grad_norm": 2.8805277347564697, "learning_rate": 7.366130510027487e-06, "loss": 0.5336, "step": 666200 }, { "epoch": 15.344938749194068, "grad_norm": 2.8946115970611572, "learning_rate": 7.358858946204971e-06, "loss": 0.5299, "step": 666400 }, { "epoch": 15.349544072948328, "grad_norm": 3.4212236404418945, "learning_rate": 7.351587382382456e-06, "loss": 0.5307, "step": 666600 }, { "epoch": 15.354149396702589, "grad_norm": 3.027736186981201, "learning_rate": 7.344315818559939e-06, "loss": 0.5367, "step": 666800 }, { "epoch": 15.358754720456847, "grad_norm": 2.96783185005188, "learning_rate": 7.3370442547374235e-06, "loss": 0.5288, "step": 667000 }, { "epoch": 15.363360044211108, "grad_norm": 3.2669103145599365, "learning_rate": 7.329772690914909e-06, "loss": 0.5318, "step": 667200 }, { "epoch": 15.367965367965368, "grad_norm": 3.050448417663574, "learning_rate": 7.322501127092393e-06, "loss": 0.534, "step": 667400 }, { "epoch": 15.372570691719627, "grad_norm": 4.087128162384033, "learning_rate": 7.315229563269877e-06, "loss": 0.5263, "step": 667600 }, { "epoch": 15.377176015473887, "grad_norm": 3.5883004665374756, "learning_rate": 7.307957999447361e-06, "loss": 0.5277, "step": 667800 }, { "epoch": 15.381781339228148, "grad_norm": 3.088322162628174, "learning_rate": 7.3006864356248455e-06, "loss": 0.5414, "step": 668000 }, { "epoch": 15.386386662982408, "grad_norm": 3.5304698944091797, "learning_rate": 7.2934148718023305e-06, "loss": 0.5338, "step": 668200 }, { "epoch": 15.390991986736667, "grad_norm": 3.154963731765747, "learning_rate": 7.286143307979814e-06, "loss": 0.5417, "step": 668400 }, { "epoch": 15.395597310490928, "grad_norm": 3.262460231781006, "learning_rate": 7.278871744157298e-06, "loss": 0.5163, "step": 668600 }, { "epoch": 15.400202634245188, "grad_norm": 3.046994209289551, "learning_rate": 7.271600180334783e-06, "loss": 0.5291, "step": 668800 }, { "epoch": 15.404807957999447, "grad_norm": 3.0019094944000244, "learning_rate": 7.264328616512267e-06, "loss": 0.5308, "step": 669000 }, { "epoch": 15.409413281753707, "grad_norm": 2.577397584915161, "learning_rate": 7.257057052689752e-06, "loss": 0.5328, "step": 669200 }, { "epoch": 15.414018605507968, "grad_norm": 3.689873218536377, "learning_rate": 7.249785488867236e-06, "loss": 0.5362, "step": 669400 }, { "epoch": 15.418623929262226, "grad_norm": 3.1702044010162354, "learning_rate": 7.24251392504472e-06, "loss": 0.5278, "step": 669600 }, { "epoch": 15.423229253016487, "grad_norm": 2.942192316055298, "learning_rate": 7.235278719041317e-06, "loss": 0.5438, "step": 669800 }, { "epoch": 15.427834576770747, "grad_norm": 3.0847814083099365, "learning_rate": 7.228007155218802e-06, "loss": 0.5425, "step": 670000 }, { "epoch": 15.432439900525006, "grad_norm": 2.7735490798950195, "learning_rate": 7.220735591396285e-06, "loss": 0.5362, "step": 670200 }, { "epoch": 15.437045224279267, "grad_norm": 3.166234254837036, "learning_rate": 7.21346402757377e-06, "loss": 0.528, "step": 670400 }, { "epoch": 15.441650548033527, "grad_norm": 4.274187088012695, "learning_rate": 7.206192463751255e-06, "loss": 0.5324, "step": 670600 }, { "epoch": 15.446255871787788, "grad_norm": 3.542017698287964, "learning_rate": 7.198920899928739e-06, "loss": 0.5365, "step": 670800 }, { "epoch": 15.450861195542046, "grad_norm": 3.5981078147888184, "learning_rate": 7.191649336106223e-06, "loss": 0.5345, "step": 671000 }, { "epoch": 15.455466519296307, "grad_norm": 2.7863192558288574, "learning_rate": 7.184377772283707e-06, "loss": 0.5415, "step": 671200 }, { "epoch": 15.460071843050567, "grad_norm": 3.475003957748413, "learning_rate": 7.177106208461192e-06, "loss": 0.5261, "step": 671400 }, { "epoch": 15.464677166804826, "grad_norm": 2.9916470050811768, "learning_rate": 7.169834644638677e-06, "loss": 0.5382, "step": 671600 }, { "epoch": 15.469282490559086, "grad_norm": 3.3957395553588867, "learning_rate": 7.162563080816161e-06, "loss": 0.5401, "step": 671800 }, { "epoch": 15.473887814313347, "grad_norm": 3.301769495010376, "learning_rate": 7.155291516993644e-06, "loss": 0.529, "step": 672000 }, { "epoch": 15.478493138067606, "grad_norm": 3.8502883911132812, "learning_rate": 7.148019953171129e-06, "loss": 0.5284, "step": 672200 }, { "epoch": 15.483098461821866, "grad_norm": 2.8070068359375, "learning_rate": 7.1407483893486135e-06, "loss": 0.5286, "step": 672400 }, { "epoch": 15.487703785576127, "grad_norm": 3.680961847305298, "learning_rate": 7.133476825526099e-06, "loss": 0.5374, "step": 672600 }, { "epoch": 15.492309109330385, "grad_norm": 3.303067922592163, "learning_rate": 7.126205261703582e-06, "loss": 0.5306, "step": 672800 }, { "epoch": 15.496914433084646, "grad_norm": 3.2984859943389893, "learning_rate": 7.118933697881066e-06, "loss": 0.5258, "step": 673000 }, { "epoch": 15.501519756838906, "grad_norm": 3.2153737545013428, "learning_rate": 7.111662134058551e-06, "loss": 0.5297, "step": 673200 }, { "epoch": 15.506125080593165, "grad_norm": 2.8928987979888916, "learning_rate": 7.1043905702360355e-06, "loss": 0.5369, "step": 673400 }, { "epoch": 15.510730404347425, "grad_norm": 3.484666109085083, "learning_rate": 7.097119006413519e-06, "loss": 0.5319, "step": 673600 }, { "epoch": 15.515335728101686, "grad_norm": 3.3581833839416504, "learning_rate": 7.089883800410116e-06, "loss": 0.5299, "step": 673800 }, { "epoch": 15.519941051855945, "grad_norm": 3.2200675010681152, "learning_rate": 7.082612236587601e-06, "loss": 0.5264, "step": 674000 }, { "epoch": 15.524546375610205, "grad_norm": 3.4437928199768066, "learning_rate": 7.075340672765085e-06, "loss": 0.5304, "step": 674200 }, { "epoch": 15.529151699364466, "grad_norm": 2.679928779602051, "learning_rate": 7.06806910894257e-06, "loss": 0.5364, "step": 674400 }, { "epoch": 15.533757023118724, "grad_norm": 3.410003662109375, "learning_rate": 7.0607975451200535e-06, "loss": 0.5329, "step": 674600 }, { "epoch": 15.538362346872985, "grad_norm": 2.5245554447174072, "learning_rate": 7.053525981297538e-06, "loss": 0.5294, "step": 674800 }, { "epoch": 15.542967670627245, "grad_norm": 3.400965452194214, "learning_rate": 7.046290775294135e-06, "loss": 0.5289, "step": 675000 }, { "epoch": 15.547572994381506, "grad_norm": 3.5481555461883545, "learning_rate": 7.03901921147162e-06, "loss": 0.534, "step": 675200 }, { "epoch": 15.552178318135764, "grad_norm": 3.425252914428711, "learning_rate": 7.031747647649104e-06, "loss": 0.5249, "step": 675400 }, { "epoch": 15.556783641890025, "grad_norm": 3.6418352127075195, "learning_rate": 7.024476083826587e-06, "loss": 0.5178, "step": 675600 }, { "epoch": 15.561388965644285, "grad_norm": 2.9544789791107178, "learning_rate": 7.017204520004072e-06, "loss": 0.5337, "step": 675800 }, { "epoch": 15.565994289398544, "grad_norm": 2.8568692207336426, "learning_rate": 7.0099329561815565e-06, "loss": 0.5304, "step": 676000 }, { "epoch": 15.570599613152805, "grad_norm": 3.2933669090270996, "learning_rate": 7.002661392359042e-06, "loss": 0.5202, "step": 676200 }, { "epoch": 15.575204936907065, "grad_norm": 2.9382436275482178, "learning_rate": 6.995389828536525e-06, "loss": 0.5256, "step": 676400 }, { "epoch": 15.579810260661324, "grad_norm": 2.4080753326416016, "learning_rate": 6.988118264714009e-06, "loss": 0.5245, "step": 676600 }, { "epoch": 15.584415584415584, "grad_norm": 2.816089630126953, "learning_rate": 6.980846700891494e-06, "loss": 0.5256, "step": 676800 }, { "epoch": 15.589020908169845, "grad_norm": 2.963587999343872, "learning_rate": 6.9735751370689785e-06, "loss": 0.5329, "step": 677000 }, { "epoch": 15.593626231924103, "grad_norm": 3.300342559814453, "learning_rate": 6.966303573246462e-06, "loss": 0.5376, "step": 677200 }, { "epoch": 15.598231555678364, "grad_norm": 3.2802958488464355, "learning_rate": 6.959032009423947e-06, "loss": 0.528, "step": 677400 }, { "epoch": 15.602836879432624, "grad_norm": 2.6798970699310303, "learning_rate": 6.951760445601431e-06, "loss": 0.5237, "step": 677600 }, { "epoch": 15.607442203186885, "grad_norm": 3.517439603805542, "learning_rate": 6.944488881778916e-06, "loss": 0.5408, "step": 677800 }, { "epoch": 15.612047526941144, "grad_norm": 3.87048602104187, "learning_rate": 6.9372173179564e-06, "loss": 0.534, "step": 678000 }, { "epoch": 15.616652850695404, "grad_norm": 2.775428295135498, "learning_rate": 6.929945754133884e-06, "loss": 0.5248, "step": 678200 }, { "epoch": 15.621258174449665, "grad_norm": 2.7201571464538574, "learning_rate": 6.922710548130481e-06, "loss": 0.5338, "step": 678400 }, { "epoch": 15.625863498203923, "grad_norm": 2.8548715114593506, "learning_rate": 6.915438984307966e-06, "loss": 0.5275, "step": 678600 }, { "epoch": 15.630468821958184, "grad_norm": 2.6835644245147705, "learning_rate": 6.90816742048545e-06, "loss": 0.532, "step": 678800 }, { "epoch": 15.635074145712444, "grad_norm": 2.5075621604919434, "learning_rate": 6.900895856662933e-06, "loss": 0.5155, "step": 679000 }, { "epoch": 15.639679469466703, "grad_norm": 3.476208448410034, "learning_rate": 6.89366065065953e-06, "loss": 0.5243, "step": 679200 }, { "epoch": 15.644284793220963, "grad_norm": 2.8655552864074707, "learning_rate": 6.886389086837015e-06, "loss": 0.5279, "step": 679400 }, { "epoch": 15.648890116975224, "grad_norm": 2.7669894695281982, "learning_rate": 6.8791175230144996e-06, "loss": 0.5381, "step": 679600 }, { "epoch": 15.653495440729483, "grad_norm": 4.442414283752441, "learning_rate": 6.871845959191985e-06, "loss": 0.5195, "step": 679800 }, { "epoch": 15.658100764483743, "grad_norm": 2.7395546436309814, "learning_rate": 6.864574395369468e-06, "loss": 0.5274, "step": 680000 }, { "epoch": 15.662706088238004, "grad_norm": 3.1513760089874268, "learning_rate": 6.857302831546952e-06, "loss": 0.5258, "step": 680200 }, { "epoch": 15.667311411992262, "grad_norm": 3.7221457958221436, "learning_rate": 6.850031267724437e-06, "loss": 0.5325, "step": 680400 }, { "epoch": 15.671916735746523, "grad_norm": 2.821197271347046, "learning_rate": 6.8427597039019215e-06, "loss": 0.5167, "step": 680600 }, { "epoch": 15.676522059500783, "grad_norm": 3.9420833587646484, "learning_rate": 6.835488140079406e-06, "loss": 0.5301, "step": 680800 }, { "epoch": 15.681127383255042, "grad_norm": 3.045525312423706, "learning_rate": 6.82821657625689e-06, "loss": 0.5269, "step": 681000 }, { "epoch": 15.685732707009302, "grad_norm": 3.1435394287109375, "learning_rate": 6.820945012434374e-06, "loss": 0.5343, "step": 681200 }, { "epoch": 15.690338030763563, "grad_norm": 3.5429775714874268, "learning_rate": 6.813673448611859e-06, "loss": 0.5335, "step": 681400 }, { "epoch": 15.694943354517823, "grad_norm": 3.320526361465454, "learning_rate": 6.8064018847893435e-06, "loss": 0.5305, "step": 681600 }, { "epoch": 15.699548678272082, "grad_norm": 3.505725860595703, "learning_rate": 6.799130320966827e-06, "loss": 0.5378, "step": 681800 }, { "epoch": 15.704154002026343, "grad_norm": 3.6867668628692627, "learning_rate": 6.791858757144312e-06, "loss": 0.5328, "step": 682000 }, { "epoch": 15.708759325780603, "grad_norm": 3.264688491821289, "learning_rate": 6.784587193321796e-06, "loss": 0.5392, "step": 682200 }, { "epoch": 15.713364649534862, "grad_norm": 2.858060121536255, "learning_rate": 6.77731562949928e-06, "loss": 0.5387, "step": 682400 }, { "epoch": 15.717969973289122, "grad_norm": 3.1449389457702637, "learning_rate": 6.7700440656767646e-06, "loss": 0.5224, "step": 682600 }, { "epoch": 15.722575297043383, "grad_norm": 4.474485397338867, "learning_rate": 6.762772501854249e-06, "loss": 0.5496, "step": 682800 }, { "epoch": 15.727180620797641, "grad_norm": 3.6581742763519287, "learning_rate": 6.755500938031734e-06, "loss": 0.5347, "step": 683000 }, { "epoch": 15.731785944551902, "grad_norm": 3.4204602241516113, "learning_rate": 6.748229374209218e-06, "loss": 0.5319, "step": 683200 }, { "epoch": 15.736391268306162, "grad_norm": 3.710782289505005, "learning_rate": 6.7409578103867014e-06, "loss": 0.5362, "step": 683400 }, { "epoch": 15.740996592060421, "grad_norm": 3.196833610534668, "learning_rate": 6.7336862465641865e-06, "loss": 0.5305, "step": 683600 }, { "epoch": 15.745601915814682, "grad_norm": 3.187162160873413, "learning_rate": 6.726414682741671e-06, "loss": 0.5248, "step": 683800 }, { "epoch": 15.750207239568942, "grad_norm": 3.471301794052124, "learning_rate": 6.719143118919155e-06, "loss": 0.5289, "step": 684000 }, { "epoch": 15.7548125633232, "grad_norm": 3.092719316482544, "learning_rate": 6.711871555096639e-06, "loss": 0.5483, "step": 684200 }, { "epoch": 15.759417887077461, "grad_norm": 2.3404221534729004, "learning_rate": 6.704599991274123e-06, "loss": 0.5327, "step": 684400 }, { "epoch": 15.764023210831722, "grad_norm": 3.6259121894836426, "learning_rate": 6.69736478527072e-06, "loss": 0.53, "step": 684600 }, { "epoch": 15.768628534585982, "grad_norm": 3.0392374992370605, "learning_rate": 6.690129579267317e-06, "loss": 0.5328, "step": 684800 }, { "epoch": 15.773233858340241, "grad_norm": 3.2242770195007324, "learning_rate": 6.682858015444802e-06, "loss": 0.5337, "step": 685000 }, { "epoch": 15.777839182094501, "grad_norm": 2.7651524543762207, "learning_rate": 6.6755864516222865e-06, "loss": 0.5286, "step": 685200 }, { "epoch": 15.782444505848762, "grad_norm": 2.7037301063537598, "learning_rate": 6.66831488779977e-06, "loss": 0.5247, "step": 685400 }, { "epoch": 15.78704982960302, "grad_norm": 3.071185350418091, "learning_rate": 6.661043323977255e-06, "loss": 0.5305, "step": 685600 }, { "epoch": 15.791655153357281, "grad_norm": 3.1269609928131104, "learning_rate": 6.653771760154739e-06, "loss": 0.523, "step": 685800 }, { "epoch": 15.796260477111542, "grad_norm": 2.9973862171173096, "learning_rate": 6.646500196332223e-06, "loss": 0.5328, "step": 686000 }, { "epoch": 15.8008658008658, "grad_norm": 2.5962016582489014, "learning_rate": 6.6392286325097076e-06, "loss": 0.5325, "step": 686200 }, { "epoch": 15.80547112462006, "grad_norm": 3.33072566986084, "learning_rate": 6.631957068687192e-06, "loss": 0.5284, "step": 686400 }, { "epoch": 15.810076448374321, "grad_norm": 2.7956624031066895, "learning_rate": 6.624685504864677e-06, "loss": 0.5206, "step": 686600 }, { "epoch": 15.81468177212858, "grad_norm": 3.8837435245513916, "learning_rate": 6.617413941042161e-06, "loss": 0.5238, "step": 686800 }, { "epoch": 15.81928709588284, "grad_norm": 3.072110652923584, "learning_rate": 6.6101423772196445e-06, "loss": 0.5346, "step": 687000 }, { "epoch": 15.823892419637101, "grad_norm": 2.895686388015747, "learning_rate": 6.6028708133971295e-06, "loss": 0.5327, "step": 687200 }, { "epoch": 15.82849774339136, "grad_norm": 3.0027430057525635, "learning_rate": 6.595599249574614e-06, "loss": 0.5186, "step": 687400 }, { "epoch": 15.83310306714562, "grad_norm": 2.7608888149261475, "learning_rate": 6.588327685752098e-06, "loss": 0.5347, "step": 687600 }, { "epoch": 15.83770839089988, "grad_norm": 3.104914903640747, "learning_rate": 6.581056121929582e-06, "loss": 0.5289, "step": 687800 }, { "epoch": 15.84231371465414, "grad_norm": 4.210239410400391, "learning_rate": 6.573784558107066e-06, "loss": 0.542, "step": 688000 }, { "epoch": 15.8469190384084, "grad_norm": 2.743356704711914, "learning_rate": 6.566512994284551e-06, "loss": 0.5315, "step": 688200 }, { "epoch": 15.85152436216266, "grad_norm": 3.4717984199523926, "learning_rate": 6.559241430462036e-06, "loss": 0.5337, "step": 688400 }, { "epoch": 15.85612968591692, "grad_norm": 3.4075236320495605, "learning_rate": 6.551969866639519e-06, "loss": 0.5363, "step": 688600 }, { "epoch": 15.86073500967118, "grad_norm": 4.167804718017578, "learning_rate": 6.544698302817004e-06, "loss": 0.5274, "step": 688800 }, { "epoch": 15.86534033342544, "grad_norm": 3.3863329887390137, "learning_rate": 6.537426738994488e-06, "loss": 0.5284, "step": 689000 }, { "epoch": 15.8699456571797, "grad_norm": 3.17704439163208, "learning_rate": 6.5301551751719726e-06, "loss": 0.5336, "step": 689200 }, { "epoch": 15.874550980933959, "grad_norm": 2.546211004257202, "learning_rate": 6.522883611349458e-06, "loss": 0.5344, "step": 689400 }, { "epoch": 15.87915630468822, "grad_norm": 3.9471852779388428, "learning_rate": 6.515612047526941e-06, "loss": 0.5266, "step": 689600 }, { "epoch": 15.88376162844248, "grad_norm": 2.803940773010254, "learning_rate": 6.508340483704425e-06, "loss": 0.5422, "step": 689800 }, { "epoch": 15.888366952196739, "grad_norm": 3.0068013668060303, "learning_rate": 6.50106891988191e-06, "loss": 0.5318, "step": 690000 }, { "epoch": 15.892972275951, "grad_norm": 3.8956100940704346, "learning_rate": 6.4937973560593945e-06, "loss": 0.5329, "step": 690200 }, { "epoch": 15.89757759970526, "grad_norm": 2.6682162284851074, "learning_rate": 6.486562150055991e-06, "loss": 0.5214, "step": 690400 }, { "epoch": 15.902182923459518, "grad_norm": 3.1031453609466553, "learning_rate": 6.479290586233476e-06, "loss": 0.5215, "step": 690600 }, { "epoch": 15.906788247213779, "grad_norm": 3.070067882537842, "learning_rate": 6.47201902241096e-06, "loss": 0.5341, "step": 690800 }, { "epoch": 15.91139357096804, "grad_norm": 3.4854178428649902, "learning_rate": 6.464747458588444e-06, "loss": 0.5161, "step": 691000 }, { "epoch": 15.9159988947223, "grad_norm": 2.887986421585083, "learning_rate": 6.457475894765929e-06, "loss": 0.5282, "step": 691200 }, { "epoch": 15.920604218476559, "grad_norm": 3.0503897666931152, "learning_rate": 6.4502043309434125e-06, "loss": 0.5358, "step": 691400 }, { "epoch": 15.925209542230819, "grad_norm": 2.886718988418579, "learning_rate": 6.442932767120897e-06, "loss": 0.5237, "step": 691600 }, { "epoch": 15.92981486598508, "grad_norm": 3.217623710632324, "learning_rate": 6.435661203298382e-06, "loss": 0.5236, "step": 691800 }, { "epoch": 15.934420189739338, "grad_norm": 3.3502800464630127, "learning_rate": 6.428389639475866e-06, "loss": 0.531, "step": 692000 }, { "epoch": 15.939025513493599, "grad_norm": 2.8795714378356934, "learning_rate": 6.42111807565335e-06, "loss": 0.5261, "step": 692200 }, { "epoch": 15.94363083724786, "grad_norm": 3.2952957153320312, "learning_rate": 6.4138465118308345e-06, "loss": 0.5382, "step": 692400 }, { "epoch": 15.948236161002118, "grad_norm": 3.00901460647583, "learning_rate": 6.406574948008319e-06, "loss": 0.5271, "step": 692600 }, { "epoch": 15.952841484756378, "grad_norm": 2.900832414627075, "learning_rate": 6.399303384185804e-06, "loss": 0.5227, "step": 692800 }, { "epoch": 15.957446808510639, "grad_norm": 3.579869031906128, "learning_rate": 6.392031820363287e-06, "loss": 0.5219, "step": 693000 }, { "epoch": 15.962052132264898, "grad_norm": 2.8938448429107666, "learning_rate": 6.384760256540771e-06, "loss": 0.5369, "step": 693200 }, { "epoch": 15.966657456019158, "grad_norm": 3.0627634525299072, "learning_rate": 6.377488692718256e-06, "loss": 0.5327, "step": 693400 }, { "epoch": 15.971262779773419, "grad_norm": 3.0290048122406006, "learning_rate": 6.370217128895741e-06, "loss": 0.5283, "step": 693600 }, { "epoch": 15.975868103527677, "grad_norm": 2.843632698059082, "learning_rate": 6.362945565073225e-06, "loss": 0.5388, "step": 693800 }, { "epoch": 15.980473427281938, "grad_norm": 3.2812376022338867, "learning_rate": 6.355674001250709e-06, "loss": 0.5199, "step": 694000 }, { "epoch": 15.985078751036198, "grad_norm": 3.616701126098633, "learning_rate": 6.348438795247306e-06, "loss": 0.5359, "step": 694200 }, { "epoch": 15.989684074790457, "grad_norm": 3.0894737243652344, "learning_rate": 6.34116723142479e-06, "loss": 0.5268, "step": 694400 }, { "epoch": 15.994289398544717, "grad_norm": 3.021001100540161, "learning_rate": 6.333895667602275e-06, "loss": 0.5346, "step": 694600 }, { "epoch": 15.998894722298978, "grad_norm": 3.486922264099121, "learning_rate": 6.326624103779759e-06, "loss": 0.5314, "step": 694800 }, { "epoch": 16.0, "eval_loss": 0.5152611136436462, "eval_runtime": 161.1159, "eval_samples_per_second": 176.029, "eval_steps_per_second": 11.005, "step": 694848 }, { "epoch": 16.003500046053237, "grad_norm": 3.235459566116333, "learning_rate": 6.319352539957243e-06, "loss": 0.5261, "step": 695000 }, { "epoch": 16.0081053698075, "grad_norm": 2.7268731594085693, "learning_rate": 6.312080976134728e-06, "loss": 0.5294, "step": 695200 }, { "epoch": 16.012710693561758, "grad_norm": 2.6637632846832275, "learning_rate": 6.304809412312212e-06, "loss": 0.5276, "step": 695400 }, { "epoch": 16.017316017316016, "grad_norm": 2.3737680912017822, "learning_rate": 6.297537848489696e-06, "loss": 0.5162, "step": 695600 }, { "epoch": 16.02192134107028, "grad_norm": 2.3051598072052, "learning_rate": 6.2902662846671806e-06, "loss": 0.5141, "step": 695800 }, { "epoch": 16.026526664824537, "grad_norm": 3.3139994144439697, "learning_rate": 6.282994720844665e-06, "loss": 0.5175, "step": 696000 }, { "epoch": 16.031131988578796, "grad_norm": 2.9257097244262695, "learning_rate": 6.27572315702215e-06, "loss": 0.5121, "step": 696200 }, { "epoch": 16.035737312333058, "grad_norm": 3.4968149662017822, "learning_rate": 6.268451593199633e-06, "loss": 0.5392, "step": 696400 }, { "epoch": 16.040342636087317, "grad_norm": 2.8969199657440186, "learning_rate": 6.2611800293771174e-06, "loss": 0.5296, "step": 696600 }, { "epoch": 16.044947959841576, "grad_norm": 2.9978652000427246, "learning_rate": 6.253944823373714e-06, "loss": 0.5304, "step": 696800 }, { "epoch": 16.049553283595838, "grad_norm": 3.812542676925659, "learning_rate": 6.246673259551199e-06, "loss": 0.526, "step": 697000 }, { "epoch": 16.054158607350097, "grad_norm": 3.203511953353882, "learning_rate": 6.239401695728684e-06, "loss": 0.5188, "step": 697200 }, { "epoch": 16.058763931104355, "grad_norm": 3.406111478805542, "learning_rate": 6.232130131906168e-06, "loss": 0.5199, "step": 697400 }, { "epoch": 16.063369254858618, "grad_norm": 3.4208409786224365, "learning_rate": 6.224858568083652e-06, "loss": 0.5317, "step": 697600 }, { "epoch": 16.067974578612876, "grad_norm": 3.2451865673065186, "learning_rate": 6.217587004261136e-06, "loss": 0.5201, "step": 697800 }, { "epoch": 16.072579902367135, "grad_norm": 2.8705670833587646, "learning_rate": 6.210315440438621e-06, "loss": 0.53, "step": 698000 }, { "epoch": 16.077185226121397, "grad_norm": 3.8681085109710693, "learning_rate": 6.203043876616106e-06, "loss": 0.5249, "step": 698200 }, { "epoch": 16.081790549875656, "grad_norm": 4.016938209533691, "learning_rate": 6.195772312793589e-06, "loss": 0.5138, "step": 698400 }, { "epoch": 16.086395873629915, "grad_norm": 2.935976266860962, "learning_rate": 6.188500748971074e-06, "loss": 0.5238, "step": 698600 }, { "epoch": 16.091001197384177, "grad_norm": 3.2641563415527344, "learning_rate": 6.181229185148558e-06, "loss": 0.5174, "step": 698800 }, { "epoch": 16.095606521138436, "grad_norm": 3.6100635528564453, "learning_rate": 6.173993979145155e-06, "loss": 0.5299, "step": 699000 }, { "epoch": 16.100211844892694, "grad_norm": 3.5034003257751465, "learning_rate": 6.16672241532264e-06, "loss": 0.5293, "step": 699200 }, { "epoch": 16.104817168646957, "grad_norm": 2.800861358642578, "learning_rate": 6.159450851500124e-06, "loss": 0.5214, "step": 699400 }, { "epoch": 16.109422492401215, "grad_norm": 3.0961642265319824, "learning_rate": 6.152179287677608e-06, "loss": 0.5254, "step": 699600 }, { "epoch": 16.114027816155474, "grad_norm": 2.8042900562286377, "learning_rate": 6.144907723855093e-06, "loss": 0.5155, "step": 699800 }, { "epoch": 16.118633139909736, "grad_norm": 3.172004461288452, "learning_rate": 6.137636160032577e-06, "loss": 0.5195, "step": 700000 }, { "epoch": 16.123238463663995, "grad_norm": 3.4041786193847656, "learning_rate": 6.1303645962100605e-06, "loss": 0.5156, "step": 700200 }, { "epoch": 16.127843787418257, "grad_norm": 3.219374895095825, "learning_rate": 6.1230930323875455e-06, "loss": 0.5299, "step": 700400 }, { "epoch": 16.132449111172516, "grad_norm": 2.8847038745880127, "learning_rate": 6.11582146856503e-06, "loss": 0.5128, "step": 700600 }, { "epoch": 16.137054434926775, "grad_norm": 2.9731578826904297, "learning_rate": 6.108622620380739e-06, "loss": 0.5165, "step": 700800 }, { "epoch": 16.141659758681037, "grad_norm": 3.0595650672912598, "learning_rate": 6.1013510565582236e-06, "loss": 0.5284, "step": 701000 }, { "epoch": 16.146265082435296, "grad_norm": 2.8716259002685547, "learning_rate": 6.094079492735709e-06, "loss": 0.519, "step": 701200 }, { "epoch": 16.150870406189554, "grad_norm": 3.1690585613250732, "learning_rate": 6.086807928913192e-06, "loss": 0.529, "step": 701400 }, { "epoch": 16.155475729943817, "grad_norm": 2.8554840087890625, "learning_rate": 6.079536365090676e-06, "loss": 0.5153, "step": 701600 }, { "epoch": 16.160081053698075, "grad_norm": 2.701791524887085, "learning_rate": 6.072264801268161e-06, "loss": 0.5385, "step": 701800 }, { "epoch": 16.164686377452334, "grad_norm": 2.9476139545440674, "learning_rate": 6.0649932374456455e-06, "loss": 0.5344, "step": 702000 }, { "epoch": 16.169291701206596, "grad_norm": 3.677595853805542, "learning_rate": 6.057721673623129e-06, "loss": 0.5302, "step": 702200 }, { "epoch": 16.173897024960855, "grad_norm": 3.322723388671875, "learning_rate": 6.050450109800614e-06, "loss": 0.5301, "step": 702400 }, { "epoch": 16.178502348715114, "grad_norm": 3.177597761154175, "learning_rate": 6.043178545978098e-06, "loss": 0.5293, "step": 702600 }, { "epoch": 16.183107672469376, "grad_norm": 2.9423975944519043, "learning_rate": 6.035906982155583e-06, "loss": 0.5263, "step": 702800 }, { "epoch": 16.187712996223635, "grad_norm": 3.1583240032196045, "learning_rate": 6.028635418333067e-06, "loss": 0.5101, "step": 703000 }, { "epoch": 16.192318319977893, "grad_norm": 2.8584578037261963, "learning_rate": 6.021363854510551e-06, "loss": 0.523, "step": 703200 }, { "epoch": 16.196923643732156, "grad_norm": 2.663959503173828, "learning_rate": 6.014092290688036e-06, "loss": 0.5341, "step": 703400 }, { "epoch": 16.201528967486414, "grad_norm": 3.1101975440979004, "learning_rate": 6.00682072686552e-06, "loss": 0.5324, "step": 703600 }, { "epoch": 16.206134291240673, "grad_norm": 3.0388479232788086, "learning_rate": 5.9995491630430035e-06, "loss": 0.5287, "step": 703800 }, { "epoch": 16.210739614994935, "grad_norm": 2.731867790222168, "learning_rate": 5.9922775992204886e-06, "loss": 0.5305, "step": 704000 }, { "epoch": 16.215344938749194, "grad_norm": 3.142996311187744, "learning_rate": 5.985006035397973e-06, "loss": 0.5323, "step": 704200 }, { "epoch": 16.219950262503453, "grad_norm": 3.147834062576294, "learning_rate": 5.977734471575458e-06, "loss": 0.5253, "step": 704400 }, { "epoch": 16.224555586257715, "grad_norm": 2.3879802227020264, "learning_rate": 5.970462907752941e-06, "loss": 0.5245, "step": 704600 }, { "epoch": 16.229160910011974, "grad_norm": 2.7794997692108154, "learning_rate": 5.9631913439304254e-06, "loss": 0.5124, "step": 704800 }, { "epoch": 16.233766233766232, "grad_norm": 3.1171607971191406, "learning_rate": 5.9559197801079105e-06, "loss": 0.5293, "step": 705000 }, { "epoch": 16.238371557520495, "grad_norm": 3.1548678874969482, "learning_rate": 5.948648216285395e-06, "loss": 0.523, "step": 705200 }, { "epoch": 16.242976881274753, "grad_norm": 3.2496373653411865, "learning_rate": 5.941376652462878e-06, "loss": 0.5198, "step": 705400 }, { "epoch": 16.247582205029012, "grad_norm": 3.0726122856140137, "learning_rate": 5.934105088640363e-06, "loss": 0.5216, "step": 705600 }, { "epoch": 16.252187528783274, "grad_norm": 3.4089269638061523, "learning_rate": 5.92686988263696e-06, "loss": 0.5391, "step": 705800 }, { "epoch": 16.256792852537533, "grad_norm": 2.813822031021118, "learning_rate": 5.919598318814444e-06, "loss": 0.5194, "step": 706000 }, { "epoch": 16.26139817629179, "grad_norm": 3.2995693683624268, "learning_rate": 5.912326754991929e-06, "loss": 0.5242, "step": 706200 }, { "epoch": 16.266003500046054, "grad_norm": 3.632739543914795, "learning_rate": 5.905055191169413e-06, "loss": 0.5327, "step": 706400 }, { "epoch": 16.270608823800313, "grad_norm": 2.6692042350769043, "learning_rate": 5.897783627346897e-06, "loss": 0.5227, "step": 706600 }, { "epoch": 16.27521414755457, "grad_norm": 3.4040117263793945, "learning_rate": 5.890512063524382e-06, "loss": 0.524, "step": 706800 }, { "epoch": 16.279819471308834, "grad_norm": 2.4502665996551514, "learning_rate": 5.883240499701866e-06, "loss": 0.5203, "step": 707000 }, { "epoch": 16.284424795063092, "grad_norm": 3.1462149620056152, "learning_rate": 5.8759689358793504e-06, "loss": 0.5262, "step": 707200 }, { "epoch": 16.289030118817355, "grad_norm": 3.5388402938842773, "learning_rate": 5.868697372056835e-06, "loss": 0.5278, "step": 707400 }, { "epoch": 16.293635442571613, "grad_norm": 2.5811402797698975, "learning_rate": 5.861425808234319e-06, "loss": 0.5255, "step": 707600 }, { "epoch": 16.298240766325872, "grad_norm": 2.9123117923736572, "learning_rate": 5.854154244411804e-06, "loss": 0.5326, "step": 707800 }, { "epoch": 16.302846090080134, "grad_norm": 3.3094441890716553, "learning_rate": 5.846919038408401e-06, "loss": 0.5216, "step": 708000 }, { "epoch": 16.307451413834393, "grad_norm": 3.1968655586242676, "learning_rate": 5.839647474585885e-06, "loss": 0.5238, "step": 708200 }, { "epoch": 16.31205673758865, "grad_norm": 3.054832935333252, "learning_rate": 5.8323759107633685e-06, "loss": 0.5256, "step": 708400 }, { "epoch": 16.316662061342914, "grad_norm": 2.9398386478424072, "learning_rate": 5.8251043469408535e-06, "loss": 0.5221, "step": 708600 }, { "epoch": 16.321267385097173, "grad_norm": 2.4585120677948, "learning_rate": 5.817832783118338e-06, "loss": 0.5256, "step": 708800 }, { "epoch": 16.32587270885143, "grad_norm": 3.5368127822875977, "learning_rate": 5.810561219295822e-06, "loss": 0.5297, "step": 709000 }, { "epoch": 16.330478032605694, "grad_norm": 3.043287515640259, "learning_rate": 5.803289655473306e-06, "loss": 0.5319, "step": 709200 }, { "epoch": 16.335083356359952, "grad_norm": 3.1237127780914307, "learning_rate": 5.79601809165079e-06, "loss": 0.5316, "step": 709400 }, { "epoch": 16.33968868011421, "grad_norm": 3.545804500579834, "learning_rate": 5.7887465278282755e-06, "loss": 0.5328, "step": 709600 }, { "epoch": 16.344294003868473, "grad_norm": 3.1963136196136475, "learning_rate": 5.78147496400576e-06, "loss": 0.5225, "step": 709800 }, { "epoch": 16.348899327622732, "grad_norm": 3.5646724700927734, "learning_rate": 5.774203400183243e-06, "loss": 0.5219, "step": 710000 }, { "epoch": 16.35350465137699, "grad_norm": 3.244006395339966, "learning_rate": 5.766931836360728e-06, "loss": 0.5275, "step": 710200 }, { "epoch": 16.358109975131253, "grad_norm": 3.042506456375122, "learning_rate": 5.759660272538212e-06, "loss": 0.5267, "step": 710400 }, { "epoch": 16.36271529888551, "grad_norm": 3.204932689666748, "learning_rate": 5.752425066534809e-06, "loss": 0.5237, "step": 710600 }, { "epoch": 16.36732062263977, "grad_norm": 4.271747589111328, "learning_rate": 5.7451535027122935e-06, "loss": 0.5332, "step": 710800 }, { "epoch": 16.371925946394033, "grad_norm": 2.5158495903015137, "learning_rate": 5.737881938889778e-06, "loss": 0.5368, "step": 711000 }, { "epoch": 16.37653127014829, "grad_norm": 3.7071948051452637, "learning_rate": 5.730610375067262e-06, "loss": 0.5206, "step": 711200 }, { "epoch": 16.38113659390255, "grad_norm": 3.480548858642578, "learning_rate": 5.723338811244747e-06, "loss": 0.5189, "step": 711400 }, { "epoch": 16.385741917656812, "grad_norm": 3.0035040378570557, "learning_rate": 5.716067247422231e-06, "loss": 0.527, "step": 711600 }, { "epoch": 16.39034724141107, "grad_norm": 3.1353304386138916, "learning_rate": 5.7087956835997146e-06, "loss": 0.5278, "step": 711800 }, { "epoch": 16.39495256516533, "grad_norm": 3.6932437419891357, "learning_rate": 5.7015241197772e-06, "loss": 0.5242, "step": 712000 }, { "epoch": 16.399557888919592, "grad_norm": 3.175496816635132, "learning_rate": 5.694252555954684e-06, "loss": 0.5197, "step": 712200 }, { "epoch": 16.40416321267385, "grad_norm": 3.2287960052490234, "learning_rate": 5.686980992132168e-06, "loss": 0.5237, "step": 712400 }, { "epoch": 16.40876853642811, "grad_norm": 3.57425594329834, "learning_rate": 5.679709428309652e-06, "loss": 0.5292, "step": 712600 }, { "epoch": 16.41337386018237, "grad_norm": 3.088691234588623, "learning_rate": 5.6724378644871365e-06, "loss": 0.5241, "step": 712800 }, { "epoch": 16.41797918393663, "grad_norm": 2.952932119369507, "learning_rate": 5.665202658483733e-06, "loss": 0.5209, "step": 713000 }, { "epoch": 16.42258450769089, "grad_norm": 3.005553960800171, "learning_rate": 5.6579310946612185e-06, "loss": 0.5168, "step": 713200 }, { "epoch": 16.42718983144515, "grad_norm": 3.226658821105957, "learning_rate": 5.650659530838703e-06, "loss": 0.525, "step": 713400 }, { "epoch": 16.43179515519941, "grad_norm": 2.5592401027679443, "learning_rate": 5.643387967016186e-06, "loss": 0.5251, "step": 713600 }, { "epoch": 16.436400478953672, "grad_norm": 3.2084832191467285, "learning_rate": 5.636116403193671e-06, "loss": 0.5257, "step": 713800 }, { "epoch": 16.44100580270793, "grad_norm": 3.55430006980896, "learning_rate": 5.628844839371155e-06, "loss": 0.5308, "step": 714000 }, { "epoch": 16.44561112646219, "grad_norm": 3.086090564727783, "learning_rate": 5.62157327554864e-06, "loss": 0.5316, "step": 714200 }, { "epoch": 16.450216450216452, "grad_norm": 3.0008442401885986, "learning_rate": 5.614301711726124e-06, "loss": 0.5259, "step": 714400 }, { "epoch": 16.45482177397071, "grad_norm": 2.9424831867218018, "learning_rate": 5.607030147903608e-06, "loss": 0.5265, "step": 714600 }, { "epoch": 16.45942709772497, "grad_norm": 3.0486032962799072, "learning_rate": 5.599758584081093e-06, "loss": 0.5277, "step": 714800 }, { "epoch": 16.46403242147923, "grad_norm": 2.7830045223236084, "learning_rate": 5.592487020258577e-06, "loss": 0.5324, "step": 715000 }, { "epoch": 16.46863774523349, "grad_norm": 3.4023308753967285, "learning_rate": 5.585215456436061e-06, "loss": 0.5357, "step": 715200 }, { "epoch": 16.47324306898775, "grad_norm": 3.0721235275268555, "learning_rate": 5.577943892613546e-06, "loss": 0.5222, "step": 715400 }, { "epoch": 16.47784839274201, "grad_norm": 3.580204486846924, "learning_rate": 5.57067232879103e-06, "loss": 0.5206, "step": 715600 }, { "epoch": 16.48245371649627, "grad_norm": 3.269157648086548, "learning_rate": 5.563400764968514e-06, "loss": 0.5242, "step": 715800 }, { "epoch": 16.48705904025053, "grad_norm": 4.157374858856201, "learning_rate": 5.556129201145998e-06, "loss": 0.5103, "step": 716000 }, { "epoch": 16.49166436400479, "grad_norm": 3.5531797409057617, "learning_rate": 5.548857637323483e-06, "loss": 0.5272, "step": 716200 }, { "epoch": 16.49626968775905, "grad_norm": 3.210127353668213, "learning_rate": 5.541586073500967e-06, "loss": 0.5267, "step": 716400 }, { "epoch": 16.50087501151331, "grad_norm": 3.227973461151123, "learning_rate": 5.534314509678452e-06, "loss": 0.5219, "step": 716600 }, { "epoch": 16.50548033526757, "grad_norm": 3.346400499343872, "learning_rate": 5.527042945855936e-06, "loss": 0.532, "step": 716800 }, { "epoch": 16.51008565902183, "grad_norm": 3.1097564697265625, "learning_rate": 5.51977138203342e-06, "loss": 0.5206, "step": 717000 }, { "epoch": 16.514690982776088, "grad_norm": 3.079420328140259, "learning_rate": 5.5124998182109046e-06, "loss": 0.5197, "step": 717200 }, { "epoch": 16.51929630653035, "grad_norm": 2.728698968887329, "learning_rate": 5.505228254388389e-06, "loss": 0.5102, "step": 717400 }, { "epoch": 16.52390163028461, "grad_norm": 2.969097137451172, "learning_rate": 5.497993048384986e-06, "loss": 0.517, "step": 717600 }, { "epoch": 16.528506954038868, "grad_norm": 4.207714557647705, "learning_rate": 5.490721484562471e-06, "loss": 0.5329, "step": 717800 }, { "epoch": 16.53311227779313, "grad_norm": 3.510308027267456, "learning_rate": 5.483449920739954e-06, "loss": 0.5141, "step": 718000 }, { "epoch": 16.53771760154739, "grad_norm": 3.0636706352233887, "learning_rate": 5.476178356917438e-06, "loss": 0.5215, "step": 718200 }, { "epoch": 16.542322925301647, "grad_norm": 3.4108800888061523, "learning_rate": 5.468943150914035e-06, "loss": 0.5338, "step": 718400 }, { "epoch": 16.54692824905591, "grad_norm": 3.3463311195373535, "learning_rate": 5.46167158709152e-06, "loss": 0.5261, "step": 718600 }, { "epoch": 16.55153357281017, "grad_norm": 3.2347726821899414, "learning_rate": 5.4544000232690045e-06, "loss": 0.5086, "step": 718800 }, { "epoch": 16.556138896564427, "grad_norm": 3.8457565307617188, "learning_rate": 5.447128459446489e-06, "loss": 0.5202, "step": 719000 }, { "epoch": 16.56074422031869, "grad_norm": 2.905299425125122, "learning_rate": 5.439856895623973e-06, "loss": 0.5187, "step": 719200 }, { "epoch": 16.565349544072948, "grad_norm": 3.7381389141082764, "learning_rate": 5.432585331801457e-06, "loss": 0.5302, "step": 719400 }, { "epoch": 16.569954867827207, "grad_norm": 2.6707417964935303, "learning_rate": 5.425313767978942e-06, "loss": 0.534, "step": 719600 }, { "epoch": 16.57456019158147, "grad_norm": 2.757823944091797, "learning_rate": 5.418042204156426e-06, "loss": 0.5162, "step": 719800 }, { "epoch": 16.579165515335728, "grad_norm": 2.5773825645446777, "learning_rate": 5.41077064033391e-06, "loss": 0.527, "step": 720000 }, { "epoch": 16.583770839089986, "grad_norm": 3.3267204761505127, "learning_rate": 5.403499076511395e-06, "loss": 0.5241, "step": 720200 }, { "epoch": 16.58837616284425, "grad_norm": 2.8863368034362793, "learning_rate": 5.396227512688879e-06, "loss": 0.5274, "step": 720400 }, { "epoch": 16.592981486598507, "grad_norm": 3.51645827293396, "learning_rate": 5.388955948866363e-06, "loss": 0.5319, "step": 720600 }, { "epoch": 16.59758681035277, "grad_norm": 3.1848063468933105, "learning_rate": 5.381684385043848e-06, "loss": 0.5271, "step": 720800 }, { "epoch": 16.602192134107028, "grad_norm": 3.438476085662842, "learning_rate": 5.374412821221332e-06, "loss": 0.5299, "step": 721000 }, { "epoch": 16.606797457861287, "grad_norm": 3.2890145778656006, "learning_rate": 5.367141257398817e-06, "loss": 0.5313, "step": 721200 }, { "epoch": 16.61140278161555, "grad_norm": 3.603616952896118, "learning_rate": 5.3598696935763e-06, "loss": 0.5249, "step": 721400 }, { "epoch": 16.616008105369808, "grad_norm": 3.5745153427124023, "learning_rate": 5.3525981297537845e-06, "loss": 0.5234, "step": 721600 }, { "epoch": 16.620613429124067, "grad_norm": 2.906506061553955, "learning_rate": 5.3453265659312695e-06, "loss": 0.5255, "step": 721800 }, { "epoch": 16.62521875287833, "grad_norm": 3.5672848224639893, "learning_rate": 5.338055002108754e-06, "loss": 0.5171, "step": 722000 }, { "epoch": 16.629824076632588, "grad_norm": 2.794722080230713, "learning_rate": 5.330783438286238e-06, "loss": 0.528, "step": 722200 }, { "epoch": 16.634429400386846, "grad_norm": 2.808533191680908, "learning_rate": 5.323511874463722e-06, "loss": 0.5162, "step": 722400 }, { "epoch": 16.63903472414111, "grad_norm": 2.948071241378784, "learning_rate": 5.316240310641206e-06, "loss": 0.522, "step": 722600 }, { "epoch": 16.643640047895367, "grad_norm": 3.144495964050293, "learning_rate": 5.3089687468186915e-06, "loss": 0.5295, "step": 722800 }, { "epoch": 16.648245371649626, "grad_norm": 3.648850440979004, "learning_rate": 5.301697182996175e-06, "loss": 0.5313, "step": 723000 }, { "epoch": 16.652850695403888, "grad_norm": 3.740377902984619, "learning_rate": 5.294425619173659e-06, "loss": 0.5188, "step": 723200 }, { "epoch": 16.657456019158147, "grad_norm": 3.226428270339966, "learning_rate": 5.287154055351144e-06, "loss": 0.5244, "step": 723400 }, { "epoch": 16.662061342912406, "grad_norm": 3.131943941116333, "learning_rate": 5.279882491528628e-06, "loss": 0.5309, "step": 723600 }, { "epoch": 16.666666666666668, "grad_norm": 3.345491409301758, "learning_rate": 5.2726109277061126e-06, "loss": 0.5319, "step": 723800 }, { "epoch": 16.671271990420927, "grad_norm": 2.5959975719451904, "learning_rate": 5.265339363883597e-06, "loss": 0.5249, "step": 724000 }, { "epoch": 16.675877314175185, "grad_norm": 2.993213415145874, "learning_rate": 5.258067800061081e-06, "loss": 0.5219, "step": 724200 }, { "epoch": 16.680482637929448, "grad_norm": 3.5764873027801514, "learning_rate": 5.250796236238566e-06, "loss": 0.5264, "step": 724400 }, { "epoch": 16.685087961683706, "grad_norm": 3.0356667041778564, "learning_rate": 5.24352467241605e-06, "loss": 0.5139, "step": 724600 }, { "epoch": 16.689693285437965, "grad_norm": 2.6434290409088135, "learning_rate": 5.236253108593534e-06, "loss": 0.5276, "step": 724800 }, { "epoch": 16.694298609192227, "grad_norm": 3.0571916103363037, "learning_rate": 5.228981544771019e-06, "loss": 0.5153, "step": 725000 }, { "epoch": 16.698903932946486, "grad_norm": 3.077500104904175, "learning_rate": 5.221709980948503e-06, "loss": 0.5254, "step": 725200 }, { "epoch": 16.703509256700745, "grad_norm": 3.5419986248016357, "learning_rate": 5.214438417125988e-06, "loss": 0.5232, "step": 725400 }, { "epoch": 16.708114580455007, "grad_norm": 2.9213297367095947, "learning_rate": 5.207166853303471e-06, "loss": 0.5284, "step": 725600 }, { "epoch": 16.712719904209266, "grad_norm": 3.5840189456939697, "learning_rate": 5.199895289480956e-06, "loss": 0.532, "step": 725800 }, { "epoch": 16.717325227963524, "grad_norm": 3.3302419185638428, "learning_rate": 5.1926600834775525e-06, "loss": 0.5257, "step": 726000 }, { "epoch": 16.721930551717787, "grad_norm": 3.81793212890625, "learning_rate": 5.1854248774741494e-06, "loss": 0.5225, "step": 726200 }, { "epoch": 16.726535875472045, "grad_norm": 3.0137243270874023, "learning_rate": 5.1781533136516345e-06, "loss": 0.5172, "step": 726400 }, { "epoch": 16.731141199226304, "grad_norm": 2.965055465698242, "learning_rate": 5.170881749829119e-06, "loss": 0.5248, "step": 726600 }, { "epoch": 16.735746522980566, "grad_norm": 2.7242608070373535, "learning_rate": 5.163610186006602e-06, "loss": 0.5156, "step": 726800 }, { "epoch": 16.740351846734825, "grad_norm": 2.787821054458618, "learning_rate": 5.156338622184087e-06, "loss": 0.5251, "step": 727000 }, { "epoch": 16.744957170489087, "grad_norm": 3.265836238861084, "learning_rate": 5.149067058361571e-06, "loss": 0.5232, "step": 727200 }, { "epoch": 16.749562494243346, "grad_norm": 2.891186475753784, "learning_rate": 5.1417954945390564e-06, "loss": 0.5294, "step": 727400 }, { "epoch": 16.754167817997605, "grad_norm": 4.678770065307617, "learning_rate": 5.13452393071654e-06, "loss": 0.5194, "step": 727600 }, { "epoch": 16.758773141751867, "grad_norm": 3.1750643253326416, "learning_rate": 5.127252366894024e-06, "loss": 0.5256, "step": 727800 }, { "epoch": 16.763378465506126, "grad_norm": 4.042088508605957, "learning_rate": 5.119980803071509e-06, "loss": 0.5231, "step": 728000 }, { "epoch": 16.767983789260384, "grad_norm": 3.6129207611083984, "learning_rate": 5.112709239248993e-06, "loss": 0.5343, "step": 728200 }, { "epoch": 16.772589113014647, "grad_norm": 2.856450319290161, "learning_rate": 5.105437675426477e-06, "loss": 0.5233, "step": 728400 }, { "epoch": 16.777194436768905, "grad_norm": 3.1779849529266357, "learning_rate": 5.098166111603962e-06, "loss": 0.5195, "step": 728600 }, { "epoch": 16.781799760523164, "grad_norm": 3.1687657833099365, "learning_rate": 5.090894547781446e-06, "loss": 0.5315, "step": 728800 }, { "epoch": 16.786405084277426, "grad_norm": 2.3541252613067627, "learning_rate": 5.083622983958931e-06, "loss": 0.5206, "step": 729000 }, { "epoch": 16.791010408031685, "grad_norm": 3.6288070678710938, "learning_rate": 5.076351420136414e-06, "loss": 0.5189, "step": 729200 }, { "epoch": 16.795615731785944, "grad_norm": 3.1729352474212646, "learning_rate": 5.069079856313899e-06, "loss": 0.5242, "step": 729400 }, { "epoch": 16.800221055540206, "grad_norm": 3.039484739303589, "learning_rate": 5.061808292491384e-06, "loss": 0.5298, "step": 729600 }, { "epoch": 16.804826379294465, "grad_norm": 3.6854867935180664, "learning_rate": 5.0546094443070924e-06, "loss": 0.5296, "step": 729800 }, { "epoch": 16.809431703048723, "grad_norm": 3.062591791152954, "learning_rate": 5.0473378804845775e-06, "loss": 0.5335, "step": 730000 }, { "epoch": 16.814037026802986, "grad_norm": 3.3723227977752686, "learning_rate": 5.040066316662062e-06, "loss": 0.5266, "step": 730200 }, { "epoch": 16.818642350557244, "grad_norm": 2.7272493839263916, "learning_rate": 5.032794752839545e-06, "loss": 0.5222, "step": 730400 }, { "epoch": 16.823247674311503, "grad_norm": 3.0595741271972656, "learning_rate": 5.02552318901703e-06, "loss": 0.5252, "step": 730600 }, { "epoch": 16.827852998065765, "grad_norm": 3.748103380203247, "learning_rate": 5.018251625194514e-06, "loss": 0.5266, "step": 730800 }, { "epoch": 16.832458321820024, "grad_norm": 2.6202964782714844, "learning_rate": 5.0109800613719995e-06, "loss": 0.5449, "step": 731000 }, { "epoch": 16.837063645574283, "grad_norm": 2.5717170238494873, "learning_rate": 5.003708497549483e-06, "loss": 0.5277, "step": 731200 }, { "epoch": 16.841668969328545, "grad_norm": 2.716874361038208, "learning_rate": 4.996436933726967e-06, "loss": 0.5235, "step": 731400 }, { "epoch": 16.846274293082804, "grad_norm": 3.9380040168762207, "learning_rate": 4.989165369904452e-06, "loss": 0.5278, "step": 731600 }, { "epoch": 16.850879616837062, "grad_norm": 3.0047407150268555, "learning_rate": 4.981893806081936e-06, "loss": 0.5331, "step": 731800 }, { "epoch": 16.855484940591325, "grad_norm": 3.2942941188812256, "learning_rate": 4.97462224225942e-06, "loss": 0.5394, "step": 732000 }, { "epoch": 16.860090264345583, "grad_norm": 2.960007905960083, "learning_rate": 4.967350678436905e-06, "loss": 0.5284, "step": 732200 }, { "epoch": 16.864695588099842, "grad_norm": 4.325229167938232, "learning_rate": 4.960079114614389e-06, "loss": 0.5142, "step": 732400 }, { "epoch": 16.869300911854104, "grad_norm": 3.9659183025360107, "learning_rate": 4.952807550791874e-06, "loss": 0.5277, "step": 732600 }, { "epoch": 16.873906235608363, "grad_norm": 4.010354042053223, "learning_rate": 4.9455359869693574e-06, "loss": 0.545, "step": 732800 }, { "epoch": 16.87851155936262, "grad_norm": 2.861638307571411, "learning_rate": 4.938264423146842e-06, "loss": 0.5232, "step": 733000 }, { "epoch": 16.883116883116884, "grad_norm": 3.198781967163086, "learning_rate": 4.930992859324327e-06, "loss": 0.5184, "step": 733200 }, { "epoch": 16.887722206871143, "grad_norm": 3.7329859733581543, "learning_rate": 4.923721295501811e-06, "loss": 0.5179, "step": 733400 }, { "epoch": 16.8923275306254, "grad_norm": 3.9860944747924805, "learning_rate": 4.916449731679295e-06, "loss": 0.5298, "step": 733600 }, { "epoch": 16.896932854379664, "grad_norm": 3.4595282077789307, "learning_rate": 4.909178167856779e-06, "loss": 0.5261, "step": 733800 }, { "epoch": 16.901538178133922, "grad_norm": 3.319841146469116, "learning_rate": 4.901906604034264e-06, "loss": 0.5225, "step": 734000 }, { "epoch": 16.90614350188818, "grad_norm": 3.025320529937744, "learning_rate": 4.894635040211749e-06, "loss": 0.5423, "step": 734200 }, { "epoch": 16.910748825642443, "grad_norm": 3.0882341861724854, "learning_rate": 4.887363476389233e-06, "loss": 0.5239, "step": 734400 }, { "epoch": 16.915354149396702, "grad_norm": 3.2497994899749756, "learning_rate": 4.880091912566716e-06, "loss": 0.5168, "step": 734600 }, { "epoch": 16.919959473150964, "grad_norm": 3.428805351257324, "learning_rate": 4.872820348744201e-06, "loss": 0.5209, "step": 734800 }, { "epoch": 16.924564796905223, "grad_norm": 3.8149008750915527, "learning_rate": 4.8655487849216855e-06, "loss": 0.5226, "step": 735000 }, { "epoch": 16.92917012065948, "grad_norm": 2.430985450744629, "learning_rate": 4.85827722109917e-06, "loss": 0.5272, "step": 735200 }, { "epoch": 16.933775444413744, "grad_norm": 3.142275333404541, "learning_rate": 4.851005657276654e-06, "loss": 0.5294, "step": 735400 }, { "epoch": 16.938380768168003, "grad_norm": 3.180042028427124, "learning_rate": 4.843734093454138e-06, "loss": 0.5203, "step": 735600 }, { "epoch": 16.94298609192226, "grad_norm": 3.004331350326538, "learning_rate": 4.836462529631623e-06, "loss": 0.5244, "step": 735800 }, { "epoch": 16.947591415676523, "grad_norm": 2.585542678833008, "learning_rate": 4.8291909658091075e-06, "loss": 0.5289, "step": 736000 }, { "epoch": 16.952196739430782, "grad_norm": 3.3415260314941406, "learning_rate": 4.821919401986591e-06, "loss": 0.5253, "step": 736200 }, { "epoch": 16.95680206318504, "grad_norm": 3.3344316482543945, "learning_rate": 4.814647838164076e-06, "loss": 0.5234, "step": 736400 }, { "epoch": 16.961407386939303, "grad_norm": 2.819533348083496, "learning_rate": 4.807412632160673e-06, "loss": 0.5313, "step": 736600 }, { "epoch": 16.966012710693562, "grad_norm": 3.417186737060547, "learning_rate": 4.80017742615727e-06, "loss": 0.5333, "step": 736800 }, { "epoch": 16.97061803444782, "grad_norm": 3.4736814498901367, "learning_rate": 4.792905862334754e-06, "loss": 0.5133, "step": 737000 }, { "epoch": 16.975223358202083, "grad_norm": 2.8336431980133057, "learning_rate": 4.785634298512238e-06, "loss": 0.5268, "step": 737200 }, { "epoch": 16.97982868195634, "grad_norm": 3.896120071411133, "learning_rate": 4.778362734689722e-06, "loss": 0.5304, "step": 737400 }, { "epoch": 16.9844340057106, "grad_norm": 3.212686777114868, "learning_rate": 4.771127528686319e-06, "loss": 0.5216, "step": 737600 }, { "epoch": 16.989039329464863, "grad_norm": 2.8021464347839355, "learning_rate": 4.7638559648638035e-06, "loss": 0.5258, "step": 737800 }, { "epoch": 16.99364465321912, "grad_norm": 2.3054471015930176, "learning_rate": 4.756584401041289e-06, "loss": 0.5119, "step": 738000 }, { "epoch": 16.99824997697338, "grad_norm": 3.4670674800872803, "learning_rate": 4.749312837218773e-06, "loss": 0.5228, "step": 738200 }, { "epoch": 17.0, "eval_loss": 0.513902485370636, "eval_runtime": 169.6655, "eval_samples_per_second": 167.158, "eval_steps_per_second": 10.45, "step": 738276 }, { "epoch": 17.002855300727642, "grad_norm": 4.795275688171387, "learning_rate": 4.74207763121537e-06, "loss": 0.5208, "step": 738400 }, { "epoch": 17.0074606244819, "grad_norm": 2.918687582015991, "learning_rate": 4.734806067392853e-06, "loss": 0.5252, "step": 738600 }, { "epoch": 17.01206594823616, "grad_norm": 2.214686632156372, "learning_rate": 4.727534503570338e-06, "loss": 0.5238, "step": 738800 }, { "epoch": 17.016671271990422, "grad_norm": 3.4804065227508545, "learning_rate": 4.720262939747822e-06, "loss": 0.5244, "step": 739000 }, { "epoch": 17.02127659574468, "grad_norm": 2.5966029167175293, "learning_rate": 4.712991375925307e-06, "loss": 0.5301, "step": 739200 }, { "epoch": 17.02588191949894, "grad_norm": 2.7424144744873047, "learning_rate": 4.705719812102791e-06, "loss": 0.5181, "step": 739400 }, { "epoch": 17.0304872432532, "grad_norm": 2.9380509853363037, "learning_rate": 4.698448248280275e-06, "loss": 0.5324, "step": 739600 }, { "epoch": 17.03509256700746, "grad_norm": 2.6877858638763428, "learning_rate": 4.69117668445776e-06, "loss": 0.5186, "step": 739800 }, { "epoch": 17.03969789076172, "grad_norm": 2.7260663509368896, "learning_rate": 4.683905120635244e-06, "loss": 0.5212, "step": 740000 }, { "epoch": 17.04430321451598, "grad_norm": 3.156268358230591, "learning_rate": 4.676633556812728e-06, "loss": 0.5262, "step": 740200 }, { "epoch": 17.04890853827024, "grad_norm": 4.205811500549316, "learning_rate": 4.669361992990213e-06, "loss": 0.5204, "step": 740400 }, { "epoch": 17.0535138620245, "grad_norm": 3.210843563079834, "learning_rate": 4.662090429167697e-06, "loss": 0.5324, "step": 740600 }, { "epoch": 17.05811918577876, "grad_norm": 2.8761239051818848, "learning_rate": 4.654818865345181e-06, "loss": 0.5184, "step": 740800 }, { "epoch": 17.06272450953302, "grad_norm": 2.9451096057891846, "learning_rate": 4.647547301522665e-06, "loss": 0.5132, "step": 741000 }, { "epoch": 17.067329833287282, "grad_norm": 2.4379804134368896, "learning_rate": 4.64027573770015e-06, "loss": 0.5156, "step": 741200 }, { "epoch": 17.07193515704154, "grad_norm": 2.7910115718841553, "learning_rate": 4.633004173877635e-06, "loss": 0.5234, "step": 741400 }, { "epoch": 17.0765404807958, "grad_norm": 3.084688663482666, "learning_rate": 4.625732610055119e-06, "loss": 0.5324, "step": 741600 }, { "epoch": 17.08114580455006, "grad_norm": 2.9508092403411865, "learning_rate": 4.618461046232602e-06, "loss": 0.512, "step": 741800 }, { "epoch": 17.08575112830432, "grad_norm": 4.045099258422852, "learning_rate": 4.611225840229199e-06, "loss": 0.5115, "step": 742000 }, { "epoch": 17.09035645205858, "grad_norm": 3.059542655944824, "learning_rate": 4.603954276406684e-06, "loss": 0.5239, "step": 742200 }, { "epoch": 17.09496177581284, "grad_norm": 3.1609458923339844, "learning_rate": 4.5966827125841685e-06, "loss": 0.5215, "step": 742400 }, { "epoch": 17.0995670995671, "grad_norm": 3.208566427230835, "learning_rate": 4.589411148761653e-06, "loss": 0.5125, "step": 742600 }, { "epoch": 17.10417242332136, "grad_norm": 3.126858949661255, "learning_rate": 4.582139584939137e-06, "loss": 0.5145, "step": 742800 }, { "epoch": 17.10877774707562, "grad_norm": 2.690805673599243, "learning_rate": 4.574868021116621e-06, "loss": 0.533, "step": 743000 }, { "epoch": 17.11338307082988, "grad_norm": 3.323164939880371, "learning_rate": 4.567596457294106e-06, "loss": 0.5194, "step": 743200 }, { "epoch": 17.11798839458414, "grad_norm": 2.800487995147705, "learning_rate": 4.5603248934715904e-06, "loss": 0.5217, "step": 743400 }, { "epoch": 17.1225937183384, "grad_norm": 3.2171924114227295, "learning_rate": 4.553053329649074e-06, "loss": 0.5251, "step": 743600 }, { "epoch": 17.12719904209266, "grad_norm": 2.8016812801361084, "learning_rate": 4.545781765826559e-06, "loss": 0.516, "step": 743800 }, { "epoch": 17.131804365846918, "grad_norm": 2.8154873847961426, "learning_rate": 4.538510202004043e-06, "loss": 0.5167, "step": 744000 }, { "epoch": 17.13640968960118, "grad_norm": 3.5362391471862793, "learning_rate": 4.531238638181527e-06, "loss": 0.5388, "step": 744200 }, { "epoch": 17.14101501335544, "grad_norm": 3.001676321029663, "learning_rate": 4.523967074359012e-06, "loss": 0.5284, "step": 744400 }, { "epoch": 17.145620337109698, "grad_norm": 2.9126548767089844, "learning_rate": 4.516695510536496e-06, "loss": 0.5217, "step": 744600 }, { "epoch": 17.15022566086396, "grad_norm": 2.922006368637085, "learning_rate": 4.509423946713981e-06, "loss": 0.5176, "step": 744800 }, { "epoch": 17.15483098461822, "grad_norm": 3.5556321144104004, "learning_rate": 4.502152382891465e-06, "loss": 0.5301, "step": 745000 }, { "epoch": 17.159436308372477, "grad_norm": 2.726541519165039, "learning_rate": 4.494880819068949e-06, "loss": 0.5158, "step": 745200 }, { "epoch": 17.16404163212674, "grad_norm": 2.8062782287597656, "learning_rate": 4.4876092552464335e-06, "loss": 0.5243, "step": 745400 }, { "epoch": 17.168646955880998, "grad_norm": 2.6974618434906006, "learning_rate": 4.480337691423918e-06, "loss": 0.5189, "step": 745600 }, { "epoch": 17.173252279635257, "grad_norm": 4.060593128204346, "learning_rate": 4.473066127601402e-06, "loss": 0.514, "step": 745800 }, { "epoch": 17.17785760338952, "grad_norm": 2.953789472579956, "learning_rate": 4.465794563778887e-06, "loss": 0.5274, "step": 746000 }, { "epoch": 17.182462927143778, "grad_norm": 2.9467928409576416, "learning_rate": 4.45852299995637e-06, "loss": 0.5094, "step": 746200 }, { "epoch": 17.187068250898037, "grad_norm": 3.19561505317688, "learning_rate": 4.4512514361338546e-06, "loss": 0.5341, "step": 746400 }, { "epoch": 17.1916735746523, "grad_norm": 2.521677017211914, "learning_rate": 4.44397987231134e-06, "loss": 0.525, "step": 746600 }, { "epoch": 17.196278898406558, "grad_norm": 3.030155897140503, "learning_rate": 4.436708308488824e-06, "loss": 0.5327, "step": 746800 }, { "epoch": 17.200884222160816, "grad_norm": 3.0446267127990723, "learning_rate": 4.429436744666308e-06, "loss": 0.5185, "step": 747000 }, { "epoch": 17.20548954591508, "grad_norm": 3.0444605350494385, "learning_rate": 4.422165180843792e-06, "loss": 0.5256, "step": 747200 }, { "epoch": 17.210094869669337, "grad_norm": 3.9111011028289795, "learning_rate": 4.4148936170212765e-06, "loss": 0.5247, "step": 747400 }, { "epoch": 17.214700193423596, "grad_norm": 3.1988956928253174, "learning_rate": 4.407622053198762e-06, "loss": 0.5177, "step": 747600 }, { "epoch": 17.219305517177858, "grad_norm": 3.0281171798706055, "learning_rate": 4.400350489376245e-06, "loss": 0.5212, "step": 747800 }, { "epoch": 17.223910840932117, "grad_norm": 3.074202060699463, "learning_rate": 4.393078925553729e-06, "loss": 0.5171, "step": 748000 }, { "epoch": 17.22851616468638, "grad_norm": 3.3265304565429688, "learning_rate": 4.385807361731214e-06, "loss": 0.5188, "step": 748200 }, { "epoch": 17.233121488440638, "grad_norm": 3.696972370147705, "learning_rate": 4.378572155727811e-06, "loss": 0.5163, "step": 748400 }, { "epoch": 17.237726812194897, "grad_norm": 2.553654193878174, "learning_rate": 4.371300591905295e-06, "loss": 0.5148, "step": 748600 }, { "epoch": 17.24233213594916, "grad_norm": 3.0239148139953613, "learning_rate": 4.36402902808278e-06, "loss": 0.5086, "step": 748800 }, { "epoch": 17.246937459703418, "grad_norm": 3.5661909580230713, "learning_rate": 4.356757464260264e-06, "loss": 0.5203, "step": 749000 }, { "epoch": 17.251542783457676, "grad_norm": 3.1638379096984863, "learning_rate": 4.349485900437748e-06, "loss": 0.5089, "step": 749200 }, { "epoch": 17.25614810721194, "grad_norm": 3.8323419094085693, "learning_rate": 4.342214336615233e-06, "loss": 0.527, "step": 749400 }, { "epoch": 17.260753430966197, "grad_norm": 2.8561997413635254, "learning_rate": 4.3349427727927165e-06, "loss": 0.5227, "step": 749600 }, { "epoch": 17.265358754720456, "grad_norm": 2.703752040863037, "learning_rate": 4.327671208970201e-06, "loss": 0.5293, "step": 749800 }, { "epoch": 17.269964078474718, "grad_norm": 2.9282515048980713, "learning_rate": 4.320399645147686e-06, "loss": 0.5195, "step": 750000 }, { "epoch": 17.274569402228977, "grad_norm": 3.2129111289978027, "learning_rate": 4.31312808132517e-06, "loss": 0.5199, "step": 750200 }, { "epoch": 17.279174725983236, "grad_norm": 3.368345260620117, "learning_rate": 4.305856517502654e-06, "loss": 0.5246, "step": 750400 }, { "epoch": 17.283780049737498, "grad_norm": 3.3532567024230957, "learning_rate": 4.298621311499251e-06, "loss": 0.5149, "step": 750600 }, { "epoch": 17.288385373491757, "grad_norm": 3.7765021324157715, "learning_rate": 4.291349747676735e-06, "loss": 0.5282, "step": 750800 }, { "epoch": 17.292990697246015, "grad_norm": 2.6797635555267334, "learning_rate": 4.2840781838542195e-06, "loss": 0.5261, "step": 751000 }, { "epoch": 17.297596021000277, "grad_norm": 2.490145683288574, "learning_rate": 4.276806620031705e-06, "loss": 0.5254, "step": 751200 }, { "epoch": 17.302201344754536, "grad_norm": 2.785992383956909, "learning_rate": 4.269535056209188e-06, "loss": 0.5196, "step": 751400 }, { "epoch": 17.306806668508795, "grad_norm": 3.1607396602630615, "learning_rate": 4.262263492386672e-06, "loss": 0.5217, "step": 751600 }, { "epoch": 17.311411992263057, "grad_norm": 3.664815902709961, "learning_rate": 4.254991928564157e-06, "loss": 0.5251, "step": 751800 }, { "epoch": 17.316017316017316, "grad_norm": 3.4818427562713623, "learning_rate": 4.247756722560754e-06, "loss": 0.5282, "step": 752000 }, { "epoch": 17.320622639771575, "grad_norm": 3.277512311935425, "learning_rate": 4.240521516557351e-06, "loss": 0.5343, "step": 752200 }, { "epoch": 17.325227963525837, "grad_norm": 3.601278305053711, "learning_rate": 4.233249952734835e-06, "loss": 0.527, "step": 752400 }, { "epoch": 17.329833287280096, "grad_norm": 3.438324451446533, "learning_rate": 4.2259783889123195e-06, "loss": 0.5108, "step": 752600 }, { "epoch": 17.334438611034354, "grad_norm": 3.622999429702759, "learning_rate": 4.218706825089804e-06, "loss": 0.522, "step": 752800 }, { "epoch": 17.339043934788616, "grad_norm": 2.718357563018799, "learning_rate": 4.211435261267288e-06, "loss": 0.5287, "step": 753000 }, { "epoch": 17.343649258542875, "grad_norm": 3.1246633529663086, "learning_rate": 4.204163697444773e-06, "loss": 0.5293, "step": 753200 }, { "epoch": 17.348254582297134, "grad_norm": 2.551449775695801, "learning_rate": 4.196892133622257e-06, "loss": 0.5222, "step": 753400 }, { "epoch": 17.352859906051396, "grad_norm": 2.530979633331299, "learning_rate": 4.189620569799741e-06, "loss": 0.5185, "step": 753600 }, { "epoch": 17.357465229805655, "grad_norm": 3.4675188064575195, "learning_rate": 4.182349005977226e-06, "loss": 0.5193, "step": 753800 }, { "epoch": 17.362070553559914, "grad_norm": 3.597015619277954, "learning_rate": 4.17507744215471e-06, "loss": 0.5124, "step": 754000 }, { "epoch": 17.366675877314176, "grad_norm": 3.48524808883667, "learning_rate": 4.167805878332195e-06, "loss": 0.5167, "step": 754200 }, { "epoch": 17.371281201068435, "grad_norm": 2.2207281589508057, "learning_rate": 4.160534314509678e-06, "loss": 0.5343, "step": 754400 }, { "epoch": 17.375886524822697, "grad_norm": 3.222555160522461, "learning_rate": 4.1532627506871626e-06, "loss": 0.5166, "step": 754600 }, { "epoch": 17.380491848576956, "grad_norm": 3.4512622356414795, "learning_rate": 4.145991186864648e-06, "loss": 0.5148, "step": 754800 }, { "epoch": 17.385097172331214, "grad_norm": 2.870035171508789, "learning_rate": 4.138719623042132e-06, "loss": 0.5195, "step": 755000 }, { "epoch": 17.389702496085476, "grad_norm": 3.166290521621704, "learning_rate": 4.131448059219615e-06, "loss": 0.5147, "step": 755200 }, { "epoch": 17.394307819839735, "grad_norm": 3.1299068927764893, "learning_rate": 4.1241764953971e-06, "loss": 0.5193, "step": 755400 }, { "epoch": 17.398913143593994, "grad_norm": 2.6893653869628906, "learning_rate": 4.1169049315745845e-06, "loss": 0.5246, "step": 755600 }, { "epoch": 17.403518467348256, "grad_norm": 2.988776206970215, "learning_rate": 4.1096333677520696e-06, "loss": 0.5223, "step": 755800 }, { "epoch": 17.408123791102515, "grad_norm": 3.235541820526123, "learning_rate": 4.102361803929553e-06, "loss": 0.5116, "step": 756000 }, { "epoch": 17.412729114856774, "grad_norm": 3.031001091003418, "learning_rate": 4.09512659792615e-06, "loss": 0.5167, "step": 756200 }, { "epoch": 17.417334438611036, "grad_norm": 2.9991252422332764, "learning_rate": 4.087855034103634e-06, "loss": 0.5125, "step": 756400 }, { "epoch": 17.421939762365295, "grad_norm": 3.0221939086914062, "learning_rate": 4.080583470281119e-06, "loss": 0.5213, "step": 756600 }, { "epoch": 17.426545086119553, "grad_norm": 2.6054844856262207, "learning_rate": 4.073311906458603e-06, "loss": 0.5255, "step": 756800 }, { "epoch": 17.431150409873815, "grad_norm": 2.908795118331909, "learning_rate": 4.066040342636087e-06, "loss": 0.5223, "step": 757000 }, { "epoch": 17.435755733628074, "grad_norm": 3.4502451419830322, "learning_rate": 4.058768778813572e-06, "loss": 0.5242, "step": 757200 }, { "epoch": 17.440361057382333, "grad_norm": 3.335181474685669, "learning_rate": 4.051497214991056e-06, "loss": 0.5198, "step": 757400 }, { "epoch": 17.444966381136595, "grad_norm": 3.5651845932006836, "learning_rate": 4.044225651168541e-06, "loss": 0.5176, "step": 757600 }, { "epoch": 17.449571704890854, "grad_norm": 2.8494350910186768, "learning_rate": 4.0369540873460245e-06, "loss": 0.52, "step": 757800 }, { "epoch": 17.454177028645113, "grad_norm": 2.9451777935028076, "learning_rate": 4.029682523523509e-06, "loss": 0.5285, "step": 758000 }, { "epoch": 17.458782352399375, "grad_norm": 3.2299864292144775, "learning_rate": 4.022410959700994e-06, "loss": 0.5242, "step": 758200 }, { "epoch": 17.463387676153634, "grad_norm": 2.959676742553711, "learning_rate": 4.015139395878478e-06, "loss": 0.5261, "step": 758400 }, { "epoch": 17.467992999907892, "grad_norm": 3.600370407104492, "learning_rate": 4.007867832055961e-06, "loss": 0.5173, "step": 758600 }, { "epoch": 17.472598323662154, "grad_norm": 2.899125814437866, "learning_rate": 4.000596268233446e-06, "loss": 0.5236, "step": 758800 }, { "epoch": 17.477203647416413, "grad_norm": 3.0333738327026367, "learning_rate": 3.993324704410931e-06, "loss": 0.5209, "step": 759000 }, { "epoch": 17.481808971170672, "grad_norm": 2.875760078430176, "learning_rate": 3.986053140588416e-06, "loss": 0.5118, "step": 759200 }, { "epoch": 17.486414294924934, "grad_norm": 2.935258150100708, "learning_rate": 3.978781576765899e-06, "loss": 0.5157, "step": 759400 }, { "epoch": 17.491019618679193, "grad_norm": 3.5687882900238037, "learning_rate": 3.971510012943383e-06, "loss": 0.5255, "step": 759600 }, { "epoch": 17.49562494243345, "grad_norm": 3.5541269779205322, "learning_rate": 3.96427480693998e-06, "loss": 0.5209, "step": 759800 }, { "epoch": 17.500230266187714, "grad_norm": 2.5105998516082764, "learning_rate": 3.957003243117465e-06, "loss": 0.5237, "step": 760000 }, { "epoch": 17.504835589941973, "grad_norm": 3.4563724994659424, "learning_rate": 3.9497316792949495e-06, "loss": 0.5289, "step": 760200 }, { "epoch": 17.50944091369623, "grad_norm": 4.190462112426758, "learning_rate": 3.942460115472433e-06, "loss": 0.5144, "step": 760400 }, { "epoch": 17.514046237450493, "grad_norm": 3.6691291332244873, "learning_rate": 3.935188551649918e-06, "loss": 0.5129, "step": 760600 }, { "epoch": 17.518651561204752, "grad_norm": 3.044640064239502, "learning_rate": 3.927916987827402e-06, "loss": 0.5197, "step": 760800 }, { "epoch": 17.52325688495901, "grad_norm": 3.2396230697631836, "learning_rate": 3.920645424004887e-06, "loss": 0.5099, "step": 761000 }, { "epoch": 17.527862208713273, "grad_norm": 3.3681046962738037, "learning_rate": 3.9133738601823706e-06, "loss": 0.5258, "step": 761200 }, { "epoch": 17.532467532467532, "grad_norm": 3.473139524459839, "learning_rate": 3.906102296359855e-06, "loss": 0.5112, "step": 761400 }, { "epoch": 17.53707285622179, "grad_norm": 3.2254271507263184, "learning_rate": 3.89883073253734e-06, "loss": 0.5194, "step": 761600 }, { "epoch": 17.541678179976053, "grad_norm": 3.007357120513916, "learning_rate": 3.891559168714824e-06, "loss": 0.5226, "step": 761800 }, { "epoch": 17.54628350373031, "grad_norm": 3.1616954803466797, "learning_rate": 3.884287604892308e-06, "loss": 0.5195, "step": 762000 }, { "epoch": 17.550888827484574, "grad_norm": 3.4159657955169678, "learning_rate": 3.8770160410697925e-06, "loss": 0.5196, "step": 762200 }, { "epoch": 17.555494151238833, "grad_norm": 2.448462724685669, "learning_rate": 3.869744477247277e-06, "loss": 0.5221, "step": 762400 }, { "epoch": 17.56009947499309, "grad_norm": 3.6879563331604004, "learning_rate": 3.862472913424762e-06, "loss": 0.5182, "step": 762600 }, { "epoch": 17.564704798747353, "grad_norm": 3.100130796432495, "learning_rate": 3.855201349602246e-06, "loss": 0.5108, "step": 762800 }, { "epoch": 17.569310122501612, "grad_norm": 3.8761160373687744, "learning_rate": 3.847929785779729e-06, "loss": 0.529, "step": 763000 }, { "epoch": 17.57391544625587, "grad_norm": 3.630197525024414, "learning_rate": 3.8406582219572145e-06, "loss": 0.5229, "step": 763200 }, { "epoch": 17.578520770010133, "grad_norm": 3.126621961593628, "learning_rate": 3.833386658134699e-06, "loss": 0.5199, "step": 763400 }, { "epoch": 17.583126093764392, "grad_norm": 2.9933745861053467, "learning_rate": 3.826115094312183e-06, "loss": 0.5093, "step": 763600 }, { "epoch": 17.58773141751865, "grad_norm": 3.0438787937164307, "learning_rate": 3.818843530489667e-06, "loss": 0.5211, "step": 763800 }, { "epoch": 17.592336741272913, "grad_norm": 3.802643299102783, "learning_rate": 3.8115719666671513e-06, "loss": 0.5248, "step": 764000 }, { "epoch": 17.59694206502717, "grad_norm": 4.053820610046387, "learning_rate": 3.8043004028446364e-06, "loss": 0.5208, "step": 764200 }, { "epoch": 17.60154738878143, "grad_norm": 2.8927087783813477, "learning_rate": 3.7970651968412333e-06, "loss": 0.5216, "step": 764400 }, { "epoch": 17.606152712535692, "grad_norm": 3.9510834217071533, "learning_rate": 3.789793633018717e-06, "loss": 0.5174, "step": 764600 }, { "epoch": 17.61075803628995, "grad_norm": 2.498469114303589, "learning_rate": 3.7825220691962013e-06, "loss": 0.5162, "step": 764800 }, { "epoch": 17.61536336004421, "grad_norm": 3.019011974334717, "learning_rate": 3.775250505373686e-06, "loss": 0.525, "step": 765000 }, { "epoch": 17.619968683798472, "grad_norm": 3.5621178150177, "learning_rate": 3.76797894155117e-06, "loss": 0.5176, "step": 765200 }, { "epoch": 17.62457400755273, "grad_norm": 3.7816877365112305, "learning_rate": 3.760707377728654e-06, "loss": 0.5132, "step": 765400 }, { "epoch": 17.62917933130699, "grad_norm": 2.997817277908325, "learning_rate": 3.753435813906139e-06, "loss": 0.5202, "step": 765600 }, { "epoch": 17.633784655061252, "grad_norm": 3.401996374130249, "learning_rate": 3.746164250083623e-06, "loss": 0.513, "step": 765800 }, { "epoch": 17.63838997881551, "grad_norm": 3.3747670650482178, "learning_rate": 3.7388926862611075e-06, "loss": 0.5162, "step": 766000 }, { "epoch": 17.64299530256977, "grad_norm": 3.419926404953003, "learning_rate": 3.7316211224385917e-06, "loss": 0.5122, "step": 766200 }, { "epoch": 17.64760062632403, "grad_norm": 3.317556381225586, "learning_rate": 3.7243859164351886e-06, "loss": 0.5277, "step": 766400 }, { "epoch": 17.65220595007829, "grad_norm": 3.193814516067505, "learning_rate": 3.7171143526126733e-06, "loss": 0.5075, "step": 766600 }, { "epoch": 17.65681127383255, "grad_norm": 2.69360089302063, "learning_rate": 3.709842788790157e-06, "loss": 0.519, "step": 766800 }, { "epoch": 17.66141659758681, "grad_norm": 2.8568925857543945, "learning_rate": 3.7025712249676417e-06, "loss": 0.5266, "step": 767000 }, { "epoch": 17.66602192134107, "grad_norm": 3.275151491165161, "learning_rate": 3.695299661145126e-06, "loss": 0.522, "step": 767200 }, { "epoch": 17.67062724509533, "grad_norm": 3.3356289863586426, "learning_rate": 3.6880280973226106e-06, "loss": 0.5257, "step": 767400 }, { "epoch": 17.67523256884959, "grad_norm": 3.011615514755249, "learning_rate": 3.6807565335000943e-06, "loss": 0.5226, "step": 767600 }, { "epoch": 17.67983789260385, "grad_norm": 2.5866901874542236, "learning_rate": 3.673484969677579e-06, "loss": 0.5196, "step": 767800 }, { "epoch": 17.684443216358112, "grad_norm": 3.7336740493774414, "learning_rate": 3.666213405855063e-06, "loss": 0.5119, "step": 768000 }, { "epoch": 17.68904854011237, "grad_norm": 2.9361438751220703, "learning_rate": 3.658941842032548e-06, "loss": 0.5093, "step": 768200 }, { "epoch": 17.69365386386663, "grad_norm": 3.3662490844726562, "learning_rate": 3.6516702782100317e-06, "loss": 0.5179, "step": 768400 }, { "epoch": 17.69825918762089, "grad_norm": 3.797208547592163, "learning_rate": 3.6444350722066286e-06, "loss": 0.5157, "step": 768600 }, { "epoch": 17.70286451137515, "grad_norm": 3.147529363632202, "learning_rate": 3.637163508384113e-06, "loss": 0.5324, "step": 768800 }, { "epoch": 17.70746983512941, "grad_norm": 3.4828877449035645, "learning_rate": 3.62992830238071e-06, "loss": 0.5217, "step": 769000 }, { "epoch": 17.71207515888367, "grad_norm": 3.616196393966675, "learning_rate": 3.6226567385581943e-06, "loss": 0.5199, "step": 769200 }, { "epoch": 17.71668048263793, "grad_norm": 2.796513319015503, "learning_rate": 3.615385174735679e-06, "loss": 0.5121, "step": 769400 }, { "epoch": 17.72128580639219, "grad_norm": 2.718034029006958, "learning_rate": 3.6081136109131628e-06, "loss": 0.5194, "step": 769600 }, { "epoch": 17.72589113014645, "grad_norm": 3.4165828227996826, "learning_rate": 3.6008784049097597e-06, "loss": 0.5348, "step": 769800 }, { "epoch": 17.73049645390071, "grad_norm": 3.0737781524658203, "learning_rate": 3.5936068410872443e-06, "loss": 0.517, "step": 770000 }, { "epoch": 17.735101777654968, "grad_norm": 3.283372163772583, "learning_rate": 3.5863352772647285e-06, "loss": 0.5262, "step": 770200 }, { "epoch": 17.73970710140923, "grad_norm": 2.618170738220215, "learning_rate": 3.579063713442213e-06, "loss": 0.5252, "step": 770400 }, { "epoch": 17.74431242516349, "grad_norm": 3.1445319652557373, "learning_rate": 3.571792149619697e-06, "loss": 0.5196, "step": 770600 }, { "epoch": 17.748917748917748, "grad_norm": 3.0439717769622803, "learning_rate": 3.5645205857971816e-06, "loss": 0.5282, "step": 770800 }, { "epoch": 17.75352307267201, "grad_norm": 2.830927610397339, "learning_rate": 3.557249021974666e-06, "loss": 0.517, "step": 771000 }, { "epoch": 17.75812839642627, "grad_norm": 2.9426324367523193, "learning_rate": 3.5499774581521505e-06, "loss": 0.5185, "step": 771200 }, { "epoch": 17.762733720180528, "grad_norm": 3.1209681034088135, "learning_rate": 3.5427058943296343e-06, "loss": 0.5221, "step": 771400 }, { "epoch": 17.76733904393479, "grad_norm": 2.9420995712280273, "learning_rate": 3.535434330507119e-06, "loss": 0.5119, "step": 771600 }, { "epoch": 17.77194436768905, "grad_norm": 2.995070219039917, "learning_rate": 3.5281627666846036e-06, "loss": 0.5254, "step": 771800 }, { "epoch": 17.776549691443307, "grad_norm": 3.6873717308044434, "learning_rate": 3.5208912028620878e-06, "loss": 0.527, "step": 772000 }, { "epoch": 17.78115501519757, "grad_norm": 2.82792592048645, "learning_rate": 3.513619639039572e-06, "loss": 0.5208, "step": 772200 }, { "epoch": 17.785760338951828, "grad_norm": 2.409423351287842, "learning_rate": 3.5063480752170562e-06, "loss": 0.526, "step": 772400 }, { "epoch": 17.790365662706087, "grad_norm": 2.7182860374450684, "learning_rate": 3.499076511394541e-06, "loss": 0.5197, "step": 772600 }, { "epoch": 17.79497098646035, "grad_norm": 4.183868885040283, "learning_rate": 3.491804947572025e-06, "loss": 0.5077, "step": 772800 }, { "epoch": 17.799576310214608, "grad_norm": 3.585371971130371, "learning_rate": 3.4845333837495093e-06, "loss": 0.5102, "step": 773000 }, { "epoch": 17.804181633968867, "grad_norm": 3.362762451171875, "learning_rate": 3.4772618199269935e-06, "loss": 0.5208, "step": 773200 }, { "epoch": 17.80878695772313, "grad_norm": 3.437537431716919, "learning_rate": 3.469990256104478e-06, "loss": 0.531, "step": 773400 }, { "epoch": 17.813392281477388, "grad_norm": 3.5331273078918457, "learning_rate": 3.4627186922819624e-06, "loss": 0.504, "step": 773600 }, { "epoch": 17.817997605231646, "grad_norm": 2.810429811477661, "learning_rate": 3.4554471284594466e-06, "loss": 0.5231, "step": 773800 }, { "epoch": 17.82260292898591, "grad_norm": 2.8518621921539307, "learning_rate": 3.448175564636931e-06, "loss": 0.5173, "step": 774000 }, { "epoch": 17.827208252740167, "grad_norm": 4.182199478149414, "learning_rate": 3.4409040008144155e-06, "loss": 0.5206, "step": 774200 }, { "epoch": 17.831813576494426, "grad_norm": 2.786973237991333, "learning_rate": 3.4336324369918997e-06, "loss": 0.5256, "step": 774400 }, { "epoch": 17.836418900248688, "grad_norm": 4.052456855773926, "learning_rate": 3.426360873169384e-06, "loss": 0.5245, "step": 774600 }, { "epoch": 17.841024224002947, "grad_norm": 3.5962717533111572, "learning_rate": 3.419089309346868e-06, "loss": 0.5331, "step": 774800 }, { "epoch": 17.845629547757206, "grad_norm": 3.3177649974823, "learning_rate": 3.4118177455243528e-06, "loss": 0.5154, "step": 775000 }, { "epoch": 17.850234871511468, "grad_norm": 2.6768603324890137, "learning_rate": 3.4045461817018366e-06, "loss": 0.5156, "step": 775200 }, { "epoch": 17.854840195265727, "grad_norm": 3.424499034881592, "learning_rate": 3.397274617879321e-06, "loss": 0.5172, "step": 775400 }, { "epoch": 17.85944551901999, "grad_norm": 2.6786439418792725, "learning_rate": 3.3900030540568054e-06, "loss": 0.5164, "step": 775600 }, { "epoch": 17.864050842774247, "grad_norm": 4.423825740814209, "learning_rate": 3.38273149023429e-06, "loss": 0.5096, "step": 775800 }, { "epoch": 17.868656166528506, "grad_norm": 3.1935222148895264, "learning_rate": 3.375459926411774e-06, "loss": 0.5238, "step": 776000 }, { "epoch": 17.87326149028277, "grad_norm": 3.05393648147583, "learning_rate": 3.3681883625892585e-06, "loss": 0.5086, "step": 776200 }, { "epoch": 17.877866814037027, "grad_norm": 2.8868277072906494, "learning_rate": 3.3609167987667427e-06, "loss": 0.5192, "step": 776400 }, { "epoch": 17.882472137791286, "grad_norm": 3.3218915462493896, "learning_rate": 3.3536815927633396e-06, "loss": 0.5108, "step": 776600 }, { "epoch": 17.887077461545548, "grad_norm": 3.428025007247925, "learning_rate": 3.3464100289408243e-06, "loss": 0.5158, "step": 776800 }, { "epoch": 17.891682785299807, "grad_norm": 2.7553398609161377, "learning_rate": 3.339138465118308e-06, "loss": 0.512, "step": 777000 }, { "epoch": 17.896288109054066, "grad_norm": 3.779360294342041, "learning_rate": 3.3318669012957927e-06, "loss": 0.5345, "step": 777200 }, { "epoch": 17.900893432808328, "grad_norm": 4.041235446929932, "learning_rate": 3.324595337473277e-06, "loss": 0.5183, "step": 777400 }, { "epoch": 17.905498756562586, "grad_norm": 3.0583138465881348, "learning_rate": 3.3173237736507616e-06, "loss": 0.5256, "step": 777600 }, { "epoch": 17.910104080316845, "grad_norm": 2.8636226654052734, "learning_rate": 3.3100522098282454e-06, "loss": 0.5244, "step": 777800 }, { "epoch": 17.914709404071107, "grad_norm": 3.2224411964416504, "learning_rate": 3.30278064600573e-06, "loss": 0.5183, "step": 778000 }, { "epoch": 17.919314727825366, "grad_norm": 3.5364627838134766, "learning_rate": 3.2955090821832142e-06, "loss": 0.5119, "step": 778200 }, { "epoch": 17.923920051579625, "grad_norm": 3.234290361404419, "learning_rate": 3.288237518360699e-06, "loss": 0.5245, "step": 778400 }, { "epoch": 17.928525375333887, "grad_norm": 2.7013673782348633, "learning_rate": 3.2809659545381827e-06, "loss": 0.5182, "step": 778600 }, { "epoch": 17.933130699088146, "grad_norm": 3.9952573776245117, "learning_rate": 3.2736943907156673e-06, "loss": 0.5145, "step": 778800 }, { "epoch": 17.937736022842405, "grad_norm": 3.0978848934173584, "learning_rate": 3.2664228268931515e-06, "loss": 0.5202, "step": 779000 }, { "epoch": 17.942341346596667, "grad_norm": 3.4453442096710205, "learning_rate": 3.2591876208897484e-06, "loss": 0.5161, "step": 779200 }, { "epoch": 17.946946670350926, "grad_norm": 2.99537992477417, "learning_rate": 3.251916057067233e-06, "loss": 0.5138, "step": 779400 }, { "epoch": 17.951551994105184, "grad_norm": 2.7019853591918945, "learning_rate": 3.244644493244717e-06, "loss": 0.5056, "step": 779600 }, { "epoch": 17.956157317859446, "grad_norm": 3.723837375640869, "learning_rate": 3.2373729294222015e-06, "loss": 0.5151, "step": 779800 }, { "epoch": 17.960762641613705, "grad_norm": 2.7109344005584717, "learning_rate": 3.230101365599686e-06, "loss": 0.5082, "step": 780000 }, { "epoch": 17.965367965367964, "grad_norm": 3.6867780685424805, "learning_rate": 3.2228298017771704e-06, "loss": 0.5128, "step": 780200 }, { "epoch": 17.969973289122226, "grad_norm": 3.721771478652954, "learning_rate": 3.2155945957737673e-06, "loss": 0.5192, "step": 780400 }, { "epoch": 17.974578612876485, "grad_norm": 2.7441351413726807, "learning_rate": 3.2083230319512515e-06, "loss": 0.5246, "step": 780600 }, { "epoch": 17.979183936630744, "grad_norm": 3.119499921798706, "learning_rate": 3.2010514681287357e-06, "loss": 0.519, "step": 780800 }, { "epoch": 17.983789260385006, "grad_norm": 5.068978786468506, "learning_rate": 3.1937799043062204e-06, "loss": 0.5247, "step": 781000 }, { "epoch": 17.988394584139265, "grad_norm": 2.891235828399658, "learning_rate": 3.1865083404837046e-06, "loss": 0.5135, "step": 781200 }, { "epoch": 17.992999907893523, "grad_norm": 3.192763566970825, "learning_rate": 3.179236776661189e-06, "loss": 0.5187, "step": 781400 }, { "epoch": 17.997605231647785, "grad_norm": 3.4980311393737793, "learning_rate": 3.171965212838673e-06, "loss": 0.5234, "step": 781600 }, { "epoch": 18.0, "eval_loss": 0.5081140398979187, "eval_runtime": 170.1048, "eval_samples_per_second": 166.727, "eval_steps_per_second": 10.423, "step": 781704 }, { "epoch": 18.002210555402044, "grad_norm": 2.9518814086914062, "learning_rate": 3.1646936490161577e-06, "loss": 0.5216, "step": 781800 }, { "epoch": 18.006815879156306, "grad_norm": 2.917123794555664, "learning_rate": 3.157422085193642e-06, "loss": 0.5168, "step": 782000 }, { "epoch": 18.011421202910565, "grad_norm": 3.122274875640869, "learning_rate": 3.150150521371126e-06, "loss": 0.511, "step": 782200 }, { "epoch": 18.016026526664824, "grad_norm": 3.571920394897461, "learning_rate": 3.1428789575486103e-06, "loss": 0.5183, "step": 782400 }, { "epoch": 18.020631850419086, "grad_norm": 3.9012715816497803, "learning_rate": 3.135607393726095e-06, "loss": 0.5222, "step": 782600 }, { "epoch": 18.025237174173345, "grad_norm": 3.0531198978424072, "learning_rate": 3.128335829903579e-06, "loss": 0.5064, "step": 782800 }, { "epoch": 18.029842497927604, "grad_norm": 3.265467405319214, "learning_rate": 3.121100623900176e-06, "loss": 0.5119, "step": 783000 }, { "epoch": 18.034447821681866, "grad_norm": 2.407761335372925, "learning_rate": 3.1138290600776603e-06, "loss": 0.5109, "step": 783200 }, { "epoch": 18.039053145436124, "grad_norm": 3.1489205360412598, "learning_rate": 3.1065574962551445e-06, "loss": 0.509, "step": 783400 }, { "epoch": 18.043658469190383, "grad_norm": 3.7624034881591797, "learning_rate": 3.099285932432629e-06, "loss": 0.5208, "step": 783600 }, { "epoch": 18.048263792944645, "grad_norm": 3.1477630138397217, "learning_rate": 3.0920143686101134e-06, "loss": 0.5228, "step": 783800 }, { "epoch": 18.052869116698904, "grad_norm": 3.3497445583343506, "learning_rate": 3.0847428047875976e-06, "loss": 0.5191, "step": 784000 }, { "epoch": 18.057474440453163, "grad_norm": 2.5725488662719727, "learning_rate": 3.077471240965082e-06, "loss": 0.5346, "step": 784200 }, { "epoch": 18.062079764207425, "grad_norm": 3.2691822052001953, "learning_rate": 3.0701996771425665e-06, "loss": 0.5029, "step": 784400 }, { "epoch": 18.066685087961684, "grad_norm": 3.542738914489746, "learning_rate": 3.0629281133200507e-06, "loss": 0.5097, "step": 784600 }, { "epoch": 18.071290411715943, "grad_norm": 3.2178702354431152, "learning_rate": 3.055656549497535e-06, "loss": 0.5153, "step": 784800 }, { "epoch": 18.075895735470205, "grad_norm": 3.4788858890533447, "learning_rate": 3.048384985675019e-06, "loss": 0.5136, "step": 785000 }, { "epoch": 18.080501059224463, "grad_norm": 2.703796625137329, "learning_rate": 3.041113421852504e-06, "loss": 0.525, "step": 785200 }, { "epoch": 18.085106382978722, "grad_norm": 3.2644095420837402, "learning_rate": 3.033841858029988e-06, "loss": 0.5144, "step": 785400 }, { "epoch": 18.089711706732984, "grad_norm": 3.0157806873321533, "learning_rate": 3.0265702942074722e-06, "loss": 0.5259, "step": 785600 }, { "epoch": 18.094317030487243, "grad_norm": 3.0737905502319336, "learning_rate": 3.0192987303849565e-06, "loss": 0.5122, "step": 785800 }, { "epoch": 18.098922354241502, "grad_norm": 2.9035747051239014, "learning_rate": 3.012027166562441e-06, "loss": 0.5151, "step": 786000 }, { "epoch": 18.103527677995764, "grad_norm": 2.6687934398651123, "learning_rate": 3.0047556027399253e-06, "loss": 0.516, "step": 786200 }, { "epoch": 18.108133001750023, "grad_norm": 3.1634881496429443, "learning_rate": 2.9974840389174095e-06, "loss": 0.5203, "step": 786400 }, { "epoch": 18.11273832550428, "grad_norm": 2.5920820236206055, "learning_rate": 2.9902124750948938e-06, "loss": 0.5052, "step": 786600 }, { "epoch": 18.117343649258544, "grad_norm": 3.145585536956787, "learning_rate": 2.9829409112723784e-06, "loss": 0.509, "step": 786800 }, { "epoch": 18.121948973012802, "grad_norm": 3.541355609893799, "learning_rate": 2.9756693474498626e-06, "loss": 0.5237, "step": 787000 }, { "epoch": 18.12655429676706, "grad_norm": 2.4166955947875977, "learning_rate": 2.968397783627347e-06, "loss": 0.508, "step": 787200 }, { "epoch": 18.131159620521323, "grad_norm": 3.0432989597320557, "learning_rate": 2.961126219804831e-06, "loss": 0.5131, "step": 787400 }, { "epoch": 18.135764944275582, "grad_norm": 2.8021647930145264, "learning_rate": 2.9538546559823157e-06, "loss": 0.5154, "step": 787600 }, { "epoch": 18.14037026802984, "grad_norm": 4.511200428009033, "learning_rate": 2.9465830921598e-06, "loss": 0.5155, "step": 787800 }, { "epoch": 18.144975591784103, "grad_norm": 3.441237449645996, "learning_rate": 2.939311528337284e-06, "loss": 0.5131, "step": 788000 }, { "epoch": 18.149580915538362, "grad_norm": 3.7599685192108154, "learning_rate": 2.9320399645147688e-06, "loss": 0.5169, "step": 788200 }, { "epoch": 18.15418623929262, "grad_norm": 3.4300057888031006, "learning_rate": 2.9248047585113653e-06, "loss": 0.5213, "step": 788400 }, { "epoch": 18.158791563046883, "grad_norm": 3.6189639568328857, "learning_rate": 2.91753319468885e-06, "loss": 0.5173, "step": 788600 }, { "epoch": 18.16339688680114, "grad_norm": 2.8627123832702637, "learning_rate": 2.9102616308663346e-06, "loss": 0.5261, "step": 788800 }, { "epoch": 18.168002210555404, "grad_norm": 3.8576009273529053, "learning_rate": 2.9029900670438183e-06, "loss": 0.52, "step": 789000 }, { "epoch": 18.172607534309662, "grad_norm": 3.4420907497406006, "learning_rate": 2.895718503221303e-06, "loss": 0.5262, "step": 789200 }, { "epoch": 18.17721285806392, "grad_norm": 3.3978118896484375, "learning_rate": 2.888446939398787e-06, "loss": 0.5168, "step": 789400 }, { "epoch": 18.181818181818183, "grad_norm": 2.9462685585021973, "learning_rate": 2.881175375576272e-06, "loss": 0.5105, "step": 789600 }, { "epoch": 18.186423505572442, "grad_norm": 2.9186863899230957, "learning_rate": 2.8739038117537556e-06, "loss": 0.5146, "step": 789800 }, { "epoch": 18.1910288293267, "grad_norm": 2.7235682010650635, "learning_rate": 2.8666322479312403e-06, "loss": 0.5206, "step": 790000 }, { "epoch": 18.195634153080963, "grad_norm": 3.430112361907959, "learning_rate": 2.8593606841087245e-06, "loss": 0.5105, "step": 790200 }, { "epoch": 18.200239476835222, "grad_norm": 3.3343279361724854, "learning_rate": 2.852089120286209e-06, "loss": 0.5144, "step": 790400 }, { "epoch": 18.20484480058948, "grad_norm": 3.0267772674560547, "learning_rate": 2.844817556463693e-06, "loss": 0.5233, "step": 790600 }, { "epoch": 18.209450124343743, "grad_norm": 3.3748552799224854, "learning_rate": 2.8375459926411776e-06, "loss": 0.5131, "step": 790800 }, { "epoch": 18.214055448098, "grad_norm": 2.5174360275268555, "learning_rate": 2.830274428818662e-06, "loss": 0.5182, "step": 791000 }, { "epoch": 18.21866077185226, "grad_norm": 3.363555669784546, "learning_rate": 2.8230028649961465e-06, "loss": 0.531, "step": 791200 }, { "epoch": 18.223266095606522, "grad_norm": 2.635622024536133, "learning_rate": 2.8157313011736302e-06, "loss": 0.5156, "step": 791400 }, { "epoch": 18.22787141936078, "grad_norm": 3.1038053035736084, "learning_rate": 2.808459737351115e-06, "loss": 0.52, "step": 791600 }, { "epoch": 18.23247674311504, "grad_norm": 3.0769765377044678, "learning_rate": 2.801188173528599e-06, "loss": 0.5156, "step": 791800 }, { "epoch": 18.237082066869302, "grad_norm": 2.5016119480133057, "learning_rate": 2.7939166097060838e-06, "loss": 0.5227, "step": 792000 }, { "epoch": 18.24168739062356, "grad_norm": 3.1567609310150146, "learning_rate": 2.7866450458835676e-06, "loss": 0.5215, "step": 792200 }, { "epoch": 18.24629271437782, "grad_norm": 3.602930784225464, "learning_rate": 2.779373482061052e-06, "loss": 0.5226, "step": 792400 }, { "epoch": 18.250898038132082, "grad_norm": 2.699720859527588, "learning_rate": 2.7721019182385364e-06, "loss": 0.5195, "step": 792600 }, { "epoch": 18.25550336188634, "grad_norm": 2.6667487621307373, "learning_rate": 2.764830354416021e-06, "loss": 0.5111, "step": 792800 }, { "epoch": 18.2601086856406, "grad_norm": 3.1403419971466064, "learning_rate": 2.757558790593505e-06, "loss": 0.5139, "step": 793000 }, { "epoch": 18.26471400939486, "grad_norm": 3.1445913314819336, "learning_rate": 2.7502872267709895e-06, "loss": 0.5169, "step": 793200 }, { "epoch": 18.26931933314912, "grad_norm": 3.3420560359954834, "learning_rate": 2.7430156629484737e-06, "loss": 0.5188, "step": 793400 }, { "epoch": 18.27392465690338, "grad_norm": 3.481611490249634, "learning_rate": 2.7357804569450706e-06, "loss": 0.51, "step": 793600 }, { "epoch": 18.27852998065764, "grad_norm": 3.8973641395568848, "learning_rate": 2.7285088931225553e-06, "loss": 0.5136, "step": 793800 }, { "epoch": 18.2831353044119, "grad_norm": 2.5481739044189453, "learning_rate": 2.721237329300039e-06, "loss": 0.5212, "step": 794000 }, { "epoch": 18.28774062816616, "grad_norm": 2.917550802230835, "learning_rate": 2.7139657654775237e-06, "loss": 0.5216, "step": 794200 }, { "epoch": 18.29234595192042, "grad_norm": 3.176706314086914, "learning_rate": 2.706694201655008e-06, "loss": 0.5028, "step": 794400 }, { "epoch": 18.29695127567468, "grad_norm": 3.2188303470611572, "learning_rate": 2.699458995651605e-06, "loss": 0.52, "step": 794600 }, { "epoch": 18.301556599428938, "grad_norm": 3.91021466255188, "learning_rate": 2.6921874318290895e-06, "loss": 0.5127, "step": 794800 }, { "epoch": 18.3061619231832, "grad_norm": 3.8334686756134033, "learning_rate": 2.6849158680065733e-06, "loss": 0.5062, "step": 795000 }, { "epoch": 18.31076724693746, "grad_norm": 3.018327236175537, "learning_rate": 2.677644304184058e-06, "loss": 0.5228, "step": 795200 }, { "epoch": 18.31537257069172, "grad_norm": 4.303511619567871, "learning_rate": 2.670372740361542e-06, "loss": 0.5138, "step": 795400 }, { "epoch": 18.31997789444598, "grad_norm": 3.589290142059326, "learning_rate": 2.6631011765390268e-06, "loss": 0.5281, "step": 795600 }, { "epoch": 18.32458321820024, "grad_norm": 2.7188146114349365, "learning_rate": 2.6558296127165106e-06, "loss": 0.5093, "step": 795800 }, { "epoch": 18.3291885419545, "grad_norm": 3.7704238891601562, "learning_rate": 2.6485580488939952e-06, "loss": 0.5106, "step": 796000 }, { "epoch": 18.33379386570876, "grad_norm": 2.9038262367248535, "learning_rate": 2.6412864850714794e-06, "loss": 0.5047, "step": 796200 }, { "epoch": 18.33839918946302, "grad_norm": 3.1424877643585205, "learning_rate": 2.634014921248964e-06, "loss": 0.5227, "step": 796400 }, { "epoch": 18.34300451321728, "grad_norm": 3.6267952919006348, "learning_rate": 2.626743357426448e-06, "loss": 0.5094, "step": 796600 }, { "epoch": 18.34760983697154, "grad_norm": 3.1225740909576416, "learning_rate": 2.6194717936039325e-06, "loss": 0.5222, "step": 796800 }, { "epoch": 18.352215160725798, "grad_norm": 3.5148890018463135, "learning_rate": 2.612200229781417e-06, "loss": 0.5134, "step": 797000 }, { "epoch": 18.35682048448006, "grad_norm": 3.0749411582946777, "learning_rate": 2.6049286659589014e-06, "loss": 0.5055, "step": 797200 }, { "epoch": 18.36142580823432, "grad_norm": 3.1104702949523926, "learning_rate": 2.5976571021363856e-06, "loss": 0.5224, "step": 797400 }, { "epoch": 18.366031131988578, "grad_norm": 4.1002020835876465, "learning_rate": 2.59038553831387e-06, "loss": 0.5125, "step": 797600 }, { "epoch": 18.37063645574284, "grad_norm": 3.385798692703247, "learning_rate": 2.5831139744913545e-06, "loss": 0.5238, "step": 797800 }, { "epoch": 18.3752417794971, "grad_norm": 2.667778253555298, "learning_rate": 2.5758424106688383e-06, "loss": 0.519, "step": 798000 }, { "epoch": 18.379847103251358, "grad_norm": 3.486220121383667, "learning_rate": 2.568570846846323e-06, "loss": 0.5123, "step": 798200 }, { "epoch": 18.38445242700562, "grad_norm": 2.9387404918670654, "learning_rate": 2.56133564084292e-06, "loss": 0.5075, "step": 798400 }, { "epoch": 18.38905775075988, "grad_norm": 3.13200044631958, "learning_rate": 2.554064077020404e-06, "loss": 0.5107, "step": 798600 }, { "epoch": 18.393663074514137, "grad_norm": 2.374018669128418, "learning_rate": 2.5467925131978887e-06, "loss": 0.5245, "step": 798800 }, { "epoch": 18.3982683982684, "grad_norm": 3.3281686305999756, "learning_rate": 2.539520949375373e-06, "loss": 0.5111, "step": 799000 }, { "epoch": 18.402873722022658, "grad_norm": 2.587111711502075, "learning_rate": 2.532249385552857e-06, "loss": 0.5144, "step": 799200 }, { "epoch": 18.407479045776917, "grad_norm": 2.7880938053131104, "learning_rate": 2.5249778217303413e-06, "loss": 0.5129, "step": 799400 }, { "epoch": 18.41208436953118, "grad_norm": 2.612704038619995, "learning_rate": 2.517706257907826e-06, "loss": 0.5169, "step": 799600 }, { "epoch": 18.416689693285438, "grad_norm": 3.7261786460876465, "learning_rate": 2.5104346940853098e-06, "loss": 0.5128, "step": 799800 }, { "epoch": 18.421295017039697, "grad_norm": 3.7250654697418213, "learning_rate": 2.5031631302627944e-06, "loss": 0.5108, "step": 800000 }, { "epoch": 18.42590034079396, "grad_norm": 3.5736753940582275, "learning_rate": 2.4958915664402786e-06, "loss": 0.5307, "step": 800200 }, { "epoch": 18.430505664548217, "grad_norm": 2.80611515045166, "learning_rate": 2.4886200026177633e-06, "loss": 0.5141, "step": 800400 }, { "epoch": 18.435110988302476, "grad_norm": 3.6763579845428467, "learning_rate": 2.481348438795247e-06, "loss": 0.5067, "step": 800600 }, { "epoch": 18.43971631205674, "grad_norm": 2.915642023086548, "learning_rate": 2.4740768749727317e-06, "loss": 0.5219, "step": 800800 }, { "epoch": 18.444321635810997, "grad_norm": 4.065909385681152, "learning_rate": 2.466805311150216e-06, "loss": 0.5083, "step": 801000 }, { "epoch": 18.448926959565256, "grad_norm": 3.1720340251922607, "learning_rate": 2.4595337473277006e-06, "loss": 0.5222, "step": 801200 }, { "epoch": 18.453532283319518, "grad_norm": 2.6975619792938232, "learning_rate": 2.4522621835051844e-06, "loss": 0.5067, "step": 801400 }, { "epoch": 18.458137607073777, "grad_norm": 2.7786128520965576, "learning_rate": 2.444990619682669e-06, "loss": 0.5181, "step": 801600 }, { "epoch": 18.462742930828036, "grad_norm": 2.5183842182159424, "learning_rate": 2.4377190558601532e-06, "loss": 0.5166, "step": 801800 }, { "epoch": 18.467348254582298, "grad_norm": 2.8333635330200195, "learning_rate": 2.430447492037638e-06, "loss": 0.5163, "step": 802000 }, { "epoch": 18.471953578336556, "grad_norm": 3.31788969039917, "learning_rate": 2.4231759282151217e-06, "loss": 0.518, "step": 802200 }, { "epoch": 18.476558902090815, "grad_norm": 3.1232898235321045, "learning_rate": 2.4159043643926063e-06, "loss": 0.5137, "step": 802400 }, { "epoch": 18.481164225845077, "grad_norm": 3.5303895473480225, "learning_rate": 2.4086691583892032e-06, "loss": 0.5241, "step": 802600 }, { "epoch": 18.485769549599336, "grad_norm": 2.972893714904785, "learning_rate": 2.4013975945666874e-06, "loss": 0.5078, "step": 802800 }, { "epoch": 18.4903748733536, "grad_norm": 2.6240293979644775, "learning_rate": 2.394126030744172e-06, "loss": 0.5166, "step": 803000 }, { "epoch": 18.494980197107857, "grad_norm": 2.6254336833953857, "learning_rate": 2.386890824740769e-06, "loss": 0.5195, "step": 803200 }, { "epoch": 18.499585520862116, "grad_norm": 3.5183451175689697, "learning_rate": 2.3796192609182528e-06, "loss": 0.5125, "step": 803400 }, { "epoch": 18.504190844616378, "grad_norm": 2.951514720916748, "learning_rate": 2.3723476970957374e-06, "loss": 0.515, "step": 803600 }, { "epoch": 18.508796168370637, "grad_norm": 3.723506450653076, "learning_rate": 2.3650761332732216e-06, "loss": 0.5108, "step": 803800 }, { "epoch": 18.513401492124895, "grad_norm": 2.8375089168548584, "learning_rate": 2.3578045694507063e-06, "loss": 0.5099, "step": 804000 }, { "epoch": 18.518006815879158, "grad_norm": 2.5292952060699463, "learning_rate": 2.35053300562819e-06, "loss": 0.5176, "step": 804200 }, { "epoch": 18.522612139633416, "grad_norm": 2.64688777923584, "learning_rate": 2.3432614418056747e-06, "loss": 0.5067, "step": 804400 }, { "epoch": 18.527217463387675, "grad_norm": 3.6783273220062256, "learning_rate": 2.335989877983159e-06, "loss": 0.5204, "step": 804600 }, { "epoch": 18.531822787141937, "grad_norm": 3.5552361011505127, "learning_rate": 2.3287183141606436e-06, "loss": 0.5043, "step": 804800 }, { "epoch": 18.536428110896196, "grad_norm": 3.2236578464508057, "learning_rate": 2.3214467503381274e-06, "loss": 0.5089, "step": 805000 }, { "epoch": 18.541033434650455, "grad_norm": 3.2282896041870117, "learning_rate": 2.314175186515612e-06, "loss": 0.5058, "step": 805200 }, { "epoch": 18.545638758404717, "grad_norm": 2.5554604530334473, "learning_rate": 2.3069036226930963e-06, "loss": 0.5093, "step": 805400 }, { "epoch": 18.550244082158976, "grad_norm": 3.2304649353027344, "learning_rate": 2.299632058870581e-06, "loss": 0.5146, "step": 805600 }, { "epoch": 18.554849405913235, "grad_norm": 4.473756790161133, "learning_rate": 2.292360495048065e-06, "loss": 0.505, "step": 805800 }, { "epoch": 18.559454729667497, "grad_norm": 3.0843703746795654, "learning_rate": 2.2850889312255493e-06, "loss": 0.5167, "step": 806000 }, { "epoch": 18.564060053421755, "grad_norm": 3.773874044418335, "learning_rate": 2.277817367403034e-06, "loss": 0.5102, "step": 806200 }, { "epoch": 18.568665377176014, "grad_norm": 3.845970630645752, "learning_rate": 2.270545803580518e-06, "loss": 0.5235, "step": 806400 }, { "epoch": 18.573270700930276, "grad_norm": 3.6182453632354736, "learning_rate": 2.2632742397580024e-06, "loss": 0.5057, "step": 806600 }, { "epoch": 18.577876024684535, "grad_norm": 3.738835573196411, "learning_rate": 2.2560026759354866e-06, "loss": 0.5207, "step": 806800 }, { "epoch": 18.582481348438794, "grad_norm": 3.5557377338409424, "learning_rate": 2.2487311121129713e-06, "loss": 0.529, "step": 807000 }, { "epoch": 18.587086672193056, "grad_norm": 3.059429168701172, "learning_rate": 2.2414595482904555e-06, "loss": 0.5075, "step": 807200 }, { "epoch": 18.591691995947315, "grad_norm": 3.12056303024292, "learning_rate": 2.2342243422870524e-06, "loss": 0.5114, "step": 807400 }, { "epoch": 18.596297319701574, "grad_norm": 3.1706645488739014, "learning_rate": 2.2269527784645366e-06, "loss": 0.5217, "step": 807600 }, { "epoch": 18.600902643455836, "grad_norm": 3.850172281265259, "learning_rate": 2.219681214642021e-06, "loss": 0.5137, "step": 807800 }, { "epoch": 18.605507967210094, "grad_norm": 3.080021858215332, "learning_rate": 2.2124096508195055e-06, "loss": 0.5139, "step": 808000 }, { "epoch": 18.610113290964353, "grad_norm": 2.29182505607605, "learning_rate": 2.2051380869969897e-06, "loss": 0.5026, "step": 808200 }, { "epoch": 18.614718614718615, "grad_norm": 3.1314103603363037, "learning_rate": 2.197866523174474e-06, "loss": 0.5123, "step": 808400 }, { "epoch": 18.619323938472874, "grad_norm": 3.334752082824707, "learning_rate": 2.190594959351958e-06, "loss": 0.5101, "step": 808600 }, { "epoch": 18.623929262227133, "grad_norm": 3.3216257095336914, "learning_rate": 2.1833233955294428e-06, "loss": 0.5094, "step": 808800 }, { "epoch": 18.628534585981395, "grad_norm": 2.6570966243743896, "learning_rate": 2.176051831706927e-06, "loss": 0.5118, "step": 809000 }, { "epoch": 18.633139909735654, "grad_norm": 3.783985137939453, "learning_rate": 2.1687802678844112e-06, "loss": 0.5226, "step": 809200 }, { "epoch": 18.637745233489916, "grad_norm": 3.1615185737609863, "learning_rate": 2.1615087040618954e-06, "loss": 0.5194, "step": 809400 }, { "epoch": 18.642350557244175, "grad_norm": 3.571136951446533, "learning_rate": 2.15423714023938e-06, "loss": 0.5248, "step": 809600 }, { "epoch": 18.646955880998433, "grad_norm": 2.769298553466797, "learning_rate": 2.147001934235977e-06, "loss": 0.5104, "step": 809800 }, { "epoch": 18.651561204752696, "grad_norm": 2.678037166595459, "learning_rate": 2.1397303704134612e-06, "loss": 0.5246, "step": 810000 }, { "epoch": 18.656166528506954, "grad_norm": 3.228646993637085, "learning_rate": 2.1324588065909454e-06, "loss": 0.502, "step": 810200 }, { "epoch": 18.660771852261213, "grad_norm": 2.839290142059326, "learning_rate": 2.1252236005875423e-06, "loss": 0.5132, "step": 810400 }, { "epoch": 18.665377176015475, "grad_norm": 2.865377187728882, "learning_rate": 2.1179520367650266e-06, "loss": 0.5063, "step": 810600 }, { "epoch": 18.669982499769734, "grad_norm": 4.583608150482178, "learning_rate": 2.110680472942511e-06, "loss": 0.5155, "step": 810800 }, { "epoch": 18.674587823523993, "grad_norm": 2.7023823261260986, "learning_rate": 2.1034089091199954e-06, "loss": 0.5033, "step": 811000 }, { "epoch": 18.679193147278255, "grad_norm": 2.9620158672332764, "learning_rate": 2.0961373452974796e-06, "loss": 0.5184, "step": 811200 }, { "epoch": 18.683798471032514, "grad_norm": 2.519291639328003, "learning_rate": 2.088865781474964e-06, "loss": 0.5126, "step": 811400 }, { "epoch": 18.688403794786772, "grad_norm": 2.743424415588379, "learning_rate": 2.0815942176524485e-06, "loss": 0.509, "step": 811600 }, { "epoch": 18.693009118541035, "grad_norm": 3.654670000076294, "learning_rate": 2.0743226538299327e-06, "loss": 0.5155, "step": 811800 }, { "epoch": 18.697614442295293, "grad_norm": 3.285017251968384, "learning_rate": 2.067051090007417e-06, "loss": 0.5126, "step": 812000 }, { "epoch": 18.702219766049552, "grad_norm": 2.268007755279541, "learning_rate": 2.059779526184901e-06, "loss": 0.5104, "step": 812200 }, { "epoch": 18.706825089803814, "grad_norm": 3.560014247894287, "learning_rate": 2.052507962362386e-06, "loss": 0.5294, "step": 812400 }, { "epoch": 18.711430413558073, "grad_norm": 2.835283041000366, "learning_rate": 2.04523639853987e-06, "loss": 0.5132, "step": 812600 }, { "epoch": 18.716035737312332, "grad_norm": 2.768789291381836, "learning_rate": 2.0379648347173542e-06, "loss": 0.5094, "step": 812800 }, { "epoch": 18.720641061066594, "grad_norm": 2.965252637863159, "learning_rate": 2.0306932708948385e-06, "loss": 0.5031, "step": 813000 }, { "epoch": 18.725246384820853, "grad_norm": 3.548333168029785, "learning_rate": 2.023421707072323e-06, "loss": 0.5152, "step": 813200 }, { "epoch": 18.72985170857511, "grad_norm": 2.307636260986328, "learning_rate": 2.0161501432498073e-06, "loss": 0.5115, "step": 813400 }, { "epoch": 18.734457032329374, "grad_norm": 3.0482430458068848, "learning_rate": 2.0088785794272915e-06, "loss": 0.5234, "step": 813600 }, { "epoch": 18.739062356083632, "grad_norm": 2.7281389236450195, "learning_rate": 2.0016070156047758e-06, "loss": 0.5183, "step": 813800 }, { "epoch": 18.74366767983789, "grad_norm": 3.274071216583252, "learning_rate": 1.9943354517822604e-06, "loss": 0.5144, "step": 814000 }, { "epoch": 18.748273003592153, "grad_norm": 4.017879486083984, "learning_rate": 1.9870638879597446e-06, "loss": 0.5162, "step": 814200 }, { "epoch": 18.752878327346412, "grad_norm": 3.111361265182495, "learning_rate": 1.9798286819563415e-06, "loss": 0.5183, "step": 814400 }, { "epoch": 18.75748365110067, "grad_norm": 2.9756317138671875, "learning_rate": 1.9725571181338258e-06, "loss": 0.516, "step": 814600 }, { "epoch": 18.762088974854933, "grad_norm": 2.925050973892212, "learning_rate": 1.96528555431131e-06, "loss": 0.5216, "step": 814800 }, { "epoch": 18.766694298609192, "grad_norm": 3.605431079864502, "learning_rate": 1.9580139904887946e-06, "loss": 0.5169, "step": 815000 }, { "epoch": 18.77129962236345, "grad_norm": 4.39565372467041, "learning_rate": 1.9507424266662793e-06, "loss": 0.5192, "step": 815200 }, { "epoch": 18.775904946117713, "grad_norm": 2.9125242233276367, "learning_rate": 1.943470862843763e-06, "loss": 0.5228, "step": 815400 }, { "epoch": 18.78051026987197, "grad_norm": 2.8098058700561523, "learning_rate": 1.9361992990212477e-06, "loss": 0.5089, "step": 815600 }, { "epoch": 18.78511559362623, "grad_norm": 3.1895477771759033, "learning_rate": 1.928927735198732e-06, "loss": 0.5062, "step": 815800 }, { "epoch": 18.789720917380492, "grad_norm": 3.4902753829956055, "learning_rate": 1.9216561713762166e-06, "loss": 0.5108, "step": 816000 }, { "epoch": 18.79432624113475, "grad_norm": 2.913203239440918, "learning_rate": 1.9143846075537004e-06, "loss": 0.5183, "step": 816200 }, { "epoch": 18.798931564889013, "grad_norm": 3.108332872390747, "learning_rate": 1.9071130437311848e-06, "loss": 0.514, "step": 816400 }, { "epoch": 18.803536888643272, "grad_norm": 3.0031442642211914, "learning_rate": 1.8998778377277817e-06, "loss": 0.5184, "step": 816600 }, { "epoch": 18.80814221239753, "grad_norm": 3.7675793170928955, "learning_rate": 1.8926062739052661e-06, "loss": 0.5141, "step": 816800 }, { "epoch": 18.812747536151793, "grad_norm": 3.3338944911956787, "learning_rate": 1.8853347100827506e-06, "loss": 0.5124, "step": 817000 }, { "epoch": 18.81735285990605, "grad_norm": 4.226090908050537, "learning_rate": 1.8780631462602346e-06, "loss": 0.5115, "step": 817200 }, { "epoch": 18.82195818366031, "grad_norm": 3.7023580074310303, "learning_rate": 1.870791582437719e-06, "loss": 0.5124, "step": 817400 }, { "epoch": 18.826563507414573, "grad_norm": 2.838956832885742, "learning_rate": 1.8635200186152034e-06, "loss": 0.524, "step": 817600 }, { "epoch": 18.83116883116883, "grad_norm": 3.4643635749816895, "learning_rate": 1.8562484547926876e-06, "loss": 0.5137, "step": 817800 }, { "epoch": 18.83577415492309, "grad_norm": 3.38789963722229, "learning_rate": 1.848976890970172e-06, "loss": 0.5007, "step": 818000 }, { "epoch": 18.840379478677352, "grad_norm": 3.213732957839966, "learning_rate": 1.8417053271476563e-06, "loss": 0.5106, "step": 818200 }, { "epoch": 18.84498480243161, "grad_norm": 3.5286900997161865, "learning_rate": 1.8344337633251407e-06, "loss": 0.5138, "step": 818400 }, { "epoch": 18.84959012618587, "grad_norm": 3.5109329223632812, "learning_rate": 1.827162199502625e-06, "loss": 0.5178, "step": 818600 }, { "epoch": 18.854195449940132, "grad_norm": 2.604468822479248, "learning_rate": 1.8198906356801094e-06, "loss": 0.5086, "step": 818800 }, { "epoch": 18.85880077369439, "grad_norm": 3.2741763591766357, "learning_rate": 1.8126190718575936e-06, "loss": 0.5152, "step": 819000 }, { "epoch": 18.86340609744865, "grad_norm": 3.35343337059021, "learning_rate": 1.8053475080350782e-06, "loss": 0.5163, "step": 819200 }, { "epoch": 18.86801142120291, "grad_norm": 3.6704483032226562, "learning_rate": 1.7980759442125625e-06, "loss": 0.5255, "step": 819400 }, { "epoch": 18.87261674495717, "grad_norm": 3.315521240234375, "learning_rate": 1.790804380390047e-06, "loss": 0.5222, "step": 819600 }, { "epoch": 18.87722206871143, "grad_norm": 2.896103858947754, "learning_rate": 1.7835328165675311e-06, "loss": 0.5166, "step": 819800 }, { "epoch": 18.88182739246569, "grad_norm": 2.6083381175994873, "learning_rate": 1.7762612527450155e-06, "loss": 0.5098, "step": 820000 }, { "epoch": 18.88643271621995, "grad_norm": 3.2741894721984863, "learning_rate": 1.7689896889224998e-06, "loss": 0.5153, "step": 820200 }, { "epoch": 18.89103803997421, "grad_norm": 2.9880030155181885, "learning_rate": 1.7617181250999842e-06, "loss": 0.5124, "step": 820400 }, { "epoch": 18.89564336372847, "grad_norm": 3.405291795730591, "learning_rate": 1.7544465612774684e-06, "loss": 0.518, "step": 820600 }, { "epoch": 18.90024868748273, "grad_norm": 2.607799768447876, "learning_rate": 1.7472113552740653e-06, "loss": 0.5123, "step": 820800 }, { "epoch": 18.90485401123699, "grad_norm": 3.2214035987854004, "learning_rate": 1.7399397914515498e-06, "loss": 0.5064, "step": 821000 }, { "epoch": 18.90945933499125, "grad_norm": 3.4306139945983887, "learning_rate": 1.7327045854481467e-06, "loss": 0.5062, "step": 821200 }, { "epoch": 18.91406465874551, "grad_norm": 2.69059681892395, "learning_rate": 1.7254330216256309e-06, "loss": 0.5194, "step": 821400 }, { "epoch": 18.918669982499768, "grad_norm": 3.644313097000122, "learning_rate": 1.7181614578031153e-06, "loss": 0.5043, "step": 821600 }, { "epoch": 18.92327530625403, "grad_norm": 3.1063473224639893, "learning_rate": 1.7108898939805995e-06, "loss": 0.5124, "step": 821800 }, { "epoch": 18.92788063000829, "grad_norm": 3.852067708969116, "learning_rate": 1.703618330158084e-06, "loss": 0.5106, "step": 822000 }, { "epoch": 18.932485953762548, "grad_norm": 2.8518900871276855, "learning_rate": 1.6963467663355682e-06, "loss": 0.5175, "step": 822200 }, { "epoch": 18.93709127751681, "grad_norm": 2.894487142562866, "learning_rate": 1.6890752025130526e-06, "loss": 0.5137, "step": 822400 }, { "epoch": 18.94169660127107, "grad_norm": 3.1942451000213623, "learning_rate": 1.6818036386905368e-06, "loss": 0.5127, "step": 822600 }, { "epoch": 18.94630192502533, "grad_norm": 3.005305051803589, "learning_rate": 1.6745320748680213e-06, "loss": 0.5046, "step": 822800 }, { "epoch": 18.95090724877959, "grad_norm": 4.5873284339904785, "learning_rate": 1.6672605110455055e-06, "loss": 0.5146, "step": 823000 }, { "epoch": 18.95551257253385, "grad_norm": 2.7985775470733643, "learning_rate": 1.65998894722299e-06, "loss": 0.5285, "step": 823200 }, { "epoch": 18.96011789628811, "grad_norm": 3.6909046173095703, "learning_rate": 1.6527173834004741e-06, "loss": 0.522, "step": 823400 }, { "epoch": 18.96472322004237, "grad_norm": 2.9598708152770996, "learning_rate": 1.6454458195779586e-06, "loss": 0.5164, "step": 823600 }, { "epoch": 18.969328543796628, "grad_norm": 3.4591517448425293, "learning_rate": 1.6381742557554428e-06, "loss": 0.5118, "step": 823800 }, { "epoch": 18.97393386755089, "grad_norm": 3.0720736980438232, "learning_rate": 1.6309026919329272e-06, "loss": 0.527, "step": 824000 }, { "epoch": 18.97853919130515, "grad_norm": 4.1211018562316895, "learning_rate": 1.6236311281104114e-06, "loss": 0.5107, "step": 824200 }, { "epoch": 18.983144515059408, "grad_norm": 2.694967746734619, "learning_rate": 1.6163595642878959e-06, "loss": 0.5063, "step": 824400 }, { "epoch": 18.98774983881367, "grad_norm": 2.379441022872925, "learning_rate": 1.60908800046538e-06, "loss": 0.5177, "step": 824600 }, { "epoch": 18.99235516256793, "grad_norm": 2.991995096206665, "learning_rate": 1.6018164366428645e-06, "loss": 0.5018, "step": 824800 }, { "epoch": 18.996960486322187, "grad_norm": 3.4043898582458496, "learning_rate": 1.5945448728203487e-06, "loss": 0.5125, "step": 825000 }, { "epoch": 19.0, "eval_loss": 0.5035088062286377, "eval_runtime": 162.1896, "eval_samples_per_second": 174.863, "eval_steps_per_second": 10.932, "step": 825132 }, { "epoch": 19.00156581007645, "grad_norm": 2.809699296951294, "learning_rate": 1.5873096668169456e-06, "loss": 0.5089, "step": 825200 }, { "epoch": 19.00617113383071, "grad_norm": 3.992568254470825, "learning_rate": 1.58003810299443e-06, "loss": 0.5059, "step": 825400 }, { "epoch": 19.010776457584967, "grad_norm": 2.535313129425049, "learning_rate": 1.5727665391719143e-06, "loss": 0.5081, "step": 825600 }, { "epoch": 19.01538178133923, "grad_norm": 2.8412795066833496, "learning_rate": 1.5654949753493987e-06, "loss": 0.521, "step": 825800 }, { "epoch": 19.019987105093488, "grad_norm": 3.4555296897888184, "learning_rate": 1.558223411526883e-06, "loss": 0.5154, "step": 826000 }, { "epoch": 19.024592428847747, "grad_norm": 3.3591201305389404, "learning_rate": 1.5509518477043674e-06, "loss": 0.5027, "step": 826200 }, { "epoch": 19.02919775260201, "grad_norm": 2.9588282108306885, "learning_rate": 1.5436802838818516e-06, "loss": 0.516, "step": 826400 }, { "epoch": 19.033803076356268, "grad_norm": 3.250680446624756, "learning_rate": 1.536408720059336e-06, "loss": 0.5068, "step": 826600 }, { "epoch": 19.038408400110526, "grad_norm": 2.998945713043213, "learning_rate": 1.5291371562368202e-06, "loss": 0.5137, "step": 826800 }, { "epoch": 19.04301372386479, "grad_norm": 2.949064254760742, "learning_rate": 1.5218655924143047e-06, "loss": 0.4984, "step": 827000 }, { "epoch": 19.047619047619047, "grad_norm": 3.1367082595825195, "learning_rate": 1.5146303864109016e-06, "loss": 0.5117, "step": 827200 }, { "epoch": 19.052224371373306, "grad_norm": 2.956386089324951, "learning_rate": 1.5073588225883858e-06, "loss": 0.5142, "step": 827400 }, { "epoch": 19.05682969512757, "grad_norm": 3.038003444671631, "learning_rate": 1.5000872587658702e-06, "loss": 0.4999, "step": 827600 }, { "epoch": 19.061435018881827, "grad_norm": 3.545823335647583, "learning_rate": 1.4928156949433545e-06, "loss": 0.5041, "step": 827800 }, { "epoch": 19.066040342636086, "grad_norm": 2.507877826690674, "learning_rate": 1.4855804889399514e-06, "loss": 0.5173, "step": 828000 }, { "epoch": 19.070645666390348, "grad_norm": 2.498446464538574, "learning_rate": 1.4783089251174358e-06, "loss": 0.5251, "step": 828200 }, { "epoch": 19.075250990144607, "grad_norm": 3.057725429534912, "learning_rate": 1.47103736129492e-06, "loss": 0.511, "step": 828400 }, { "epoch": 19.079856313898865, "grad_norm": 2.1749329566955566, "learning_rate": 1.4637657974724044e-06, "loss": 0.49, "step": 828600 }, { "epoch": 19.084461637653128, "grad_norm": 2.94429874420166, "learning_rate": 1.4564942336498887e-06, "loss": 0.5181, "step": 828800 }, { "epoch": 19.089066961407386, "grad_norm": 2.4639828205108643, "learning_rate": 1.449222669827373e-06, "loss": 0.5149, "step": 829000 }, { "epoch": 19.093672285161645, "grad_norm": 3.422637462615967, "learning_rate": 1.4419511060048575e-06, "loss": 0.5154, "step": 829200 }, { "epoch": 19.098277608915907, "grad_norm": 2.8384313583374023, "learning_rate": 1.434679542182342e-06, "loss": 0.505, "step": 829400 }, { "epoch": 19.102882932670166, "grad_norm": 2.945739984512329, "learning_rate": 1.4274079783598262e-06, "loss": 0.5124, "step": 829600 }, { "epoch": 19.10748825642443, "grad_norm": 2.952714204788208, "learning_rate": 1.4201364145373106e-06, "loss": 0.5124, "step": 829800 }, { "epoch": 19.112093580178687, "grad_norm": 4.0392680168151855, "learning_rate": 1.4128648507147948e-06, "loss": 0.5164, "step": 830000 }, { "epoch": 19.116698903932946, "grad_norm": 3.5149545669555664, "learning_rate": 1.4055932868922793e-06, "loss": 0.511, "step": 830200 }, { "epoch": 19.121304227687208, "grad_norm": 2.5348634719848633, "learning_rate": 1.3983217230697635e-06, "loss": 0.5078, "step": 830400 }, { "epoch": 19.125909551441467, "grad_norm": 3.143413782119751, "learning_rate": 1.3910865170663604e-06, "loss": 0.5182, "step": 830600 }, { "epoch": 19.130514875195725, "grad_norm": 2.6804118156433105, "learning_rate": 1.3838149532438448e-06, "loss": 0.5154, "step": 830800 }, { "epoch": 19.135120198949988, "grad_norm": 3.680730104446411, "learning_rate": 1.376543389421329e-06, "loss": 0.5083, "step": 831000 }, { "epoch": 19.139725522704246, "grad_norm": 3.507612943649292, "learning_rate": 1.369308183417926e-06, "loss": 0.5128, "step": 831200 }, { "epoch": 19.144330846458505, "grad_norm": 2.8536086082458496, "learning_rate": 1.3620366195954104e-06, "loss": 0.4996, "step": 831400 }, { "epoch": 19.148936170212767, "grad_norm": 3.133923053741455, "learning_rate": 1.3547650557728946e-06, "loss": 0.5172, "step": 831600 }, { "epoch": 19.153541493967026, "grad_norm": 3.9035439491271973, "learning_rate": 1.347493491950379e-06, "loss": 0.5043, "step": 831800 }, { "epoch": 19.158146817721285, "grad_norm": 2.622859477996826, "learning_rate": 1.3402219281278632e-06, "loss": 0.5147, "step": 832000 }, { "epoch": 19.162752141475547, "grad_norm": 3.7109525203704834, "learning_rate": 1.3329503643053477e-06, "loss": 0.5115, "step": 832200 }, { "epoch": 19.167357465229806, "grad_norm": 3.2106027603149414, "learning_rate": 1.325678800482832e-06, "loss": 0.5089, "step": 832400 }, { "epoch": 19.171962788984064, "grad_norm": 2.890758514404297, "learning_rate": 1.3184072366603163e-06, "loss": 0.5142, "step": 832600 }, { "epoch": 19.176568112738327, "grad_norm": 3.477639675140381, "learning_rate": 1.3111356728378005e-06, "loss": 0.5041, "step": 832800 }, { "epoch": 19.181173436492585, "grad_norm": 3.0593950748443604, "learning_rate": 1.303864109015285e-06, "loss": 0.5304, "step": 833000 }, { "epoch": 19.185778760246844, "grad_norm": 2.967453956604004, "learning_rate": 1.2965925451927692e-06, "loss": 0.5035, "step": 833200 }, { "epoch": 19.190384084001106, "grad_norm": 3.582881450653076, "learning_rate": 1.2893209813702536e-06, "loss": 0.5136, "step": 833400 }, { "epoch": 19.194989407755365, "grad_norm": 3.2862138748168945, "learning_rate": 1.2820494175477379e-06, "loss": 0.5052, "step": 833600 }, { "epoch": 19.199594731509624, "grad_norm": 3.457026720046997, "learning_rate": 1.2747778537252223e-06, "loss": 0.5166, "step": 833800 }, { "epoch": 19.204200055263886, "grad_norm": 3.373370885848999, "learning_rate": 1.2675062899027065e-06, "loss": 0.5075, "step": 834000 }, { "epoch": 19.208805379018145, "grad_norm": 3.317833185195923, "learning_rate": 1.260234726080191e-06, "loss": 0.5078, "step": 834200 }, { "epoch": 19.213410702772403, "grad_norm": 2.9720406532287598, "learning_rate": 1.2529631622576752e-06, "loss": 0.5074, "step": 834400 }, { "epoch": 19.218016026526666, "grad_norm": 3.7691690921783447, "learning_rate": 1.245727956254272e-06, "loss": 0.5205, "step": 834600 }, { "epoch": 19.222621350280924, "grad_norm": 3.1545891761779785, "learning_rate": 1.2384563924317565e-06, "loss": 0.5151, "step": 834800 }, { "epoch": 19.227226674035183, "grad_norm": 2.8010830879211426, "learning_rate": 1.2311848286092407e-06, "loss": 0.515, "step": 835000 }, { "epoch": 19.231831997789445, "grad_norm": 2.3824315071105957, "learning_rate": 1.2239132647867251e-06, "loss": 0.5145, "step": 835200 }, { "epoch": 19.236437321543704, "grad_norm": 3.0972163677215576, "learning_rate": 1.2166417009642094e-06, "loss": 0.5138, "step": 835400 }, { "epoch": 19.241042645297963, "grad_norm": 2.7618439197540283, "learning_rate": 1.2093701371416938e-06, "loss": 0.5125, "step": 835600 }, { "epoch": 19.245647969052225, "grad_norm": 3.029303789138794, "learning_rate": 1.202098573319178e-06, "loss": 0.5098, "step": 835800 }, { "epoch": 19.250253292806484, "grad_norm": 3.3938510417938232, "learning_rate": 1.1948270094966622e-06, "loss": 0.5101, "step": 836000 }, { "epoch": 19.254858616560742, "grad_norm": 3.0954368114471436, "learning_rate": 1.1875554456741467e-06, "loss": 0.5129, "step": 836200 }, { "epoch": 19.259463940315005, "grad_norm": 2.748145818710327, "learning_rate": 1.1802838818516309e-06, "loss": 0.517, "step": 836400 }, { "epoch": 19.264069264069263, "grad_norm": 3.037444829940796, "learning_rate": 1.1730123180291153e-06, "loss": 0.5031, "step": 836600 }, { "epoch": 19.268674587823526, "grad_norm": 2.792266845703125, "learning_rate": 1.1657407542065995e-06, "loss": 0.5045, "step": 836800 }, { "epoch": 19.273279911577784, "grad_norm": 2.9860918521881104, "learning_rate": 1.158469190384084e-06, "loss": 0.5285, "step": 837000 }, { "epoch": 19.277885235332043, "grad_norm": 3.5693347454071045, "learning_rate": 1.1511976265615682e-06, "loss": 0.5165, "step": 837200 }, { "epoch": 19.282490559086305, "grad_norm": 2.5332469940185547, "learning_rate": 1.1439260627390526e-06, "loss": 0.5243, "step": 837400 }, { "epoch": 19.287095882840564, "grad_norm": 3.3224828243255615, "learning_rate": 1.1366544989165368e-06, "loss": 0.5106, "step": 837600 }, { "epoch": 19.291701206594823, "grad_norm": 3.539848804473877, "learning_rate": 1.1293829350940213e-06, "loss": 0.516, "step": 837800 }, { "epoch": 19.296306530349085, "grad_norm": 4.043239116668701, "learning_rate": 1.1221113712715057e-06, "loss": 0.5165, "step": 838000 }, { "epoch": 19.300911854103344, "grad_norm": 3.046630620956421, "learning_rate": 1.1148398074489901e-06, "loss": 0.5079, "step": 838200 }, { "epoch": 19.305517177857602, "grad_norm": 2.7998993396759033, "learning_rate": 1.1075682436264743e-06, "loss": 0.5248, "step": 838400 }, { "epoch": 19.310122501611865, "grad_norm": 3.3862979412078857, "learning_rate": 1.1002966798039588e-06, "loss": 0.5173, "step": 838600 }, { "epoch": 19.314727825366123, "grad_norm": 3.7672178745269775, "learning_rate": 1.093025115981443e-06, "loss": 0.5157, "step": 838800 }, { "epoch": 19.319333149120382, "grad_norm": 3.1289238929748535, "learning_rate": 1.0857535521589274e-06, "loss": 0.5193, "step": 839000 }, { "epoch": 19.323938472874644, "grad_norm": 3.5175318717956543, "learning_rate": 1.0784819883364116e-06, "loss": 0.5246, "step": 839200 }, { "epoch": 19.328543796628903, "grad_norm": 2.3767151832580566, "learning_rate": 1.071210424513896e-06, "loss": 0.5068, "step": 839400 }, { "epoch": 19.333149120383162, "grad_norm": 2.1531851291656494, "learning_rate": 1.0639388606913803e-06, "loss": 0.5172, "step": 839600 }, { "epoch": 19.337754444137424, "grad_norm": 3.3230369091033936, "learning_rate": 1.0566672968688647e-06, "loss": 0.5105, "step": 839800 }, { "epoch": 19.342359767891683, "grad_norm": 3.470402956008911, "learning_rate": 1.049395733046349e-06, "loss": 0.5121, "step": 840000 }, { "epoch": 19.34696509164594, "grad_norm": 3.0232300758361816, "learning_rate": 1.0421241692238334e-06, "loss": 0.5038, "step": 840200 }, { "epoch": 19.351570415400204, "grad_norm": 2.391409397125244, "learning_rate": 1.0348526054013176e-06, "loss": 0.5104, "step": 840400 }, { "epoch": 19.356175739154462, "grad_norm": 2.891075849533081, "learning_rate": 1.027581041578802e-06, "loss": 0.5088, "step": 840600 }, { "epoch": 19.36078106290872, "grad_norm": 3.1132750511169434, "learning_rate": 1.0203094777562862e-06, "loss": 0.5115, "step": 840800 }, { "epoch": 19.365386386662983, "grad_norm": 2.8549110889434814, "learning_rate": 1.0130742717528832e-06, "loss": 0.5142, "step": 841000 }, { "epoch": 19.369991710417242, "grad_norm": 3.4312000274658203, "learning_rate": 1.0058027079303676e-06, "loss": 0.5001, "step": 841200 }, { "epoch": 19.3745970341715, "grad_norm": 3.5737624168395996, "learning_rate": 9.985675019269645e-07, "loss": 0.5036, "step": 841400 }, { "epoch": 19.379202357925763, "grad_norm": 2.611541748046875, "learning_rate": 9.912959381044487e-07, "loss": 0.5061, "step": 841600 }, { "epoch": 19.38380768168002, "grad_norm": 3.1176815032958984, "learning_rate": 9.840243742819331e-07, "loss": 0.5011, "step": 841800 }, { "epoch": 19.38841300543428, "grad_norm": 3.4241085052490234, "learning_rate": 9.767528104594174e-07, "loss": 0.5101, "step": 842000 }, { "epoch": 19.393018329188543, "grad_norm": 3.769061803817749, "learning_rate": 9.694812466369018e-07, "loss": 0.5212, "step": 842200 }, { "epoch": 19.3976236529428, "grad_norm": 3.25093674659729, "learning_rate": 9.62209682814386e-07, "loss": 0.5043, "step": 842400 }, { "epoch": 19.40222897669706, "grad_norm": 3.0781145095825195, "learning_rate": 9.549381189918704e-07, "loss": 0.5031, "step": 842600 }, { "epoch": 19.406834300451322, "grad_norm": 2.676129102706909, "learning_rate": 9.476665551693547e-07, "loss": 0.5187, "step": 842800 }, { "epoch": 19.41143962420558, "grad_norm": 3.2870798110961914, "learning_rate": 9.403949913468391e-07, "loss": 0.5144, "step": 843000 }, { "epoch": 19.41604494795984, "grad_norm": 2.994854211807251, "learning_rate": 9.331234275243234e-07, "loss": 0.5111, "step": 843200 }, { "epoch": 19.420650271714102, "grad_norm": 4.02811336517334, "learning_rate": 9.258882215209203e-07, "loss": 0.5123, "step": 843400 }, { "epoch": 19.42525559546836, "grad_norm": 2.6425375938415527, "learning_rate": 9.186166576984047e-07, "loss": 0.5134, "step": 843600 }, { "epoch": 19.429860919222623, "grad_norm": 3.219266891479492, "learning_rate": 9.11345093875889e-07, "loss": 0.5197, "step": 843800 }, { "epoch": 19.43446624297688, "grad_norm": 3.178544282913208, "learning_rate": 9.040735300533733e-07, "loss": 0.5238, "step": 844000 }, { "epoch": 19.43907156673114, "grad_norm": 2.9460504055023193, "learning_rate": 8.968019662308576e-07, "loss": 0.5171, "step": 844200 }, { "epoch": 19.443676890485403, "grad_norm": 2.947618246078491, "learning_rate": 8.89530402408342e-07, "loss": 0.5113, "step": 844400 }, { "epoch": 19.44828221423966, "grad_norm": 3.1434247493743896, "learning_rate": 8.822588385858263e-07, "loss": 0.5121, "step": 844600 }, { "epoch": 19.45288753799392, "grad_norm": 4.052412986755371, "learning_rate": 8.749872747633106e-07, "loss": 0.5132, "step": 844800 }, { "epoch": 19.457492861748182, "grad_norm": 2.8195929527282715, "learning_rate": 8.677157109407949e-07, "loss": 0.5209, "step": 845000 }, { "epoch": 19.46209818550244, "grad_norm": 2.5590524673461914, "learning_rate": 8.604805049373918e-07, "loss": 0.5102, "step": 845200 }, { "epoch": 19.4667035092567, "grad_norm": 2.98577618598938, "learning_rate": 8.532089411148762e-07, "loss": 0.5053, "step": 845400 }, { "epoch": 19.471308833010962, "grad_norm": 3.019172430038452, "learning_rate": 8.459373772923605e-07, "loss": 0.5076, "step": 845600 }, { "epoch": 19.47591415676522, "grad_norm": 3.0066978931427, "learning_rate": 8.386658134698449e-07, "loss": 0.5134, "step": 845800 }, { "epoch": 19.48051948051948, "grad_norm": 2.8860740661621094, "learning_rate": 8.313942496473292e-07, "loss": 0.5171, "step": 846000 }, { "epoch": 19.48512480427374, "grad_norm": 3.4044198989868164, "learning_rate": 8.241226858248136e-07, "loss": 0.5019, "step": 846200 }, { "epoch": 19.489730128028, "grad_norm": 2.7292261123657227, "learning_rate": 8.168511220022979e-07, "loss": 0.5093, "step": 846400 }, { "epoch": 19.49433545178226, "grad_norm": 3.8156752586364746, "learning_rate": 8.095795581797822e-07, "loss": 0.5105, "step": 846600 }, { "epoch": 19.49894077553652, "grad_norm": 3.2243752479553223, "learning_rate": 8.023079943572666e-07, "loss": 0.5104, "step": 846800 }, { "epoch": 19.50354609929078, "grad_norm": 3.429675340652466, "learning_rate": 7.950364305347509e-07, "loss": 0.5064, "step": 847000 }, { "epoch": 19.50815142304504, "grad_norm": 2.856170415878296, "learning_rate": 7.877648667122352e-07, "loss": 0.5156, "step": 847200 }, { "epoch": 19.5127567467993, "grad_norm": 3.27316951751709, "learning_rate": 7.804933028897195e-07, "loss": 0.4983, "step": 847400 }, { "epoch": 19.51736207055356, "grad_norm": 3.1368207931518555, "learning_rate": 7.732217390672039e-07, "loss": 0.5118, "step": 847600 }, { "epoch": 19.52196739430782, "grad_norm": 2.677781581878662, "learning_rate": 7.659501752446882e-07, "loss": 0.5106, "step": 847800 }, { "epoch": 19.52657271806208, "grad_norm": 2.9507906436920166, "learning_rate": 7.586786114221725e-07, "loss": 0.5203, "step": 848000 }, { "epoch": 19.53117804181634, "grad_norm": 3.28304123878479, "learning_rate": 7.514070475996568e-07, "loss": 0.5062, "step": 848200 }, { "epoch": 19.535783365570598, "grad_norm": 3.1453018188476562, "learning_rate": 7.44135483777141e-07, "loss": 0.5085, "step": 848400 }, { "epoch": 19.54038868932486, "grad_norm": 2.7345168590545654, "learning_rate": 7.368639199546254e-07, "loss": 0.5128, "step": 848600 }, { "epoch": 19.54499401307912, "grad_norm": 3.114023447036743, "learning_rate": 7.295923561321097e-07, "loss": 0.5089, "step": 848800 }, { "epoch": 19.549599336833378, "grad_norm": 2.9676406383514404, "learning_rate": 7.22320792309594e-07, "loss": 0.5033, "step": 849000 }, { "epoch": 19.55420466058764, "grad_norm": 3.178053855895996, "learning_rate": 7.150492284870785e-07, "loss": 0.5015, "step": 849200 }, { "epoch": 19.5588099843419, "grad_norm": 2.661820411682129, "learning_rate": 7.077776646645628e-07, "loss": 0.5122, "step": 849400 }, { "epoch": 19.563415308096157, "grad_norm": 3.9830830097198486, "learning_rate": 7.005061008420471e-07, "loss": 0.5087, "step": 849600 }, { "epoch": 19.56802063185042, "grad_norm": 2.1384353637695312, "learning_rate": 6.932345370195314e-07, "loss": 0.5082, "step": 849800 }, { "epoch": 19.57262595560468, "grad_norm": 3.7364590167999268, "learning_rate": 6.859629731970158e-07, "loss": 0.4953, "step": 850000 }, { "epoch": 19.57723127935894, "grad_norm": 2.6958413124084473, "learning_rate": 6.786914093745001e-07, "loss": 0.5113, "step": 850200 }, { "epoch": 19.5818366031132, "grad_norm": 3.8758256435394287, "learning_rate": 6.714198455519844e-07, "loss": 0.507, "step": 850400 }, { "epoch": 19.586441926867458, "grad_norm": 2.9180338382720947, "learning_rate": 6.641482817294687e-07, "loss": 0.519, "step": 850600 }, { "epoch": 19.59104725062172, "grad_norm": 2.6355738639831543, "learning_rate": 6.568767179069531e-07, "loss": 0.4985, "step": 850800 }, { "epoch": 19.59565257437598, "grad_norm": 2.67168927192688, "learning_rate": 6.4964151190355e-07, "loss": 0.5045, "step": 851000 }, { "epoch": 19.600257898130238, "grad_norm": 4.244370460510254, "learning_rate": 6.423699480810343e-07, "loss": 0.5072, "step": 851200 }, { "epoch": 19.6048632218845, "grad_norm": 3.0310142040252686, "learning_rate": 6.350983842585186e-07, "loss": 0.5097, "step": 851400 }, { "epoch": 19.60946854563876, "grad_norm": 3.2820241451263428, "learning_rate": 6.278268204360029e-07, "loss": 0.5038, "step": 851600 }, { "epoch": 19.614073869393017, "grad_norm": 3.4763691425323486, "learning_rate": 6.205552566134873e-07, "loss": 0.5262, "step": 851800 }, { "epoch": 19.61867919314728, "grad_norm": 3.8789632320404053, "learning_rate": 6.132836927909716e-07, "loss": 0.5077, "step": 852000 }, { "epoch": 19.62328451690154, "grad_norm": 2.8242287635803223, "learning_rate": 6.060121289684559e-07, "loss": 0.5073, "step": 852200 }, { "epoch": 19.627889840655797, "grad_norm": 3.055107593536377, "learning_rate": 5.987405651459402e-07, "loss": 0.5079, "step": 852400 }, { "epoch": 19.63249516441006, "grad_norm": 3.6184749603271484, "learning_rate": 5.914690013234246e-07, "loss": 0.5184, "step": 852600 }, { "epoch": 19.637100488164318, "grad_norm": 2.965026378631592, "learning_rate": 5.841974375009089e-07, "loss": 0.5054, "step": 852800 }, { "epoch": 19.641705811918577, "grad_norm": 2.952613115310669, "learning_rate": 5.769258736783932e-07, "loss": 0.5095, "step": 853000 }, { "epoch": 19.64631113567284, "grad_norm": 3.1526806354522705, "learning_rate": 5.696543098558776e-07, "loss": 0.501, "step": 853200 }, { "epoch": 19.650916459427098, "grad_norm": 2.6503217220306396, "learning_rate": 5.62382746033362e-07, "loss": 0.5078, "step": 853400 }, { "epoch": 19.655521783181356, "grad_norm": 3.481039524078369, "learning_rate": 5.551111822108463e-07, "loss": 0.5219, "step": 853600 }, { "epoch": 19.66012710693562, "grad_norm": 2.944347858428955, "learning_rate": 5.478396183883306e-07, "loss": 0.5095, "step": 853800 }, { "epoch": 19.664732430689877, "grad_norm": 3.2116689682006836, "learning_rate": 5.40568054565815e-07, "loss": 0.5008, "step": 854000 }, { "epoch": 19.669337754444136, "grad_norm": 2.333085775375366, "learning_rate": 5.332964907432993e-07, "loss": 0.5028, "step": 854200 }, { "epoch": 19.6739430781984, "grad_norm": 3.2551276683807373, "learning_rate": 5.260249269207836e-07, "loss": 0.5081, "step": 854400 }, { "epoch": 19.678548401952657, "grad_norm": 2.8687589168548584, "learning_rate": 5.187533630982679e-07, "loss": 0.5078, "step": 854600 }, { "epoch": 19.683153725706916, "grad_norm": 2.7332775592803955, "learning_rate": 5.114817992757523e-07, "loss": 0.5065, "step": 854800 }, { "epoch": 19.687759049461178, "grad_norm": 3.2149274349212646, "learning_rate": 5.042465932723492e-07, "loss": 0.5121, "step": 855000 }, { "epoch": 19.692364373215437, "grad_norm": 3.2051103115081787, "learning_rate": 4.969750294498335e-07, "loss": 0.5176, "step": 855200 }, { "epoch": 19.696969696969695, "grad_norm": 4.147640228271484, "learning_rate": 4.897034656273178e-07, "loss": 0.4981, "step": 855400 }, { "epoch": 19.701575020723958, "grad_norm": 2.6071434020996094, "learning_rate": 4.824319018048021e-07, "loss": 0.519, "step": 855600 }, { "epoch": 19.706180344478216, "grad_norm": 2.8884124755859375, "learning_rate": 4.7516033798228646e-07, "loss": 0.5043, "step": 855800 }, { "epoch": 19.710785668232475, "grad_norm": 3.735039234161377, "learning_rate": 4.678887741597708e-07, "loss": 0.5042, "step": 856000 }, { "epoch": 19.715390991986737, "grad_norm": 2.957726001739502, "learning_rate": 4.606172103372551e-07, "loss": 0.5158, "step": 856200 }, { "epoch": 19.719996315740996, "grad_norm": 2.8788440227508545, "learning_rate": 4.533456465147395e-07, "loss": 0.5084, "step": 856400 }, { "epoch": 19.724601639495255, "grad_norm": 3.8605153560638428, "learning_rate": 4.460740826922238e-07, "loss": 0.5201, "step": 856600 }, { "epoch": 19.729206963249517, "grad_norm": 3.0567197799682617, "learning_rate": 4.3880251886970814e-07, "loss": 0.4976, "step": 856800 }, { "epoch": 19.733812287003776, "grad_norm": 3.3743460178375244, "learning_rate": 4.3153095504719246e-07, "loss": 0.509, "step": 857000 }, { "epoch": 19.738417610758034, "grad_norm": 2.7658159732818604, "learning_rate": 4.242593912246768e-07, "loss": 0.5259, "step": 857200 }, { "epoch": 19.743022934512297, "grad_norm": 3.7195098400115967, "learning_rate": 4.170241852212737e-07, "loss": 0.5088, "step": 857400 }, { "epoch": 19.747628258266555, "grad_norm": 2.6873762607574463, "learning_rate": 4.09752621398758e-07, "loss": 0.5168, "step": 857600 }, { "epoch": 19.752233582020818, "grad_norm": 2.6738898754119873, "learning_rate": 4.0248105757624235e-07, "loss": 0.5081, "step": 857800 }, { "epoch": 19.756838905775076, "grad_norm": 2.91037917137146, "learning_rate": 3.9520949375372667e-07, "loss": 0.5066, "step": 858000 }, { "epoch": 19.761444229529335, "grad_norm": 2.883091926574707, "learning_rate": 3.87937929931211e-07, "loss": 0.5125, "step": 858200 }, { "epoch": 19.766049553283597, "grad_norm": 2.677788257598877, "learning_rate": 3.806663661086953e-07, "loss": 0.5051, "step": 858400 }, { "epoch": 19.770654877037856, "grad_norm": 3.4007511138916016, "learning_rate": 3.7339480228617965e-07, "loss": 0.5061, "step": 858600 }, { "epoch": 19.775260200792115, "grad_norm": 3.597402572631836, "learning_rate": 3.6612323846366403e-07, "loss": 0.5014, "step": 858800 }, { "epoch": 19.779865524546377, "grad_norm": 3.0766663551330566, "learning_rate": 3.5885167464114835e-07, "loss": 0.5205, "step": 859000 }, { "epoch": 19.784470848300636, "grad_norm": 3.5148487091064453, "learning_rate": 3.516164686377452e-07, "loss": 0.5238, "step": 859200 }, { "epoch": 19.789076172054894, "grad_norm": 2.736527681350708, "learning_rate": 3.4434490481522953e-07, "loss": 0.5033, "step": 859400 }, { "epoch": 19.793681495809157, "grad_norm": 3.351614236831665, "learning_rate": 3.370733409927139e-07, "loss": 0.5144, "step": 859600 }, { "epoch": 19.798286819563415, "grad_norm": 2.9245457649230957, "learning_rate": 3.2980177717019824e-07, "loss": 0.5079, "step": 859800 }, { "epoch": 19.802892143317674, "grad_norm": 2.770165205001831, "learning_rate": 3.2253021334768256e-07, "loss": 0.5054, "step": 860000 }, { "epoch": 19.807497467071936, "grad_norm": 3.2325165271759033, "learning_rate": 3.152586495251669e-07, "loss": 0.5162, "step": 860200 }, { "epoch": 19.812102790826195, "grad_norm": 2.7553768157958984, "learning_rate": 3.079870857026512e-07, "loss": 0.5005, "step": 860400 }, { "epoch": 19.816708114580454, "grad_norm": 3.5754334926605225, "learning_rate": 3.0071552188013554e-07, "loss": 0.5289, "step": 860600 }, { "epoch": 19.821313438334716, "grad_norm": 3.362692356109619, "learning_rate": 2.9344395805761986e-07, "loss": 0.5098, "step": 860800 }, { "epoch": 19.825918762088975, "grad_norm": 2.9253594875335693, "learning_rate": 2.861723942351042e-07, "loss": 0.5011, "step": 861000 }, { "epoch": 19.830524085843233, "grad_norm": 3.8785996437072754, "learning_rate": 2.7890083041258857e-07, "loss": 0.5125, "step": 861200 }, { "epoch": 19.835129409597496, "grad_norm": 3.061610698699951, "learning_rate": 2.716292665900729e-07, "loss": 0.5266, "step": 861400 }, { "epoch": 19.839734733351754, "grad_norm": 3.2867627143859863, "learning_rate": 2.643577027675572e-07, "loss": 0.5109, "step": 861600 }, { "epoch": 19.844340057106013, "grad_norm": 3.554579496383667, "learning_rate": 2.5708613894504154e-07, "loss": 0.5218, "step": 861800 }, { "epoch": 19.848945380860275, "grad_norm": 3.178513288497925, "learning_rate": 2.4981457512252587e-07, "loss": 0.5147, "step": 862000 }, { "epoch": 19.853550704614534, "grad_norm": 3.356754779815674, "learning_rate": 2.425430113000102e-07, "loss": 0.5141, "step": 862200 }, { "epoch": 19.858156028368793, "grad_norm": 2.666679859161377, "learning_rate": 2.3527144747749452e-07, "loss": 0.5183, "step": 862400 }, { "epoch": 19.862761352123055, "grad_norm": 3.7374143600463867, "learning_rate": 2.2799988365497884e-07, "loss": 0.5166, "step": 862600 }, { "epoch": 19.867366675877314, "grad_norm": 3.064572334289551, "learning_rate": 2.2072831983246317e-07, "loss": 0.5002, "step": 862800 }, { "epoch": 19.871971999631572, "grad_norm": 3.1594460010528564, "learning_rate": 2.134567560099475e-07, "loss": 0.5128, "step": 863000 }, { "epoch": 19.876577323385835, "grad_norm": 3.091594934463501, "learning_rate": 2.0618519218743182e-07, "loss": 0.5092, "step": 863200 }, { "epoch": 19.881182647140093, "grad_norm": 3.6168246269226074, "learning_rate": 1.9891362836491617e-07, "loss": 0.5154, "step": 863400 }, { "epoch": 19.885787970894356, "grad_norm": 3.3390212059020996, "learning_rate": 1.916420645424005e-07, "loss": 0.5118, "step": 863600 }, { "epoch": 19.890393294648614, "grad_norm": 2.9384098052978516, "learning_rate": 1.8437050071988482e-07, "loss": 0.4973, "step": 863800 }, { "epoch": 19.894998618402873, "grad_norm": 3.3351097106933594, "learning_rate": 1.7713529471648173e-07, "loss": 0.5152, "step": 864000 }, { "epoch": 19.899603942157135, "grad_norm": 3.4556448459625244, "learning_rate": 1.6986373089396606e-07, "loss": 0.5134, "step": 864200 }, { "epoch": 19.904209265911394, "grad_norm": 3.008500337600708, "learning_rate": 1.6262852489056296e-07, "loss": 0.5085, "step": 864400 }, { "epoch": 19.908814589665653, "grad_norm": 2.8039371967315674, "learning_rate": 1.5535696106804732e-07, "loss": 0.513, "step": 864600 }, { "epoch": 19.913419913419915, "grad_norm": 4.19392204284668, "learning_rate": 1.4808539724553164e-07, "loss": 0.5077, "step": 864800 }, { "epoch": 19.918025237174174, "grad_norm": 3.3566884994506836, "learning_rate": 1.4081383342301597e-07, "loss": 0.5156, "step": 865000 }, { "epoch": 19.922630560928432, "grad_norm": 3.012193202972412, "learning_rate": 1.335422696005003e-07, "loss": 0.5019, "step": 865200 }, { "epoch": 19.927235884682695, "grad_norm": 3.2162373065948486, "learning_rate": 1.2627070577798462e-07, "loss": 0.5135, "step": 865400 }, { "epoch": 19.931841208436953, "grad_norm": 3.0207014083862305, "learning_rate": 1.1903549977458152e-07, "loss": 0.506, "step": 865600 }, { "epoch": 19.936446532191212, "grad_norm": 3.044567823410034, "learning_rate": 1.1176393595206585e-07, "loss": 0.5098, "step": 865800 }, { "epoch": 19.941051855945474, "grad_norm": 2.884814977645874, "learning_rate": 1.0449237212955019e-07, "loss": 0.5142, "step": 866000 }, { "epoch": 19.945657179699733, "grad_norm": 3.6238908767700195, "learning_rate": 9.722080830703451e-08, "loss": 0.5142, "step": 866200 }, { "epoch": 19.95026250345399, "grad_norm": 2.9918384552001953, "learning_rate": 8.994924448451885e-08, "loss": 0.518, "step": 866400 }, { "epoch": 19.954867827208254, "grad_norm": 3.4772942066192627, "learning_rate": 8.267768066200316e-08, "loss": 0.5172, "step": 866600 }, { "epoch": 19.959473150962513, "grad_norm": 3.1812825202941895, "learning_rate": 7.54061168394875e-08, "loss": 0.5074, "step": 866800 }, { "epoch": 19.96407847471677, "grad_norm": 3.3182106018066406, "learning_rate": 6.813455301697183e-08, "loss": 0.5054, "step": 867000 }, { "epoch": 19.968683798471034, "grad_norm": 3.577573299407959, "learning_rate": 6.086298919445617e-08, "loss": 0.5108, "step": 867200 }, { "epoch": 19.973289122225292, "grad_norm": 3.277371644973755, "learning_rate": 5.359142537194049e-08, "loss": 0.5053, "step": 867400 }, { "epoch": 19.97789444597955, "grad_norm": 2.689061164855957, "learning_rate": 4.631986154942482e-08, "loss": 0.516, "step": 867600 }, { "epoch": 19.982499769733813, "grad_norm": 2.8559155464172363, "learning_rate": 3.904829772690915e-08, "loss": 0.5057, "step": 867800 }, { "epoch": 19.987105093488072, "grad_norm": 3.4477171897888184, "learning_rate": 3.177673390439348e-08, "loss": 0.5065, "step": 868000 }, { "epoch": 19.99171041724233, "grad_norm": 3.181300163269043, "learning_rate": 2.450517008187781e-08, "loss": 0.5202, "step": 868200 }, { "epoch": 19.996315740996593, "grad_norm": 2.9424209594726562, "learning_rate": 1.723360625936214e-08, "loss": 0.5106, "step": 868400 }, { "epoch": 20.0, "eval_loss": 0.5036933422088623, "eval_runtime": 162.2891, "eval_samples_per_second": 174.756, "eval_steps_per_second": 10.925, "step": 868560 } ], "logging_steps": 200, "max_steps": 868560, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.661507840870656e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }