diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,30588 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.0, + "eval_steps": 500, + "global_step": 868560, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004605323754259924, + "grad_norm": 2.7894906997680664, + "learning_rate": 1.3746891406465874e-07, + "loss": 0.6038, + "step": 200 + }, + { + "epoch": 0.009210647508519848, + "grad_norm": 3.5113091468811035, + "learning_rate": 2.756286266924565e-07, + "loss": 0.5979, + "step": 400 + }, + { + "epoch": 0.013815971262779773, + "grad_norm": 3.262929677963257, + "learning_rate": 4.137883393202542e-07, + "loss": 0.6018, + "step": 600 + }, + { + "epoch": 0.018421295017039697, + "grad_norm": 2.736161231994629, + "learning_rate": 5.51948051948052e-07, + "loss": 0.5985, + "step": 800 + }, + { + "epoch": 0.023026618771299624, + "grad_norm": 3.1657299995422363, + "learning_rate": 6.901077645758497e-07, + "loss": 0.6069, + "step": 1000 + }, + { + "epoch": 0.027631942525559547, + "grad_norm": 2.5964059829711914, + "learning_rate": 8.282674772036474e-07, + "loss": 0.6026, + "step": 1200 + }, + { + "epoch": 0.032237266279819474, + "grad_norm": 2.9468190670013428, + "learning_rate": 9.664271898314452e-07, + "loss": 0.6118, + "step": 1400 + }, + { + "epoch": 0.036842590034079394, + "grad_norm": 3.6232540607452393, + "learning_rate": 1.1045869024592427e-06, + "loss": 0.5998, + "step": 1600 + }, + { + "epoch": 0.04144791378833932, + "grad_norm": 2.8659310340881348, + "learning_rate": 1.2427466150870407e-06, + "loss": 0.6092, + "step": 1800 + }, + { + "epoch": 0.04605323754259925, + "grad_norm": 3.2160274982452393, + "learning_rate": 1.3809063277148384e-06, + "loss": 0.6177, + "step": 2000 + }, + { + "epoch": 0.05065856129685917, + "grad_norm": 2.949619770050049, + "learning_rate": 1.519066040342636e-06, + "loss": 0.6005, + "step": 2200 + }, + { + "epoch": 0.055263885051119094, + "grad_norm": 3.6632375717163086, + "learning_rate": 1.6572257529704338e-06, + "loss": 0.6053, + "step": 2400 + }, + { + "epoch": 0.05986920880537902, + "grad_norm": 3.9618542194366455, + "learning_rate": 1.7953854655982317e-06, + "loss": 0.6257, + "step": 2600 + }, + { + "epoch": 0.06447453255963895, + "grad_norm": 3.0995821952819824, + "learning_rate": 1.9335451782260294e-06, + "loss": 0.6203, + "step": 2800 + }, + { + "epoch": 0.06907985631389886, + "grad_norm": 3.9655020236968994, + "learning_rate": 2.071704890853827e-06, + "loss": 0.6165, + "step": 3000 + }, + { + "epoch": 0.07368518006815879, + "grad_norm": 2.8039631843566895, + "learning_rate": 2.209864603481625e-06, + "loss": 0.6196, + "step": 3200 + }, + { + "epoch": 0.07829050382241871, + "grad_norm": 3.1764214038848877, + "learning_rate": 2.3473335175462836e-06, + "loss": 0.61, + "step": 3400 + }, + { + "epoch": 0.08289582757667864, + "grad_norm": 3.0694644451141357, + "learning_rate": 2.4854932301740813e-06, + "loss": 0.6113, + "step": 3600 + }, + { + "epoch": 0.08750115133093857, + "grad_norm": 2.9657208919525146, + "learning_rate": 2.623652942801879e-06, + "loss": 0.6129, + "step": 3800 + }, + { + "epoch": 0.0921064750851985, + "grad_norm": 3.0443339347839355, + "learning_rate": 2.7618126554296768e-06, + "loss": 0.6021, + "step": 4000 + }, + { + "epoch": 0.09671179883945841, + "grad_norm": 3.029796838760376, + "learning_rate": 2.8992815694943355e-06, + "loss": 0.6093, + "step": 4200 + }, + { + "epoch": 0.10131712259371833, + "grad_norm": 3.724391222000122, + "learning_rate": 3.0374412821221332e-06, + "loss": 0.61, + "step": 4400 + }, + { + "epoch": 0.10592244634797826, + "grad_norm": 3.0700559616088867, + "learning_rate": 3.175600994749931e-06, + "loss": 0.612, + "step": 4600 + }, + { + "epoch": 0.11052777010223819, + "grad_norm": 2.6112992763519287, + "learning_rate": 3.313760707377729e-06, + "loss": 0.6075, + "step": 4800 + }, + { + "epoch": 0.11513309385649811, + "grad_norm": 3.0309786796569824, + "learning_rate": 3.4519204200055268e-06, + "loss": 0.6075, + "step": 5000 + }, + { + "epoch": 0.11973841761075804, + "grad_norm": 4.39963960647583, + "learning_rate": 3.590080132633324e-06, + "loss": 0.616, + "step": 5200 + }, + { + "epoch": 0.12434374136501795, + "grad_norm": 3.1912529468536377, + "learning_rate": 3.7282398452611218e-06, + "loss": 0.607, + "step": 5400 + }, + { + "epoch": 0.1289490651192779, + "grad_norm": 3.3386693000793457, + "learning_rate": 3.8663995578889195e-06, + "loss": 0.6155, + "step": 5600 + }, + { + "epoch": 0.13355438887353782, + "grad_norm": 3.854520082473755, + "learning_rate": 4.004559270516718e-06, + "loss": 0.6154, + "step": 5800 + }, + { + "epoch": 0.13815971262779772, + "grad_norm": 3.730701208114624, + "learning_rate": 4.142028184581376e-06, + "loss": 0.6088, + "step": 6000 + }, + { + "epoch": 0.14276503638205765, + "grad_norm": 4.679981231689453, + "learning_rate": 4.2801878972091745e-06, + "loss": 0.6186, + "step": 6200 + }, + { + "epoch": 0.14737036013631757, + "grad_norm": 3.9258837699890137, + "learning_rate": 4.418347609836971e-06, + "loss": 0.6166, + "step": 6400 + }, + { + "epoch": 0.1519756838905775, + "grad_norm": 3.088724136352539, + "learning_rate": 4.556507322464769e-06, + "loss": 0.6173, + "step": 6600 + }, + { + "epoch": 0.15658100764483743, + "grad_norm": 3.408620595932007, + "learning_rate": 4.694667035092567e-06, + "loss": 0.6005, + "step": 6800 + }, + { + "epoch": 0.16118633139909735, + "grad_norm": 3.2880802154541016, + "learning_rate": 4.8328267477203645e-06, + "loss": 0.614, + "step": 7000 + }, + { + "epoch": 0.16579165515335728, + "grad_norm": 2.6477365493774414, + "learning_rate": 4.970986460348163e-06, + "loss": 0.6111, + "step": 7200 + }, + { + "epoch": 0.1703969789076172, + "grad_norm": 3.5173377990722656, + "learning_rate": 5.10914617297596e-06, + "loss": 0.6173, + "step": 7400 + }, + { + "epoch": 0.17500230266187713, + "grad_norm": 2.5049519538879395, + "learning_rate": 5.247305885603758e-06, + "loss": 0.6016, + "step": 7600 + }, + { + "epoch": 0.17960762641613706, + "grad_norm": 4.8469157218933105, + "learning_rate": 5.385465598231556e-06, + "loss": 0.6142, + "step": 7800 + }, + { + "epoch": 0.184212950170397, + "grad_norm": 3.006009101867676, + "learning_rate": 5.5236253108593535e-06, + "loss": 0.6147, + "step": 8000 + }, + { + "epoch": 0.18881827392465692, + "grad_norm": 3.286186456680298, + "learning_rate": 5.661785023487152e-06, + "loss": 0.6037, + "step": 8200 + }, + { + "epoch": 0.19342359767891681, + "grad_norm": 2.788668394088745, + "learning_rate": 5.799944736114949e-06, + "loss": 0.6148, + "step": 8400 + }, + { + "epoch": 0.19802892143317674, + "grad_norm": 3.9425911903381348, + "learning_rate": 5.938104448742747e-06, + "loss": 0.6246, + "step": 8600 + }, + { + "epoch": 0.20263424518743667, + "grad_norm": 3.2533152103424072, + "learning_rate": 6.076264161370544e-06, + "loss": 0.6109, + "step": 8800 + }, + { + "epoch": 0.2072395689416966, + "grad_norm": 3.5510547161102295, + "learning_rate": 6.214423873998342e-06, + "loss": 0.6226, + "step": 9000 + }, + { + "epoch": 0.21184489269595652, + "grad_norm": 2.908961057662964, + "learning_rate": 6.35258358662614e-06, + "loss": 0.6152, + "step": 9200 + }, + { + "epoch": 0.21645021645021645, + "grad_norm": 3.2206296920776367, + "learning_rate": 6.490743299253938e-06, + "loss": 0.6119, + "step": 9400 + }, + { + "epoch": 0.22105554020447638, + "grad_norm": 2.8051042556762695, + "learning_rate": 6.628212213318597e-06, + "loss": 0.6112, + "step": 9600 + }, + { + "epoch": 0.2256608639587363, + "grad_norm": 3.4578733444213867, + "learning_rate": 6.766371925946394e-06, + "loss": 0.5983, + "step": 9800 + }, + { + "epoch": 0.23026618771299623, + "grad_norm": 3.29227614402771, + "learning_rate": 6.904531638574192e-06, + "loss": 0.6135, + "step": 10000 + }, + { + "epoch": 0.23487151146725616, + "grad_norm": 3.175769567489624, + "learning_rate": 7.04269135120199e-06, + "loss": 0.604, + "step": 10200 + }, + { + "epoch": 0.23947683522151608, + "grad_norm": 2.6275556087493896, + "learning_rate": 7.1808510638297875e-06, + "loss": 0.6145, + "step": 10400 + }, + { + "epoch": 0.244082158975776, + "grad_norm": 3.551853895187378, + "learning_rate": 7.319010776457585e-06, + "loss": 0.6302, + "step": 10600 + }, + { + "epoch": 0.2486874827300359, + "grad_norm": 2.5962212085723877, + "learning_rate": 7.457170489085383e-06, + "loss": 0.6079, + "step": 10800 + }, + { + "epoch": 0.25329280648429586, + "grad_norm": 3.54152774810791, + "learning_rate": 7.59533020171318e-06, + "loss": 0.626, + "step": 11000 + }, + { + "epoch": 0.2578981302385558, + "grad_norm": 3.380774974822998, + "learning_rate": 7.733489914340978e-06, + "loss": 0.6205, + "step": 11200 + }, + { + "epoch": 0.2625034539928157, + "grad_norm": 3.3264219760894775, + "learning_rate": 7.871649626968777e-06, + "loss": 0.6121, + "step": 11400 + }, + { + "epoch": 0.26710877774707564, + "grad_norm": 2.831803798675537, + "learning_rate": 8.009809339596573e-06, + "loss": 0.5994, + "step": 11600 + }, + { + "epoch": 0.27171410150133557, + "grad_norm": 3.301412582397461, + "learning_rate": 8.147969052224371e-06, + "loss": 0.6236, + "step": 11800 + }, + { + "epoch": 0.27631942525559544, + "grad_norm": 3.9409468173980713, + "learning_rate": 8.28612876485217e-06, + "loss": 0.6208, + "step": 12000 + }, + { + "epoch": 0.28092474900985537, + "grad_norm": 2.723642349243164, + "learning_rate": 8.424288477479967e-06, + "loss": 0.6137, + "step": 12200 + }, + { + "epoch": 0.2855300727641153, + "grad_norm": 3.9278056621551514, + "learning_rate": 8.562448190107766e-06, + "loss": 0.6156, + "step": 12400 + }, + { + "epoch": 0.2901353965183752, + "grad_norm": 3.5097107887268066, + "learning_rate": 8.700607902735564e-06, + "loss": 0.618, + "step": 12600 + }, + { + "epoch": 0.29474072027263515, + "grad_norm": 3.150387763977051, + "learning_rate": 8.83876761536336e-06, + "loss": 0.6165, + "step": 12800 + }, + { + "epoch": 0.2993460440268951, + "grad_norm": 3.5158212184906006, + "learning_rate": 8.976927327991158e-06, + "loss": 0.6156, + "step": 13000 + }, + { + "epoch": 0.303951367781155, + "grad_norm": 3.6976168155670166, + "learning_rate": 9.115087040618956e-06, + "loss": 0.611, + "step": 13200 + }, + { + "epoch": 0.30855669153541493, + "grad_norm": 2.7496871948242188, + "learning_rate": 9.253246753246755e-06, + "loss": 0.6202, + "step": 13400 + }, + { + "epoch": 0.31316201528967486, + "grad_norm": 2.973073959350586, + "learning_rate": 9.391406465874553e-06, + "loss": 0.6151, + "step": 13600 + }, + { + "epoch": 0.3177673390439348, + "grad_norm": 4.366604804992676, + "learning_rate": 9.529566178502349e-06, + "loss": 0.6264, + "step": 13800 + }, + { + "epoch": 0.3223726627981947, + "grad_norm": 2.6890974044799805, + "learning_rate": 9.667725891130147e-06, + "loss": 0.6149, + "step": 14000 + }, + { + "epoch": 0.32697798655245464, + "grad_norm": 2.9724693298339844, + "learning_rate": 9.805885603757944e-06, + "loss": 0.6301, + "step": 14200 + }, + { + "epoch": 0.33158331030671456, + "grad_norm": 2.7939202785491943, + "learning_rate": 9.944045316385742e-06, + "loss": 0.6275, + "step": 14400 + }, + { + "epoch": 0.3361886340609745, + "grad_norm": 3.3362252712249756, + "learning_rate": 1.008220502901354e-05, + "loss": 0.6207, + "step": 14600 + }, + { + "epoch": 0.3407939578152344, + "grad_norm": 3.060162305831909, + "learning_rate": 1.0220364741641336e-05, + "loss": 0.6169, + "step": 14800 + }, + { + "epoch": 0.34539928156949434, + "grad_norm": 2.850121021270752, + "learning_rate": 1.0358524454269135e-05, + "loss": 0.6269, + "step": 15000 + }, + { + "epoch": 0.35000460532375427, + "grad_norm": 3.7358944416046143, + "learning_rate": 1.0496684166896933e-05, + "loss": 0.6223, + "step": 15200 + }, + { + "epoch": 0.3546099290780142, + "grad_norm": 3.362393379211426, + "learning_rate": 1.063484387952473e-05, + "loss": 0.6271, + "step": 15400 + }, + { + "epoch": 0.3592152528322741, + "grad_norm": 2.7046902179718018, + "learning_rate": 1.0773003592152529e-05, + "loss": 0.6194, + "step": 15600 + }, + { + "epoch": 0.36382057658653405, + "grad_norm": 3.0208888053894043, + "learning_rate": 1.0911163304780325e-05, + "loss": 0.6249, + "step": 15800 + }, + { + "epoch": 0.368425900340794, + "grad_norm": 3.113605260848999, + "learning_rate": 1.1049323017408124e-05, + "loss": 0.6144, + "step": 16000 + }, + { + "epoch": 0.3730312240950539, + "grad_norm": 3.3108925819396973, + "learning_rate": 1.1186791931472782e-05, + "loss": 0.6209, + "step": 16200 + }, + { + "epoch": 0.37763654784931383, + "grad_norm": 2.8616628646850586, + "learning_rate": 1.132495164410058e-05, + "loss": 0.6106, + "step": 16400 + }, + { + "epoch": 0.38224187160357376, + "grad_norm": 3.016742706298828, + "learning_rate": 1.1463111356728379e-05, + "loss": 0.6179, + "step": 16600 + }, + { + "epoch": 0.38684719535783363, + "grad_norm": 3.3050427436828613, + "learning_rate": 1.1601271069356177e-05, + "loss": 0.6225, + "step": 16800 + }, + { + "epoch": 0.39145251911209356, + "grad_norm": 2.2596371173858643, + "learning_rate": 1.1739430781983975e-05, + "loss": 0.6282, + "step": 17000 + }, + { + "epoch": 0.3960578428663535, + "grad_norm": 3.506793260574341, + "learning_rate": 1.1877590494611771e-05, + "loss": 0.6213, + "step": 17200 + }, + { + "epoch": 0.4006631666206134, + "grad_norm": 3.41317081451416, + "learning_rate": 1.2015059408676432e-05, + "loss": 0.6143, + "step": 17400 + }, + { + "epoch": 0.40526849037487334, + "grad_norm": 2.875344753265381, + "learning_rate": 1.2153219121304228e-05, + "loss": 0.6113, + "step": 17600 + }, + { + "epoch": 0.40987381412913326, + "grad_norm": 2.982757568359375, + "learning_rate": 1.2291378833932026e-05, + "loss": 0.6151, + "step": 17800 + }, + { + "epoch": 0.4144791378833932, + "grad_norm": 3.633155107498169, + "learning_rate": 1.2429538546559823e-05, + "loss": 0.6338, + "step": 18000 + }, + { + "epoch": 0.4190844616376531, + "grad_norm": 2.6880545616149902, + "learning_rate": 1.2567698259187621e-05, + "loss": 0.6159, + "step": 18200 + }, + { + "epoch": 0.42368978539191304, + "grad_norm": 2.981095790863037, + "learning_rate": 1.2705857971815419e-05, + "loss": 0.6312, + "step": 18400 + }, + { + "epoch": 0.42829510914617297, + "grad_norm": 2.658784866333008, + "learning_rate": 1.2844017684443215e-05, + "loss": 0.6183, + "step": 18600 + }, + { + "epoch": 0.4329004329004329, + "grad_norm": 2.975275754928589, + "learning_rate": 1.2982177397071014e-05, + "loss": 0.6246, + "step": 18800 + }, + { + "epoch": 0.4375057566546928, + "grad_norm": 2.6521835327148438, + "learning_rate": 1.3120337109698812e-05, + "loss": 0.6213, + "step": 19000 + }, + { + "epoch": 0.44211108040895275, + "grad_norm": 3.5868492126464844, + "learning_rate": 1.325849682232661e-05, + "loss": 0.6221, + "step": 19200 + }, + { + "epoch": 0.4467164041632127, + "grad_norm": 2.928968667984009, + "learning_rate": 1.3396656534954408e-05, + "loss": 0.6291, + "step": 19400 + }, + { + "epoch": 0.4513217279174726, + "grad_norm": 2.7386858463287354, + "learning_rate": 1.3534816247582204e-05, + "loss": 0.6154, + "step": 19600 + }, + { + "epoch": 0.45592705167173253, + "grad_norm": 2.974900245666504, + "learning_rate": 1.3672975960210003e-05, + "loss": 0.6147, + "step": 19800 + }, + { + "epoch": 0.46053237542599246, + "grad_norm": 3.2995572090148926, + "learning_rate": 1.38111356728378e-05, + "loss": 0.6335, + "step": 20000 + }, + { + "epoch": 0.4651376991802524, + "grad_norm": 2.7065649032592773, + "learning_rate": 1.3949295385465599e-05, + "loss": 0.6236, + "step": 20200 + }, + { + "epoch": 0.4697430229345123, + "grad_norm": 3.2409005165100098, + "learning_rate": 1.4086764299530258e-05, + "loss": 0.6245, + "step": 20400 + }, + { + "epoch": 0.47434834668877224, + "grad_norm": 3.296063184738159, + "learning_rate": 1.4224924012158056e-05, + "loss": 0.6198, + "step": 20600 + }, + { + "epoch": 0.47895367044303216, + "grad_norm": 3.3391969203948975, + "learning_rate": 1.4363083724785854e-05, + "loss": 0.6339, + "step": 20800 + }, + { + "epoch": 0.4835589941972921, + "grad_norm": 3.536306858062744, + "learning_rate": 1.450124343741365e-05, + "loss": 0.6264, + "step": 21000 + }, + { + "epoch": 0.488164317951552, + "grad_norm": 2.2294647693634033, + "learning_rate": 1.4639403150041448e-05, + "loss": 0.6182, + "step": 21200 + }, + { + "epoch": 0.49276964170581194, + "grad_norm": 2.4579017162323, + "learning_rate": 1.4777562862669247e-05, + "loss": 0.6287, + "step": 21400 + }, + { + "epoch": 0.4973749654600718, + "grad_norm": 3.0496151447296143, + "learning_rate": 1.4915722575297045e-05, + "loss": 0.6272, + "step": 21600 + }, + { + "epoch": 0.5019802892143318, + "grad_norm": 2.970477819442749, + "learning_rate": 1.5053882287924841e-05, + "loss": 0.6364, + "step": 21800 + }, + { + "epoch": 0.5065856129685917, + "grad_norm": 3.1261672973632812, + "learning_rate": 1.5192042000552638e-05, + "loss": 0.6387, + "step": 22000 + }, + { + "epoch": 0.5111909367228517, + "grad_norm": 3.239424228668213, + "learning_rate": 1.5330201713180437e-05, + "loss": 0.6235, + "step": 22200 + }, + { + "epoch": 0.5157962604771116, + "grad_norm": 3.2637672424316406, + "learning_rate": 1.5468361425808236e-05, + "loss": 0.6302, + "step": 22400 + }, + { + "epoch": 0.5204015842313715, + "grad_norm": 3.604191541671753, + "learning_rate": 1.560652113843603e-05, + "loss": 0.6193, + "step": 22600 + }, + { + "epoch": 0.5250069079856314, + "grad_norm": 3.1682140827178955, + "learning_rate": 1.574468085106383e-05, + "loss": 0.6231, + "step": 22800 + }, + { + "epoch": 0.5296122317398914, + "grad_norm": 2.781855583190918, + "learning_rate": 1.5882840563691627e-05, + "loss": 0.6283, + "step": 23000 + }, + { + "epoch": 0.5342175554941513, + "grad_norm": 2.9197840690612793, + "learning_rate": 1.6021000276319425e-05, + "loss": 0.6377, + "step": 23200 + }, + { + "epoch": 0.5388228792484112, + "grad_norm": 3.9863662719726562, + "learning_rate": 1.6159159988947223e-05, + "loss": 0.6231, + "step": 23400 + }, + { + "epoch": 0.5434282030026711, + "grad_norm": 2.8455443382263184, + "learning_rate": 1.629731970157502e-05, + "loss": 0.6227, + "step": 23600 + }, + { + "epoch": 0.5480335267569311, + "grad_norm": 2.6754982471466064, + "learning_rate": 1.643547941420282e-05, + "loss": 0.6314, + "step": 23800 + }, + { + "epoch": 0.5526388505111909, + "grad_norm": 3.931835651397705, + "learning_rate": 1.6573639126830617e-05, + "loss": 0.6298, + "step": 24000 + }, + { + "epoch": 0.5572441742654508, + "grad_norm": 3.3282952308654785, + "learning_rate": 1.6711798839458415e-05, + "loss": 0.6149, + "step": 24200 + }, + { + "epoch": 0.5618494980197107, + "grad_norm": 3.1311309337615967, + "learning_rate": 1.6849958552086214e-05, + "loss": 0.6408, + "step": 24400 + }, + { + "epoch": 0.5664548217739707, + "grad_norm": 4.448089122772217, + "learning_rate": 1.698811826471401e-05, + "loss": 0.6316, + "step": 24600 + }, + { + "epoch": 0.5710601455282306, + "grad_norm": 3.4819765090942383, + "learning_rate": 1.7126277977341806e-05, + "loss": 0.6361, + "step": 24800 + }, + { + "epoch": 0.5756654692824905, + "grad_norm": 3.762315034866333, + "learning_rate": 1.7264437689969605e-05, + "loss": 0.6267, + "step": 25000 + }, + { + "epoch": 0.5802707930367504, + "grad_norm": 3.5128722190856934, + "learning_rate": 1.7402597402597403e-05, + "loss": 0.6363, + "step": 25200 + }, + { + "epoch": 0.5848761167910104, + "grad_norm": 2.8912715911865234, + "learning_rate": 1.75407571152252e-05, + "loss": 0.6381, + "step": 25400 + }, + { + "epoch": 0.5894814405452703, + "grad_norm": 3.6335270404815674, + "learning_rate": 1.767753523072672e-05, + "loss": 0.6278, + "step": 25600 + }, + { + "epoch": 0.5940867642995302, + "grad_norm": 3.1860055923461914, + "learning_rate": 1.781569494335452e-05, + "loss": 0.6321, + "step": 25800 + }, + { + "epoch": 0.5986920880537902, + "grad_norm": 2.7040374279022217, + "learning_rate": 1.7953854655982317e-05, + "loss": 0.636, + "step": 26000 + }, + { + "epoch": 0.6032974118080501, + "grad_norm": 3.322178840637207, + "learning_rate": 1.8092014368610115e-05, + "loss": 0.636, + "step": 26200 + }, + { + "epoch": 0.60790273556231, + "grad_norm": 3.2014033794403076, + "learning_rate": 1.8230174081237913e-05, + "loss": 0.6369, + "step": 26400 + }, + { + "epoch": 0.6125080593165699, + "grad_norm": 3.6202449798583984, + "learning_rate": 1.836833379386571e-05, + "loss": 0.6269, + "step": 26600 + }, + { + "epoch": 0.6171133830708299, + "grad_norm": 3.62076735496521, + "learning_rate": 1.850649350649351e-05, + "loss": 0.6307, + "step": 26800 + }, + { + "epoch": 0.6217187068250898, + "grad_norm": 2.5848195552825928, + "learning_rate": 1.8644653219121307e-05, + "loss": 0.6311, + "step": 27000 + }, + { + "epoch": 0.6263240305793497, + "grad_norm": 3.199153184890747, + "learning_rate": 1.8782812931749105e-05, + "loss": 0.639, + "step": 27200 + }, + { + "epoch": 0.6309293543336096, + "grad_norm": 2.6851189136505127, + "learning_rate": 1.89209726443769e-05, + "loss": 0.6422, + "step": 27400 + }, + { + "epoch": 0.6355346780878696, + "grad_norm": 3.650444507598877, + "learning_rate": 1.9059132357004698e-05, + "loss": 0.6412, + "step": 27600 + }, + { + "epoch": 0.6401400018421295, + "grad_norm": 2.841759204864502, + "learning_rate": 1.9197292069632496e-05, + "loss": 0.6419, + "step": 27800 + }, + { + "epoch": 0.6447453255963894, + "grad_norm": 3.3673157691955566, + "learning_rate": 1.9335451782260294e-05, + "loss": 0.6237, + "step": 28000 + }, + { + "epoch": 0.6493506493506493, + "grad_norm": 3.8618879318237305, + "learning_rate": 1.9473611494888093e-05, + "loss": 0.6369, + "step": 28200 + }, + { + "epoch": 0.6539559731049093, + "grad_norm": 2.8501274585723877, + "learning_rate": 1.9611771207515887e-05, + "loss": 0.6509, + "step": 28400 + }, + { + "epoch": 0.6585612968591692, + "grad_norm": 2.839573621749878, + "learning_rate": 1.9749930920143686e-05, + "loss": 0.6326, + "step": 28600 + }, + { + "epoch": 0.6631666206134291, + "grad_norm": 2.609498977661133, + "learning_rate": 1.9888090632771484e-05, + "loss": 0.6436, + "step": 28800 + }, + { + "epoch": 0.667771944367689, + "grad_norm": 3.8914794921875, + "learning_rate": 2.0026250345399282e-05, + "loss": 0.6317, + "step": 29000 + }, + { + "epoch": 0.672377268121949, + "grad_norm": 3.074779987335205, + "learning_rate": 2.016441005802708e-05, + "loss": 0.6473, + "step": 29200 + }, + { + "epoch": 0.6769825918762089, + "grad_norm": 3.1600515842437744, + "learning_rate": 2.0302569770654875e-05, + "loss": 0.6418, + "step": 29400 + }, + { + "epoch": 0.6815879156304688, + "grad_norm": 3.190645694732666, + "learning_rate": 2.0440729483282673e-05, + "loss": 0.6289, + "step": 29600 + }, + { + "epoch": 0.6861932393847288, + "grad_norm": 3.1118228435516357, + "learning_rate": 2.057888919591047e-05, + "loss": 0.6495, + "step": 29800 + }, + { + "epoch": 0.6907985631389887, + "grad_norm": 3.027270555496216, + "learning_rate": 2.071635810997513e-05, + "loss": 0.6393, + "step": 30000 + }, + { + "epoch": 0.6954038868932486, + "grad_norm": 2.9312691688537598, + "learning_rate": 2.085451782260293e-05, + "loss": 0.645, + "step": 30200 + }, + { + "epoch": 0.7000092106475085, + "grad_norm": 3.4496731758117676, + "learning_rate": 2.0992677535230728e-05, + "loss": 0.6522, + "step": 30400 + }, + { + "epoch": 0.7046145344017685, + "grad_norm": 3.7810165882110596, + "learning_rate": 2.1130837247858526e-05, + "loss": 0.653, + "step": 30600 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 2.8673808574676514, + "learning_rate": 2.126899696048632e-05, + "loss": 0.6249, + "step": 30800 + }, + { + "epoch": 0.7138251819102883, + "grad_norm": 2.933802843093872, + "learning_rate": 2.140715667311412e-05, + "loss": 0.6381, + "step": 31000 + }, + { + "epoch": 0.7184305056645482, + "grad_norm": 2.904672861099243, + "learning_rate": 2.1545316385741917e-05, + "loss": 0.643, + "step": 31200 + }, + { + "epoch": 0.7230358294188082, + "grad_norm": 3.2614898681640625, + "learning_rate": 2.1683476098369715e-05, + "loss": 0.6401, + "step": 31400 + }, + { + "epoch": 0.7276411531730681, + "grad_norm": 4.081573963165283, + "learning_rate": 2.1821635810997513e-05, + "loss": 0.6401, + "step": 31600 + }, + { + "epoch": 0.732246476927328, + "grad_norm": 2.736985206604004, + "learning_rate": 2.195979552362531e-05, + "loss": 0.6373, + "step": 31800 + }, + { + "epoch": 0.736851800681588, + "grad_norm": 3.4728550910949707, + "learning_rate": 2.209795523625311e-05, + "loss": 0.6488, + "step": 32000 + }, + { + "epoch": 0.7414571244358479, + "grad_norm": 2.758512496948242, + "learning_rate": 2.2236114948880908e-05, + "loss": 0.6492, + "step": 32200 + }, + { + "epoch": 0.7460624481901078, + "grad_norm": 3.4446616172790527, + "learning_rate": 2.2374274661508706e-05, + "loss": 0.6468, + "step": 32400 + }, + { + "epoch": 0.7506677719443677, + "grad_norm": 3.721409320831299, + "learning_rate": 2.2512434374136504e-05, + "loss": 0.6364, + "step": 32600 + }, + { + "epoch": 0.7552730956986277, + "grad_norm": 2.9497594833374023, + "learning_rate": 2.26505940867643e-05, + "loss": 0.6337, + "step": 32800 + }, + { + "epoch": 0.7598784194528876, + "grad_norm": 4.462852954864502, + "learning_rate": 2.2788753799392097e-05, + "loss": 0.6592, + "step": 33000 + }, + { + "epoch": 0.7644837432071475, + "grad_norm": 2.867154836654663, + "learning_rate": 2.2926913512019895e-05, + "loss": 0.6472, + "step": 33200 + }, + { + "epoch": 0.7690890669614074, + "grad_norm": 3.1840884685516357, + "learning_rate": 2.3065073224647693e-05, + "loss": 0.6454, + "step": 33400 + }, + { + "epoch": 0.7736943907156673, + "grad_norm": 2.719569206237793, + "learning_rate": 2.320323293727549e-05, + "loss": 0.6565, + "step": 33600 + }, + { + "epoch": 0.7782997144699272, + "grad_norm": 3.1424355506896973, + "learning_rate": 2.334139264990329e-05, + "loss": 0.6333, + "step": 33800 + }, + { + "epoch": 0.7829050382241871, + "grad_norm": 3.147237539291382, + "learning_rate": 2.347886156396795e-05, + "loss": 0.6462, + "step": 34000 + }, + { + "epoch": 0.787510361978447, + "grad_norm": 3.6933629512786865, + "learning_rate": 2.3617021276595744e-05, + "loss": 0.642, + "step": 34200 + }, + { + "epoch": 0.792115685732707, + "grad_norm": 3.1719932556152344, + "learning_rate": 2.3755180989223543e-05, + "loss": 0.6591, + "step": 34400 + }, + { + "epoch": 0.7967210094869669, + "grad_norm": 4.069468021392822, + "learning_rate": 2.389334070185134e-05, + "loss": 0.6558, + "step": 34600 + }, + { + "epoch": 0.8013263332412268, + "grad_norm": 3.7244644165039062, + "learning_rate": 2.403150041447914e-05, + "loss": 0.6586, + "step": 34800 + }, + { + "epoch": 0.8059316569954867, + "grad_norm": 2.9359726905822754, + "learning_rate": 2.41689693285438e-05, + "loss": 0.6425, + "step": 35000 + }, + { + "epoch": 0.8105369807497467, + "grad_norm": 3.2560675144195557, + "learning_rate": 2.4307129041171597e-05, + "loss": 0.6535, + "step": 35200 + }, + { + "epoch": 0.8151423045040066, + "grad_norm": 3.1228187084198, + "learning_rate": 2.4445288753799396e-05, + "loss": 0.6346, + "step": 35400 + }, + { + "epoch": 0.8197476282582665, + "grad_norm": 2.620872974395752, + "learning_rate": 2.458344846642719e-05, + "loss": 0.6412, + "step": 35600 + }, + { + "epoch": 0.8243529520125265, + "grad_norm": 3.165461540222168, + "learning_rate": 2.472160817905499e-05, + "loss": 0.643, + "step": 35800 + }, + { + "epoch": 0.8289582757667864, + "grad_norm": 3.4835033416748047, + "learning_rate": 2.4859767891682787e-05, + "loss": 0.6503, + "step": 36000 + }, + { + "epoch": 0.8335635995210463, + "grad_norm": 3.1745879650115967, + "learning_rate": 2.4997927604310585e-05, + "loss": 0.6572, + "step": 36200 + }, + { + "epoch": 0.8381689232753062, + "grad_norm": 2.9764230251312256, + "learning_rate": 2.513608731693838e-05, + "loss": 0.6426, + "step": 36400 + }, + { + "epoch": 0.8427742470295662, + "grad_norm": 2.6247684955596924, + "learning_rate": 2.5274247029566178e-05, + "loss": 0.6483, + "step": 36600 + }, + { + "epoch": 0.8473795707838261, + "grad_norm": 2.999863386154175, + "learning_rate": 2.5412406742193976e-05, + "loss": 0.6487, + "step": 36800 + }, + { + "epoch": 0.851984894538086, + "grad_norm": 6.160555839538574, + "learning_rate": 2.5550566454821774e-05, + "loss": 0.6596, + "step": 37000 + }, + { + "epoch": 0.8565902182923459, + "grad_norm": 3.588473320007324, + "learning_rate": 2.5688726167449572e-05, + "loss": 0.6562, + "step": 37200 + }, + { + "epoch": 0.8611955420466059, + "grad_norm": 2.841158628463745, + "learning_rate": 2.582688588007737e-05, + "loss": 0.6634, + "step": 37400 + }, + { + "epoch": 0.8658008658008658, + "grad_norm": 3.2244651317596436, + "learning_rate": 2.5965045592705168e-05, + "loss": 0.6436, + "step": 37600 + }, + { + "epoch": 0.8704061895551257, + "grad_norm": 2.9466798305511475, + "learning_rate": 2.6103205305332963e-05, + "loss": 0.6551, + "step": 37800 + }, + { + "epoch": 0.8750115133093856, + "grad_norm": 3.141784191131592, + "learning_rate": 2.624136501796076e-05, + "loss": 0.6607, + "step": 38000 + }, + { + "epoch": 0.8796168370636456, + "grad_norm": 3.590473175048828, + "learning_rate": 2.637952473058856e-05, + "loss": 0.6604, + "step": 38200 + }, + { + "epoch": 0.8842221608179055, + "grad_norm": 3.251824378967285, + "learning_rate": 2.6517684443216357e-05, + "loss": 0.6656, + "step": 38400 + }, + { + "epoch": 0.8888274845721654, + "grad_norm": 2.8062353134155273, + "learning_rate": 2.6655844155844156e-05, + "loss": 0.6494, + "step": 38600 + }, + { + "epoch": 0.8934328083264254, + "grad_norm": 2.897364616394043, + "learning_rate": 2.6794003868471954e-05, + "loss": 0.6474, + "step": 38800 + }, + { + "epoch": 0.8980381320806853, + "grad_norm": 3.172089099884033, + "learning_rate": 2.6932163581099752e-05, + "loss": 0.6601, + "step": 39000 + }, + { + "epoch": 0.9026434558349452, + "grad_norm": 3.3217780590057373, + "learning_rate": 2.707032329372755e-05, + "loss": 0.6426, + "step": 39200 + }, + { + "epoch": 0.9072487795892051, + "grad_norm": 3.3799307346343994, + "learning_rate": 2.7208483006355348e-05, + "loss": 0.6614, + "step": 39400 + }, + { + "epoch": 0.9118541033434651, + "grad_norm": 3.2434334754943848, + "learning_rate": 2.7346642718983146e-05, + "loss": 0.6546, + "step": 39600 + }, + { + "epoch": 0.916459427097725, + "grad_norm": 3.017298936843872, + "learning_rate": 2.7483420834484666e-05, + "loss": 0.6403, + "step": 39800 + }, + { + "epoch": 0.9210647508519849, + "grad_norm": 2.9930150508880615, + "learning_rate": 2.7621580547112464e-05, + "loss": 0.6527, + "step": 40000 + }, + { + "epoch": 0.9256700746062448, + "grad_norm": 2.474975347518921, + "learning_rate": 2.7759740259740262e-05, + "loss": 0.6559, + "step": 40200 + }, + { + "epoch": 0.9302753983605048, + "grad_norm": 3.3437626361846924, + "learning_rate": 2.789789997236806e-05, + "loss": 0.6479, + "step": 40400 + }, + { + "epoch": 0.9348807221147647, + "grad_norm": 3.64098858833313, + "learning_rate": 2.8036059684995855e-05, + "loss": 0.6671, + "step": 40600 + }, + { + "epoch": 0.9394860458690246, + "grad_norm": 3.154405355453491, + "learning_rate": 2.8174219397623653e-05, + "loss": 0.6498, + "step": 40800 + }, + { + "epoch": 0.9440913696232845, + "grad_norm": 3.1763336658477783, + "learning_rate": 2.831237911025145e-05, + "loss": 0.6581, + "step": 41000 + }, + { + "epoch": 0.9486966933775445, + "grad_norm": 3.8115978240966797, + "learning_rate": 2.845053882287925e-05, + "loss": 0.6544, + "step": 41200 + }, + { + "epoch": 0.9533020171318044, + "grad_norm": 3.476557731628418, + "learning_rate": 2.8588698535507047e-05, + "loss": 0.6607, + "step": 41400 + }, + { + "epoch": 0.9579073408860643, + "grad_norm": 3.1274263858795166, + "learning_rate": 2.8726858248134845e-05, + "loss": 0.6557, + "step": 41600 + }, + { + "epoch": 0.9625126646403243, + "grad_norm": 3.3021888732910156, + "learning_rate": 2.8865017960762644e-05, + "loss": 0.6529, + "step": 41800 + }, + { + "epoch": 0.9671179883945842, + "grad_norm": 2.258256435394287, + "learning_rate": 2.9003177673390442e-05, + "loss": 0.6537, + "step": 42000 + }, + { + "epoch": 0.9717233121488441, + "grad_norm": 3.9193155765533447, + "learning_rate": 2.914133738601824e-05, + "loss": 0.6521, + "step": 42200 + }, + { + "epoch": 0.976328635903104, + "grad_norm": 2.55462384223938, + "learning_rate": 2.9279497098646038e-05, + "loss": 0.661, + "step": 42400 + }, + { + "epoch": 0.980933959657364, + "grad_norm": 2.79758358001709, + "learning_rate": 2.9417656811273833e-05, + "loss": 0.6601, + "step": 42600 + }, + { + "epoch": 0.9855392834116239, + "grad_norm": 3.391768217086792, + "learning_rate": 2.955581652390163e-05, + "loss": 0.6502, + "step": 42800 + }, + { + "epoch": 0.9901446071658838, + "grad_norm": 2.6248672008514404, + "learning_rate": 2.969397623652943e-05, + "loss": 0.6678, + "step": 43000 + }, + { + "epoch": 0.9947499309201436, + "grad_norm": 2.8915598392486572, + "learning_rate": 2.9832135949157227e-05, + "loss": 0.6473, + "step": 43200 + }, + { + "epoch": 0.9993552546744036, + "grad_norm": 4.923104286193848, + "learning_rate": 2.9970295661785025e-05, + "loss": 0.6451, + "step": 43400 + }, + { + "epoch": 1.0, + "eval_loss": 0.6161314845085144, + "eval_runtime": 146.2574, + "eval_samples_per_second": 193.911, + "eval_steps_per_second": 12.122, + "step": 43428 + }, + { + "epoch": 1.0039605784286636, + "grad_norm": 3.111292600631714, + "learning_rate": 2.9994291822399326e-05, + "loss": 0.6622, + "step": 43600 + }, + { + "epoch": 1.0085659021829234, + "grad_norm": 3.6699461936950684, + "learning_rate": 2.998702025857681e-05, + "loss": 0.6542, + "step": 43800 + }, + { + "epoch": 1.0131712259371835, + "grad_norm": 3.563108205795288, + "learning_rate": 2.997978505257341e-05, + "loss": 0.6444, + "step": 44000 + }, + { + "epoch": 1.0177765496914433, + "grad_norm": 3.8684511184692383, + "learning_rate": 2.997251348875089e-05, + "loss": 0.6534, + "step": 44200 + }, + { + "epoch": 1.0223818734457033, + "grad_norm": 2.8675427436828613, + "learning_rate": 2.9965241924928375e-05, + "loss": 0.65, + "step": 44400 + }, + { + "epoch": 1.0269871971999631, + "grad_norm": 3.130798101425171, + "learning_rate": 2.995797036110586e-05, + "loss": 0.6566, + "step": 44600 + }, + { + "epoch": 1.0315925209542232, + "grad_norm": 3.477407932281494, + "learning_rate": 2.9950698797283344e-05, + "loss": 0.6701, + "step": 44800 + }, + { + "epoch": 1.036197844708483, + "grad_norm": 3.613179922103882, + "learning_rate": 2.994342723346083e-05, + "loss": 0.6636, + "step": 45000 + }, + { + "epoch": 1.040803168462743, + "grad_norm": 4.168652057647705, + "learning_rate": 2.9936155669638315e-05, + "loss": 0.656, + "step": 45200 + }, + { + "epoch": 1.0454084922170028, + "grad_norm": 3.4714102745056152, + "learning_rate": 2.992892046363491e-05, + "loss": 0.666, + "step": 45400 + }, + { + "epoch": 1.0500138159712629, + "grad_norm": 3.0046637058258057, + "learning_rate": 2.9921648899812396e-05, + "loss": 0.6547, + "step": 45600 + }, + { + "epoch": 1.0546191397255227, + "grad_norm": 2.514378070831299, + "learning_rate": 2.991437733598988e-05, + "loss": 0.6592, + "step": 45800 + }, + { + "epoch": 1.0592244634797827, + "grad_norm": 3.263707160949707, + "learning_rate": 2.990710577216736e-05, + "loss": 0.6544, + "step": 46000 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 2.668816566467285, + "learning_rate": 2.9899834208344847e-05, + "loss": 0.6631, + "step": 46200 + }, + { + "epoch": 1.0684351109883026, + "grad_norm": 3.3855276107788086, + "learning_rate": 2.9892562644522333e-05, + "loss": 0.6581, + "step": 46400 + }, + { + "epoch": 1.0730404347425624, + "grad_norm": 3.417602062225342, + "learning_rate": 2.9885291080699816e-05, + "loss": 0.6686, + "step": 46600 + }, + { + "epoch": 1.0776457584968224, + "grad_norm": 3.3175718784332275, + "learning_rate": 2.9878019516877302e-05, + "loss": 0.6322, + "step": 46800 + }, + { + "epoch": 1.0822510822510822, + "grad_norm": 3.263648748397827, + "learning_rate": 2.9870747953054784e-05, + "loss": 0.6482, + "step": 47000 + }, + { + "epoch": 1.0868564060053423, + "grad_norm": 3.151909589767456, + "learning_rate": 2.9863476389232267e-05, + "loss": 0.6526, + "step": 47200 + }, + { + "epoch": 1.091461729759602, + "grad_norm": 3.5228278636932373, + "learning_rate": 2.9856241183228865e-05, + "loss": 0.6459, + "step": 47400 + }, + { + "epoch": 1.096067053513862, + "grad_norm": 2.767185926437378, + "learning_rate": 2.984896961940635e-05, + "loss": 0.6595, + "step": 47600 + }, + { + "epoch": 1.100672377268122, + "grad_norm": 3.287398099899292, + "learning_rate": 2.9841698055583834e-05, + "loss": 0.6625, + "step": 47800 + }, + { + "epoch": 1.105277701022382, + "grad_norm": 3.132371425628662, + "learning_rate": 2.983442649176132e-05, + "loss": 0.6505, + "step": 48000 + }, + { + "epoch": 1.1098830247766418, + "grad_norm": 2.7118616104125977, + "learning_rate": 2.9827154927938806e-05, + "loss": 0.6747, + "step": 48200 + }, + { + "epoch": 1.1144883485309016, + "grad_norm": 2.996696949005127, + "learning_rate": 2.9819883364116285e-05, + "loss": 0.6633, + "step": 48400 + }, + { + "epoch": 1.1190936722851617, + "grad_norm": 3.6714489459991455, + "learning_rate": 2.981261180029377e-05, + "loss": 0.6554, + "step": 48600 + }, + { + "epoch": 1.1236989960394215, + "grad_norm": 2.80454421043396, + "learning_rate": 2.9805340236471257e-05, + "loss": 0.6641, + "step": 48800 + }, + { + "epoch": 1.1283043197936815, + "grad_norm": 3.167177200317383, + "learning_rate": 2.979806867264874e-05, + "loss": 0.6508, + "step": 49000 + }, + { + "epoch": 1.1329096435479413, + "grad_norm": 3.2425365447998047, + "learning_rate": 2.9790797108826225e-05, + "loss": 0.6508, + "step": 49200 + }, + { + "epoch": 1.1375149673022014, + "grad_norm": 2.834266424179077, + "learning_rate": 2.978352554500371e-05, + "loss": 0.6558, + "step": 49400 + }, + { + "epoch": 1.1421202910564612, + "grad_norm": 2.795851707458496, + "learning_rate": 2.9776253981181194e-05, + "loss": 0.6567, + "step": 49600 + }, + { + "epoch": 1.1467256148107212, + "grad_norm": 2.940406084060669, + "learning_rate": 2.9768982417358676e-05, + "loss": 0.6605, + "step": 49800 + }, + { + "epoch": 1.151330938564981, + "grad_norm": 3.899711847305298, + "learning_rate": 2.9761710853536162e-05, + "loss": 0.6639, + "step": 50000 + }, + { + "epoch": 1.155936262319241, + "grad_norm": 2.7121920585632324, + "learning_rate": 2.9754475647532757e-05, + "loss": 0.6759, + "step": 50200 + }, + { + "epoch": 1.1605415860735009, + "grad_norm": 3.220299005508423, + "learning_rate": 2.9747204083710243e-05, + "loss": 0.6514, + "step": 50400 + }, + { + "epoch": 1.165146909827761, + "grad_norm": 3.94079852104187, + "learning_rate": 2.973993251988773e-05, + "loss": 0.661, + "step": 50600 + }, + { + "epoch": 1.1697522335820207, + "grad_norm": 3.2445309162139893, + "learning_rate": 2.973266095606521e-05, + "loss": 0.6682, + "step": 50800 + }, + { + "epoch": 1.1743575573362808, + "grad_norm": 3.0553812980651855, + "learning_rate": 2.9725389392242697e-05, + "loss": 0.6687, + "step": 51000 + }, + { + "epoch": 1.1789628810905406, + "grad_norm": 4.24676513671875, + "learning_rate": 2.971811782842018e-05, + "loss": 0.656, + "step": 51200 + }, + { + "epoch": 1.1835682048448006, + "grad_norm": 3.447143316268921, + "learning_rate": 2.9710882622416775e-05, + "loss": 0.6562, + "step": 51400 + }, + { + "epoch": 1.1881735285990604, + "grad_norm": 2.6866791248321533, + "learning_rate": 2.970361105859426e-05, + "loss": 0.6684, + "step": 51600 + }, + { + "epoch": 1.1927788523533205, + "grad_norm": 3.1234331130981445, + "learning_rate": 2.9696339494771747e-05, + "loss": 0.658, + "step": 51800 + }, + { + "epoch": 1.1973841761075803, + "grad_norm": 3.407177686691284, + "learning_rate": 2.968906793094923e-05, + "loss": 0.6628, + "step": 52000 + }, + { + "epoch": 1.2019894998618403, + "grad_norm": 3.587676763534546, + "learning_rate": 2.9681796367126715e-05, + "loss": 0.6493, + "step": 52200 + }, + { + "epoch": 1.2065948236161002, + "grad_norm": 2.697775363922119, + "learning_rate": 2.96745248033042e-05, + "loss": 0.6637, + "step": 52400 + }, + { + "epoch": 1.2112001473703602, + "grad_norm": 3.507350206375122, + "learning_rate": 2.9667253239481684e-05, + "loss": 0.6528, + "step": 52600 + }, + { + "epoch": 1.21580547112462, + "grad_norm": 3.143338203430176, + "learning_rate": 2.9659981675659166e-05, + "loss": 0.6521, + "step": 52800 + }, + { + "epoch": 1.22041079487888, + "grad_norm": 2.8852453231811523, + "learning_rate": 2.9652710111836652e-05, + "loss": 0.6595, + "step": 53000 + }, + { + "epoch": 1.2250161186331399, + "grad_norm": 3.039896249771118, + "learning_rate": 2.9645438548014135e-05, + "loss": 0.6542, + "step": 53200 + }, + { + "epoch": 1.2296214423874, + "grad_norm": 2.8866536617279053, + "learning_rate": 2.963816698419162e-05, + "loss": 0.6535, + "step": 53400 + }, + { + "epoch": 1.2342267661416597, + "grad_norm": 2.5456418991088867, + "learning_rate": 2.9630895420369107e-05, + "loss": 0.664, + "step": 53600 + }, + { + "epoch": 1.2388320898959198, + "grad_norm": 2.5279481410980225, + "learning_rate": 2.9623623856546593e-05, + "loss": 0.6603, + "step": 53800 + }, + { + "epoch": 1.2434374136501796, + "grad_norm": 3.0540003776550293, + "learning_rate": 2.9616352292724072e-05, + "loss": 0.6626, + "step": 54000 + }, + { + "epoch": 1.2480427374044396, + "grad_norm": 2.9297878742218018, + "learning_rate": 2.9609080728901558e-05, + "loss": 0.6612, + "step": 54200 + }, + { + "epoch": 1.2526480611586994, + "grad_norm": 3.4049458503723145, + "learning_rate": 2.9601809165079044e-05, + "loss": 0.6509, + "step": 54400 + }, + { + "epoch": 1.2572533849129595, + "grad_norm": 4.160104274749756, + "learning_rate": 2.9594537601256526e-05, + "loss": 0.6533, + "step": 54600 + }, + { + "epoch": 1.2618587086672193, + "grad_norm": 3.6435911655426025, + "learning_rate": 2.9587266037434012e-05, + "loss": 0.6703, + "step": 54800 + }, + { + "epoch": 1.2664640324214793, + "grad_norm": 4.371586322784424, + "learning_rate": 2.9579994473611498e-05, + "loss": 0.6507, + "step": 55000 + }, + { + "epoch": 1.2710693561757391, + "grad_norm": 2.840867757797241, + "learning_rate": 2.9572722909788977e-05, + "loss": 0.6644, + "step": 55200 + }, + { + "epoch": 1.275674679929999, + "grad_norm": 4.047842502593994, + "learning_rate": 2.9565451345966463e-05, + "loss": 0.6752, + "step": 55400 + }, + { + "epoch": 1.280280003684259, + "grad_norm": 4.21028470993042, + "learning_rate": 2.955817978214395e-05, + "loss": 0.6582, + "step": 55600 + }, + { + "epoch": 1.284885327438519, + "grad_norm": 2.9886422157287598, + "learning_rate": 2.955090821832143e-05, + "loss": 0.6634, + "step": 55800 + }, + { + "epoch": 1.2894906511927788, + "grad_norm": 3.3771917819976807, + "learning_rate": 2.9543636654498917e-05, + "loss": 0.6485, + "step": 56000 + }, + { + "epoch": 1.2940959749470387, + "grad_norm": 3.0930802822113037, + "learning_rate": 2.9536365090676403e-05, + "loss": 0.6665, + "step": 56200 + }, + { + "epoch": 1.2987012987012987, + "grad_norm": 3.460557222366333, + "learning_rate": 2.952909352685389e-05, + "loss": 0.6703, + "step": 56400 + }, + { + "epoch": 1.3033066224555587, + "grad_norm": 3.846440076828003, + "learning_rate": 2.9521858320850484e-05, + "loss": 0.6626, + "step": 56600 + }, + { + "epoch": 1.3079119462098185, + "grad_norm": 3.103189468383789, + "learning_rate": 2.9514586757027967e-05, + "loss": 0.6732, + "step": 56800 + }, + { + "epoch": 1.3125172699640784, + "grad_norm": 3.5162618160247803, + "learning_rate": 2.950731519320545e-05, + "loss": 0.6659, + "step": 57000 + }, + { + "epoch": 1.3171225937183384, + "grad_norm": 3.511310338973999, + "learning_rate": 2.9500079987202048e-05, + "loss": 0.668, + "step": 57200 + }, + { + "epoch": 1.3217279174725984, + "grad_norm": 3.5012547969818115, + "learning_rate": 2.9492808423379534e-05, + "loss": 0.6693, + "step": 57400 + }, + { + "epoch": 1.3263332412268583, + "grad_norm": 3.130657434463501, + "learning_rate": 2.9485536859557016e-05, + "loss": 0.6678, + "step": 57600 + }, + { + "epoch": 1.330938564981118, + "grad_norm": 3.0127837657928467, + "learning_rate": 2.9478265295734502e-05, + "loss": 0.6606, + "step": 57800 + }, + { + "epoch": 1.335543888735378, + "grad_norm": 2.949445962905884, + "learning_rate": 2.947099373191199e-05, + "loss": 0.6663, + "step": 58000 + }, + { + "epoch": 1.3401492124896381, + "grad_norm": 3.239060640335083, + "learning_rate": 2.9463722168089467e-05, + "loss": 0.659, + "step": 58200 + }, + { + "epoch": 1.344754536243898, + "grad_norm": 3.4041852951049805, + "learning_rate": 2.9456450604266953e-05, + "loss": 0.6652, + "step": 58400 + }, + { + "epoch": 1.3493598599981578, + "grad_norm": 2.7172391414642334, + "learning_rate": 2.944917904044444e-05, + "loss": 0.643, + "step": 58600 + }, + { + "epoch": 1.3539651837524178, + "grad_norm": 3.234712600708008, + "learning_rate": 2.9441907476621922e-05, + "loss": 0.6552, + "step": 58800 + }, + { + "epoch": 1.3585705075066778, + "grad_norm": 2.3830983638763428, + "learning_rate": 2.9434635912799408e-05, + "loss": 0.6704, + "step": 59000 + }, + { + "epoch": 1.3631758312609377, + "grad_norm": 3.203972578048706, + "learning_rate": 2.9427364348976894e-05, + "loss": 0.6609, + "step": 59200 + }, + { + "epoch": 1.3677811550151975, + "grad_norm": 3.8763749599456787, + "learning_rate": 2.9420092785154373e-05, + "loss": 0.6641, + "step": 59400 + }, + { + "epoch": 1.3723864787694575, + "grad_norm": 3.5821003913879395, + "learning_rate": 2.941282122133186e-05, + "loss": 0.6577, + "step": 59600 + }, + { + "epoch": 1.3769918025237173, + "grad_norm": 3.558241844177246, + "learning_rate": 2.9405549657509345e-05, + "loss": 0.6564, + "step": 59800 + }, + { + "epoch": 1.3815971262779774, + "grad_norm": 2.6265041828155518, + "learning_rate": 2.939827809368683e-05, + "loss": 0.6497, + "step": 60000 + }, + { + "epoch": 1.3862024500322372, + "grad_norm": 3.8873655796051025, + "learning_rate": 2.9391006529864313e-05, + "loss": 0.6618, + "step": 60200 + }, + { + "epoch": 1.3908077737864972, + "grad_norm": 3.367459774017334, + "learning_rate": 2.93837349660418e-05, + "loss": 0.6427, + "step": 60400 + }, + { + "epoch": 1.395413097540757, + "grad_norm": 3.5943410396575928, + "learning_rate": 2.9376463402219285e-05, + "loss": 0.6767, + "step": 60600 + }, + { + "epoch": 1.400018421295017, + "grad_norm": 3.536818742752075, + "learning_rate": 2.9369191838396764e-05, + "loss": 0.6474, + "step": 60800 + }, + { + "epoch": 1.404623745049277, + "grad_norm": 3.1541073322296143, + "learning_rate": 2.9361956632393363e-05, + "loss": 0.657, + "step": 61000 + }, + { + "epoch": 1.409229068803537, + "grad_norm": 3.130194902420044, + "learning_rate": 2.9354685068570845e-05, + "loss": 0.6623, + "step": 61200 + }, + { + "epoch": 1.4138343925577967, + "grad_norm": 3.6014018058776855, + "learning_rate": 2.934741350474833e-05, + "loss": 0.6544, + "step": 61400 + }, + { + "epoch": 1.4184397163120568, + "grad_norm": 3.4205713272094727, + "learning_rate": 2.9340141940925817e-05, + "loss": 0.663, + "step": 61600 + }, + { + "epoch": 1.4230450400663166, + "grad_norm": 2.7789885997772217, + "learning_rate": 2.9332870377103303e-05, + "loss": 0.6607, + "step": 61800 + }, + { + "epoch": 1.4276503638205766, + "grad_norm": 2.774611234664917, + "learning_rate": 2.9325598813280785e-05, + "loss": 0.6559, + "step": 62000 + }, + { + "epoch": 1.4322556875748365, + "grad_norm": 3.5750908851623535, + "learning_rate": 2.9318327249458268e-05, + "loss": 0.6546, + "step": 62200 + }, + { + "epoch": 1.4368610113290965, + "grad_norm": 3.758307456970215, + "learning_rate": 2.9311055685635754e-05, + "loss": 0.6681, + "step": 62400 + }, + { + "epoch": 1.4414663350833563, + "grad_norm": 2.9125349521636963, + "learning_rate": 2.9303784121813236e-05, + "loss": 0.6504, + "step": 62600 + }, + { + "epoch": 1.4460716588376163, + "grad_norm": 3.2125773429870605, + "learning_rate": 2.9296512557990722e-05, + "loss": 0.6535, + "step": 62800 + }, + { + "epoch": 1.4506769825918762, + "grad_norm": 3.11696457862854, + "learning_rate": 2.9289240994168208e-05, + "loss": 0.6425, + "step": 63000 + }, + { + "epoch": 1.4552823063461362, + "grad_norm": 4.079625606536865, + "learning_rate": 2.928196943034569e-05, + "loss": 0.6641, + "step": 63200 + }, + { + "epoch": 1.459887630100396, + "grad_norm": 2.697766065597534, + "learning_rate": 2.9274697866523177e-05, + "loss": 0.6568, + "step": 63400 + }, + { + "epoch": 1.464492953854656, + "grad_norm": 2.6162514686584473, + "learning_rate": 2.926742630270066e-05, + "loss": 0.659, + "step": 63600 + }, + { + "epoch": 1.4690982776089159, + "grad_norm": 3.08160138130188, + "learning_rate": 2.926015473887814e-05, + "loss": 0.6527, + "step": 63800 + }, + { + "epoch": 1.473703601363176, + "grad_norm": 3.38775897026062, + "learning_rate": 2.9252883175055628e-05, + "loss": 0.6614, + "step": 64000 + }, + { + "epoch": 1.4783089251174357, + "grad_norm": 3.066603183746338, + "learning_rate": 2.9245611611233113e-05, + "loss": 0.6607, + "step": 64200 + }, + { + "epoch": 1.4829142488716958, + "grad_norm": 2.781545400619507, + "learning_rate": 2.92383400474106e-05, + "loss": 0.6529, + "step": 64400 + }, + { + "epoch": 1.4875195726259556, + "grad_norm": 3.7093193531036377, + "learning_rate": 2.9231068483588082e-05, + "loss": 0.6507, + "step": 64600 + }, + { + "epoch": 1.4921248963802154, + "grad_norm": 3.037850856781006, + "learning_rate": 2.9223796919765564e-05, + "loss": 0.6593, + "step": 64800 + }, + { + "epoch": 1.4967302201344754, + "grad_norm": 3.192322254180908, + "learning_rate": 2.921652535594305e-05, + "loss": 0.6561, + "step": 65000 + }, + { + "epoch": 1.5013355438887355, + "grad_norm": 3.157817840576172, + "learning_rate": 2.9209253792120533e-05, + "loss": 0.6488, + "step": 65200 + }, + { + "epoch": 1.5059408676429953, + "grad_norm": 3.132276773452759, + "learning_rate": 2.920198222829802e-05, + "loss": 0.6549, + "step": 65400 + }, + { + "epoch": 1.510546191397255, + "grad_norm": 3.7865426540374756, + "learning_rate": 2.9194710664475505e-05, + "loss": 0.6534, + "step": 65600 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 3.1854352951049805, + "learning_rate": 2.9187439100652987e-05, + "loss": 0.6592, + "step": 65800 + }, + { + "epoch": 1.5197568389057752, + "grad_norm": 2.9768927097320557, + "learning_rate": 2.918016753683047e-05, + "loss": 0.6581, + "step": 66000 + }, + { + "epoch": 1.524362162660035, + "grad_norm": 3.467859983444214, + "learning_rate": 2.9172895973007956e-05, + "loss": 0.6599, + "step": 66200 + }, + { + "epoch": 1.5289674864142948, + "grad_norm": 3.409672975540161, + "learning_rate": 2.9165624409185438e-05, + "loss": 0.6589, + "step": 66400 + }, + { + "epoch": 1.5335728101685548, + "grad_norm": 3.304276943206787, + "learning_rate": 2.9158352845362924e-05, + "loss": 0.6619, + "step": 66600 + }, + { + "epoch": 1.5381781339228149, + "grad_norm": 2.9806504249572754, + "learning_rate": 2.915108128154041e-05, + "loss": 0.6686, + "step": 66800 + }, + { + "epoch": 1.5427834576770747, + "grad_norm": 3.9188714027404785, + "learning_rate": 2.9143846075537005e-05, + "loss": 0.6599, + "step": 67000 + }, + { + "epoch": 1.5473887814313345, + "grad_norm": 3.01717209815979, + "learning_rate": 2.913657451171449e-05, + "loss": 0.6603, + "step": 67200 + }, + { + "epoch": 1.5519941051855946, + "grad_norm": 3.1575047969818115, + "learning_rate": 2.9129302947891977e-05, + "loss": 0.6532, + "step": 67400 + }, + { + "epoch": 1.5565994289398546, + "grad_norm": 2.840865135192871, + "learning_rate": 2.9122031384069456e-05, + "loss": 0.6537, + "step": 67600 + }, + { + "epoch": 1.5612047526941144, + "grad_norm": 3.181452989578247, + "learning_rate": 2.9114759820246942e-05, + "loss": 0.6605, + "step": 67800 + }, + { + "epoch": 1.5658100764483742, + "grad_norm": 3.727302074432373, + "learning_rate": 2.9107488256424428e-05, + "loss": 0.6553, + "step": 68000 + }, + { + "epoch": 1.5704154002026343, + "grad_norm": 3.4746217727661133, + "learning_rate": 2.910021669260191e-05, + "loss": 0.6603, + "step": 68200 + }, + { + "epoch": 1.5750207239568943, + "grad_norm": 3.0356459617614746, + "learning_rate": 2.9092945128779396e-05, + "loss": 0.6705, + "step": 68400 + }, + { + "epoch": 1.5796260477111541, + "grad_norm": 4.231008529663086, + "learning_rate": 2.9085673564956882e-05, + "loss": 0.6509, + "step": 68600 + }, + { + "epoch": 1.584231371465414, + "grad_norm": 2.5000898838043213, + "learning_rate": 2.907840200113436e-05, + "loss": 0.6577, + "step": 68800 + }, + { + "epoch": 1.588836695219674, + "grad_norm": 3.202843427658081, + "learning_rate": 2.9071130437311847e-05, + "loss": 0.6515, + "step": 69000 + }, + { + "epoch": 1.593442018973934, + "grad_norm": 3.5553150177001953, + "learning_rate": 2.9063858873489333e-05, + "loss": 0.6569, + "step": 69200 + }, + { + "epoch": 1.5980473427281938, + "grad_norm": 3.006439208984375, + "learning_rate": 2.905658730966682e-05, + "loss": 0.6556, + "step": 69400 + }, + { + "epoch": 1.6026526664824536, + "grad_norm": 2.8993167877197266, + "learning_rate": 2.9049352103663414e-05, + "loss": 0.6602, + "step": 69600 + }, + { + "epoch": 1.6072579902367137, + "grad_norm": 3.5758256912231445, + "learning_rate": 2.90420805398409e-05, + "loss": 0.6512, + "step": 69800 + }, + { + "epoch": 1.6118633139909737, + "grad_norm": 2.4730918407440186, + "learning_rate": 2.9034808976018383e-05, + "loss": 0.6479, + "step": 70000 + }, + { + "epoch": 1.6164686377452335, + "grad_norm": 3.3662991523742676, + "learning_rate": 2.902757377001498e-05, + "loss": 0.6519, + "step": 70200 + }, + { + "epoch": 1.6210739614994933, + "grad_norm": 2.8330750465393066, + "learning_rate": 2.9020302206192467e-05, + "loss": 0.6492, + "step": 70400 + }, + { + "epoch": 1.6256792852537534, + "grad_norm": 3.55230975151062, + "learning_rate": 2.9013030642369947e-05, + "loss": 0.654, + "step": 70600 + }, + { + "epoch": 1.6302846090080134, + "grad_norm": 3.134399890899658, + "learning_rate": 2.9005795436366545e-05, + "loss": 0.657, + "step": 70800 + }, + { + "epoch": 1.6348899327622732, + "grad_norm": 3.899355888366699, + "learning_rate": 2.8998523872544028e-05, + "loss": 0.6565, + "step": 71000 + }, + { + "epoch": 1.639495256516533, + "grad_norm": 3.356893301010132, + "learning_rate": 2.8991252308721514e-05, + "loss": 0.6465, + "step": 71200 + }, + { + "epoch": 1.644100580270793, + "grad_norm": 3.021803855895996, + "learning_rate": 2.8983980744899e-05, + "loss": 0.6409, + "step": 71400 + }, + { + "epoch": 1.6487059040250531, + "grad_norm": 3.1984188556671143, + "learning_rate": 2.8976709181076485e-05, + "loss": 0.6457, + "step": 71600 + }, + { + "epoch": 1.6533112277793127, + "grad_norm": 2.6747777462005615, + "learning_rate": 2.8969437617253968e-05, + "loss": 0.6497, + "step": 71800 + }, + { + "epoch": 1.6579165515335728, + "grad_norm": 3.9369540214538574, + "learning_rate": 2.896216605343145e-05, + "loss": 0.6556, + "step": 72000 + }, + { + "epoch": 1.6625218752878328, + "grad_norm": 3.03513503074646, + "learning_rate": 2.8954894489608936e-05, + "loss": 0.652, + "step": 72200 + }, + { + "epoch": 1.6671271990420926, + "grad_norm": 2.8434908390045166, + "learning_rate": 2.894762292578642e-05, + "loss": 0.6575, + "step": 72400 + }, + { + "epoch": 1.6717325227963524, + "grad_norm": 3.372441291809082, + "learning_rate": 2.8940351361963905e-05, + "loss": 0.6576, + "step": 72600 + }, + { + "epoch": 1.6763378465506125, + "grad_norm": 2.8065624237060547, + "learning_rate": 2.893307979814139e-05, + "loss": 0.6379, + "step": 72800 + }, + { + "epoch": 1.6809431703048725, + "grad_norm": 4.157696723937988, + "learning_rate": 2.8925808234318873e-05, + "loss": 0.6482, + "step": 73000 + }, + { + "epoch": 1.6855484940591323, + "grad_norm": 2.8976545333862305, + "learning_rate": 2.8918536670496356e-05, + "loss": 0.6596, + "step": 73200 + }, + { + "epoch": 1.6901538178133921, + "grad_norm": 3.67669415473938, + "learning_rate": 2.8911265106673842e-05, + "loss": 0.6556, + "step": 73400 + }, + { + "epoch": 1.6947591415676522, + "grad_norm": 3.2437984943389893, + "learning_rate": 2.8903993542851324e-05, + "loss": 0.6519, + "step": 73600 + }, + { + "epoch": 1.6993644653219122, + "grad_norm": 2.789106607437134, + "learning_rate": 2.889672197902881e-05, + "loss": 0.673, + "step": 73800 + }, + { + "epoch": 1.703969789076172, + "grad_norm": 3.525022268295288, + "learning_rate": 2.8889450415206296e-05, + "loss": 0.6559, + "step": 74000 + }, + { + "epoch": 1.7085751128304318, + "grad_norm": 3.419024705886841, + "learning_rate": 2.8882178851383782e-05, + "loss": 0.6539, + "step": 74200 + }, + { + "epoch": 1.7131804365846919, + "grad_norm": 3.462752103805542, + "learning_rate": 2.8874907287561265e-05, + "loss": 0.6531, + "step": 74400 + }, + { + "epoch": 1.717785760338952, + "grad_norm": 3.5559234619140625, + "learning_rate": 2.8867635723738747e-05, + "loss": 0.6499, + "step": 74600 + }, + { + "epoch": 1.7223910840932117, + "grad_norm": 2.969327211380005, + "learning_rate": 2.8860364159916233e-05, + "loss": 0.6348, + "step": 74800 + }, + { + "epoch": 1.7269964078474715, + "grad_norm": 2.5927562713623047, + "learning_rate": 2.8853128953912828e-05, + "loss": 0.6491, + "step": 75000 + }, + { + "epoch": 1.7316017316017316, + "grad_norm": 3.60489821434021, + "learning_rate": 2.8845857390090314e-05, + "loss": 0.6627, + "step": 75200 + }, + { + "epoch": 1.7362070553559916, + "grad_norm": 3.0263278484344482, + "learning_rate": 2.8838585826267797e-05, + "loss": 0.6649, + "step": 75400 + }, + { + "epoch": 1.7408123791102514, + "grad_norm": 3.406975507736206, + "learning_rate": 2.8831314262445283e-05, + "loss": 0.6602, + "step": 75600 + }, + { + "epoch": 1.7454177028645113, + "grad_norm": 3.032499313354492, + "learning_rate": 2.882404269862277e-05, + "loss": 0.6548, + "step": 75800 + }, + { + "epoch": 1.7500230266187713, + "grad_norm": 3.3158748149871826, + "learning_rate": 2.881677113480025e-05, + "loss": 0.6661, + "step": 76000 + }, + { + "epoch": 1.7546283503730313, + "grad_norm": 2.534254312515259, + "learning_rate": 2.8809499570977733e-05, + "loss": 0.645, + "step": 76200 + }, + { + "epoch": 1.7592336741272911, + "grad_norm": 2.951493978500366, + "learning_rate": 2.880222800715522e-05, + "loss": 0.6594, + "step": 76400 + }, + { + "epoch": 1.763838997881551, + "grad_norm": 2.681861162185669, + "learning_rate": 2.8794956443332705e-05, + "loss": 0.652, + "step": 76600 + }, + { + "epoch": 1.768444321635811, + "grad_norm": 3.9448912143707275, + "learning_rate": 2.8787684879510188e-05, + "loss": 0.659, + "step": 76800 + }, + { + "epoch": 1.773049645390071, + "grad_norm": 3.7418482303619385, + "learning_rate": 2.8780413315687674e-05, + "loss": 0.655, + "step": 77000 + }, + { + "epoch": 1.7776549691443309, + "grad_norm": 3.9162464141845703, + "learning_rate": 2.877314175186516e-05, + "loss": 0.6591, + "step": 77200 + }, + { + "epoch": 1.7822602928985907, + "grad_norm": 3.8449208736419678, + "learning_rate": 2.876587018804264e-05, + "loss": 0.6717, + "step": 77400 + }, + { + "epoch": 1.7868656166528507, + "grad_norm": 2.90519118309021, + "learning_rate": 2.8758598624220125e-05, + "loss": 0.6559, + "step": 77600 + }, + { + "epoch": 1.7914709404071107, + "grad_norm": 3.301666259765625, + "learning_rate": 2.875132706039761e-05, + "loss": 0.6533, + "step": 77800 + }, + { + "epoch": 1.7960762641613706, + "grad_norm": 3.6207275390625, + "learning_rate": 2.8744091854394206e-05, + "loss": 0.6582, + "step": 78000 + }, + { + "epoch": 1.8006815879156304, + "grad_norm": 3.3020622730255127, + "learning_rate": 2.8736856648390804e-05, + "loss": 0.6606, + "step": 78200 + }, + { + "epoch": 1.8052869116698904, + "grad_norm": 3.674409866333008, + "learning_rate": 2.8729585084568287e-05, + "loss": 0.6581, + "step": 78400 + }, + { + "epoch": 1.8098922354241505, + "grad_norm": 2.18864369392395, + "learning_rate": 2.8722313520745773e-05, + "loss": 0.6641, + "step": 78600 + }, + { + "epoch": 1.8144975591784103, + "grad_norm": 3.165595769882202, + "learning_rate": 2.871504195692326e-05, + "loss": 0.6568, + "step": 78800 + }, + { + "epoch": 1.81910288293267, + "grad_norm": 2.2597508430480957, + "learning_rate": 2.8707770393100738e-05, + "loss": 0.651, + "step": 79000 + }, + { + "epoch": 1.8237082066869301, + "grad_norm": 3.7777042388916016, + "learning_rate": 2.8700498829278224e-05, + "loss": 0.6773, + "step": 79200 + }, + { + "epoch": 1.8283135304411902, + "grad_norm": 3.0962727069854736, + "learning_rate": 2.869322726545571e-05, + "loss": 0.6643, + "step": 79400 + }, + { + "epoch": 1.83291885419545, + "grad_norm": 3.4903509616851807, + "learning_rate": 2.8685955701633196e-05, + "loss": 0.6636, + "step": 79600 + }, + { + "epoch": 1.8375241779497098, + "grad_norm": 3.2742772102355957, + "learning_rate": 2.8678684137810678e-05, + "loss": 0.6497, + "step": 79800 + }, + { + "epoch": 1.8421295017039698, + "grad_norm": 3.5722882747650146, + "learning_rate": 2.8671412573988164e-05, + "loss": 0.6536, + "step": 80000 + }, + { + "epoch": 1.8467348254582299, + "grad_norm": 4.832566738128662, + "learning_rate": 2.8664141010165647e-05, + "loss": 0.6584, + "step": 80200 + }, + { + "epoch": 1.8513401492124897, + "grad_norm": 3.033071756362915, + "learning_rate": 2.865686944634313e-05, + "loss": 0.6542, + "step": 80400 + }, + { + "epoch": 1.8559454729667495, + "grad_norm": 3.029519557952881, + "learning_rate": 2.8649597882520615e-05, + "loss": 0.6627, + "step": 80600 + }, + { + "epoch": 1.8605507967210095, + "grad_norm": 3.6355273723602295, + "learning_rate": 2.86423263186981e-05, + "loss": 0.6568, + "step": 80800 + }, + { + "epoch": 1.8651561204752696, + "grad_norm": 2.547201633453369, + "learning_rate": 2.8635054754875584e-05, + "loss": 0.6582, + "step": 81000 + }, + { + "epoch": 1.8697614442295294, + "grad_norm": 3.6909263134002686, + "learning_rate": 2.862778319105307e-05, + "loss": 0.6528, + "step": 81200 + }, + { + "epoch": 1.8743667679837892, + "grad_norm": 3.107825994491577, + "learning_rate": 2.8620511627230555e-05, + "loss": 0.6452, + "step": 81400 + }, + { + "epoch": 1.8789720917380492, + "grad_norm": 2.624311685562134, + "learning_rate": 2.8613240063408034e-05, + "loss": 0.6588, + "step": 81600 + }, + { + "epoch": 1.883577415492309, + "grad_norm": 3.7440786361694336, + "learning_rate": 2.8606004857404633e-05, + "loss": 0.6489, + "step": 81800 + }, + { + "epoch": 1.8881827392465689, + "grad_norm": 3.2564656734466553, + "learning_rate": 2.859873329358212e-05, + "loss": 0.6599, + "step": 82000 + }, + { + "epoch": 1.892788063000829, + "grad_norm": 3.0685300827026367, + "learning_rate": 2.85914617297596e-05, + "loss": 0.6504, + "step": 82200 + }, + { + "epoch": 1.897393386755089, + "grad_norm": 3.587435245513916, + "learning_rate": 2.8584190165937087e-05, + "loss": 0.6499, + "step": 82400 + }, + { + "epoch": 1.9019987105093488, + "grad_norm": 2.8492074012756348, + "learning_rate": 2.8576918602114573e-05, + "loss": 0.6353, + "step": 82600 + }, + { + "epoch": 1.9066040342636086, + "grad_norm": 3.6821560859680176, + "learning_rate": 2.8569647038292056e-05, + "loss": 0.6529, + "step": 82800 + }, + { + "epoch": 1.9112093580178686, + "grad_norm": 4.206520080566406, + "learning_rate": 2.856237547446954e-05, + "loss": 0.6524, + "step": 83000 + }, + { + "epoch": 1.9158146817721287, + "grad_norm": 3.277606725692749, + "learning_rate": 2.8555103910647024e-05, + "loss": 0.6579, + "step": 83200 + }, + { + "epoch": 1.9204200055263885, + "grad_norm": 3.900179624557495, + "learning_rate": 2.8547832346824507e-05, + "loss": 0.6627, + "step": 83400 + }, + { + "epoch": 1.9250253292806483, + "grad_norm": 2.804596185684204, + "learning_rate": 2.8540560783001993e-05, + "loss": 0.6655, + "step": 83600 + }, + { + "epoch": 1.9296306530349083, + "grad_norm": 3.212975263595581, + "learning_rate": 2.853328921917948e-05, + "loss": 0.6614, + "step": 83800 + }, + { + "epoch": 1.9342359767891684, + "grad_norm": 4.128197193145752, + "learning_rate": 2.8526017655356965e-05, + "loss": 0.6546, + "step": 84000 + }, + { + "epoch": 1.9388413005434282, + "grad_norm": 3.0182225704193115, + "learning_rate": 2.8518746091534447e-05, + "loss": 0.659, + "step": 84200 + }, + { + "epoch": 1.943446624297688, + "grad_norm": 2.6237552165985107, + "learning_rate": 2.851147452771193e-05, + "loss": 0.6467, + "step": 84400 + }, + { + "epoch": 1.948051948051948, + "grad_norm": 2.700956106185913, + "learning_rate": 2.8504202963889416e-05, + "loss": 0.6597, + "step": 84600 + }, + { + "epoch": 1.952657271806208, + "grad_norm": 3.318554162979126, + "learning_rate": 2.8496931400066898e-05, + "loss": 0.649, + "step": 84800 + }, + { + "epoch": 1.9572625955604679, + "grad_norm": 2.75907301902771, + "learning_rate": 2.8489659836244384e-05, + "loss": 0.6713, + "step": 85000 + }, + { + "epoch": 1.9618679193147277, + "grad_norm": 3.922351837158203, + "learning_rate": 2.848238827242187e-05, + "loss": 0.6495, + "step": 85200 + }, + { + "epoch": 1.9664732430689877, + "grad_norm": 2.883132219314575, + "learning_rate": 2.8475116708599352e-05, + "loss": 0.6609, + "step": 85400 + }, + { + "epoch": 1.9710785668232478, + "grad_norm": 2.974393844604492, + "learning_rate": 2.8467845144776835e-05, + "loss": 0.6435, + "step": 85600 + }, + { + "epoch": 1.9756838905775076, + "grad_norm": 3.174490451812744, + "learning_rate": 2.846057358095432e-05, + "loss": 0.6514, + "step": 85800 + }, + { + "epoch": 1.9802892143317674, + "grad_norm": 3.1419320106506348, + "learning_rate": 2.8453302017131803e-05, + "loss": 0.6652, + "step": 86000 + }, + { + "epoch": 1.9848945380860274, + "grad_norm": 3.508948802947998, + "learning_rate": 2.844603045330929e-05, + "loss": 0.6562, + "step": 86200 + }, + { + "epoch": 1.9894998618402875, + "grad_norm": 3.572232484817505, + "learning_rate": 2.8438758889486775e-05, + "loss": 0.6547, + "step": 86400 + }, + { + "epoch": 1.9941051855945473, + "grad_norm": 2.923038959503174, + "learning_rate": 2.8431487325664258e-05, + "loss": 0.6555, + "step": 86600 + }, + { + "epoch": 1.9987105093488071, + "grad_norm": 2.6537113189697266, + "learning_rate": 2.8424252119660856e-05, + "loss": 0.6481, + "step": 86800 + }, + { + "epoch": 2.0, + "eval_loss": 0.6096945405006409, + "eval_runtime": 145.8772, + "eval_samples_per_second": 194.417, + "eval_steps_per_second": 12.154, + "step": 86856 + }, + { + "epoch": 2.003315833103067, + "grad_norm": 2.6319403648376465, + "learning_rate": 2.841698055583834e-05, + "loss": 0.6447, + "step": 87000 + }, + { + "epoch": 2.007921156857327, + "grad_norm": 3.3144571781158447, + "learning_rate": 2.840970899201582e-05, + "loss": 0.6489, + "step": 87200 + }, + { + "epoch": 2.012526480611587, + "grad_norm": 3.0572547912597656, + "learning_rate": 2.8402437428193307e-05, + "loss": 0.6458, + "step": 87400 + }, + { + "epoch": 2.017131804365847, + "grad_norm": 3.151933431625366, + "learning_rate": 2.8395165864370793e-05, + "loss": 0.6484, + "step": 87600 + }, + { + "epoch": 2.021737128120107, + "grad_norm": 3.794235944747925, + "learning_rate": 2.8387894300548276e-05, + "loss": 0.6493, + "step": 87800 + }, + { + "epoch": 2.026342451874367, + "grad_norm": 2.9801411628723145, + "learning_rate": 2.838062273672576e-05, + "loss": 0.6399, + "step": 88000 + }, + { + "epoch": 2.0309477756286265, + "grad_norm": 3.094648599624634, + "learning_rate": 2.8373351172903248e-05, + "loss": 0.6488, + "step": 88200 + }, + { + "epoch": 2.0355530993828865, + "grad_norm": 2.9568779468536377, + "learning_rate": 2.8366079609080727e-05, + "loss": 0.6476, + "step": 88400 + }, + { + "epoch": 2.0401584231371466, + "grad_norm": 3.226010799407959, + "learning_rate": 2.8358808045258213e-05, + "loss": 0.6439, + "step": 88600 + }, + { + "epoch": 2.0447637468914066, + "grad_norm": 3.10718035697937, + "learning_rate": 2.83515364814357e-05, + "loss": 0.6421, + "step": 88800 + }, + { + "epoch": 2.049369070645666, + "grad_norm": 3.8462493419647217, + "learning_rate": 2.8344264917613184e-05, + "loss": 0.6497, + "step": 89000 + }, + { + "epoch": 2.0539743943999262, + "grad_norm": 2.297658920288086, + "learning_rate": 2.8336993353790667e-05, + "loss": 0.6425, + "step": 89200 + }, + { + "epoch": 2.0585797181541863, + "grad_norm": 3.4975883960723877, + "learning_rate": 2.8329721789968153e-05, + "loss": 0.6387, + "step": 89400 + }, + { + "epoch": 2.0631850419084463, + "grad_norm": 2.6451213359832764, + "learning_rate": 2.8322450226145635e-05, + "loss": 0.6507, + "step": 89600 + }, + { + "epoch": 2.067790365662706, + "grad_norm": 4.4015679359436035, + "learning_rate": 2.8315178662323118e-05, + "loss": 0.6549, + "step": 89800 + }, + { + "epoch": 2.072395689416966, + "grad_norm": 3.2496938705444336, + "learning_rate": 2.8307907098500604e-05, + "loss": 0.6443, + "step": 90000 + }, + { + "epoch": 2.077001013171226, + "grad_norm": 3.8921310901641846, + "learning_rate": 2.830063553467809e-05, + "loss": 0.6563, + "step": 90200 + }, + { + "epoch": 2.081606336925486, + "grad_norm": 2.6001060009002686, + "learning_rate": 2.8293363970855572e-05, + "loss": 0.6561, + "step": 90400 + }, + { + "epoch": 2.0862116606797456, + "grad_norm": 2.925668716430664, + "learning_rate": 2.8286092407033058e-05, + "loss": 0.6395, + "step": 90600 + }, + { + "epoch": 2.0908169844340057, + "grad_norm": 2.9956302642822266, + "learning_rate": 2.8278857201029657e-05, + "loss": 0.6584, + "step": 90800 + }, + { + "epoch": 2.0954223081882657, + "grad_norm": 3.5268077850341797, + "learning_rate": 2.8271585637207136e-05, + "loss": 0.648, + "step": 91000 + }, + { + "epoch": 2.1000276319425257, + "grad_norm": 4.238087177276611, + "learning_rate": 2.8264314073384622e-05, + "loss": 0.6387, + "step": 91200 + }, + { + "epoch": 2.1046329556967853, + "grad_norm": 2.673576593399048, + "learning_rate": 2.8257042509562108e-05, + "loss": 0.6379, + "step": 91400 + }, + { + "epoch": 2.1092382794510454, + "grad_norm": 2.9278106689453125, + "learning_rate": 2.824977094573959e-05, + "loss": 0.6527, + "step": 91600 + }, + { + "epoch": 2.1138436032053054, + "grad_norm": 3.109639883041382, + "learning_rate": 2.824253573973619e-05, + "loss": 0.6437, + "step": 91800 + }, + { + "epoch": 2.1184489269595654, + "grad_norm": 3.2861876487731934, + "learning_rate": 2.8235264175913675e-05, + "loss": 0.6269, + "step": 92000 + }, + { + "epoch": 2.123054250713825, + "grad_norm": 3.4922659397125244, + "learning_rate": 2.8227992612091157e-05, + "loss": 0.647, + "step": 92200 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 3.1858551502227783, + "learning_rate": 2.8220721048268643e-05, + "loss": 0.6438, + "step": 92400 + }, + { + "epoch": 2.132264898222345, + "grad_norm": 3.2486298084259033, + "learning_rate": 2.8213449484446126e-05, + "loss": 0.6403, + "step": 92600 + }, + { + "epoch": 2.136870221976605, + "grad_norm": 2.3722951412200928, + "learning_rate": 2.8206177920623608e-05, + "loss": 0.6346, + "step": 92800 + }, + { + "epoch": 2.1414755457308647, + "grad_norm": 3.3336052894592285, + "learning_rate": 2.8198906356801094e-05, + "loss": 0.6514, + "step": 93000 + }, + { + "epoch": 2.1460808694851248, + "grad_norm": 2.755908727645874, + "learning_rate": 2.819163479297858e-05, + "loss": 0.6402, + "step": 93200 + }, + { + "epoch": 2.150686193239385, + "grad_norm": 3.960623264312744, + "learning_rate": 2.8184363229156063e-05, + "loss": 0.6398, + "step": 93400 + }, + { + "epoch": 2.155291516993645, + "grad_norm": 3.4139673709869385, + "learning_rate": 2.817709166533355e-05, + "loss": 0.6282, + "step": 93600 + }, + { + "epoch": 2.1598968407479044, + "grad_norm": 3.090846300125122, + "learning_rate": 2.816982010151103e-05, + "loss": 0.6634, + "step": 93800 + }, + { + "epoch": 2.1645021645021645, + "grad_norm": 2.835188627243042, + "learning_rate": 2.8162584895507626e-05, + "loss": 0.6342, + "step": 94000 + }, + { + "epoch": 2.1691074882564245, + "grad_norm": 2.9350528717041016, + "learning_rate": 2.8155313331685112e-05, + "loss": 0.6412, + "step": 94200 + }, + { + "epoch": 2.1737128120106846, + "grad_norm": 3.2040562629699707, + "learning_rate": 2.8148041767862598e-05, + "loss": 0.6519, + "step": 94400 + }, + { + "epoch": 2.178318135764944, + "grad_norm": 2.805971384048462, + "learning_rate": 2.814077020404008e-05, + "loss": 0.6438, + "step": 94600 + }, + { + "epoch": 2.182923459519204, + "grad_norm": 2.857898712158203, + "learning_rate": 2.8133498640217567e-05, + "loss": 0.6487, + "step": 94800 + }, + { + "epoch": 2.1875287832734642, + "grad_norm": 2.6172449588775635, + "learning_rate": 2.8126227076395052e-05, + "loss": 0.6422, + "step": 95000 + }, + { + "epoch": 2.192134107027724, + "grad_norm": 4.178059101104736, + "learning_rate": 2.8118955512572535e-05, + "loss": 0.6477, + "step": 95200 + }, + { + "epoch": 2.196739430781984, + "grad_norm": 3.011091947555542, + "learning_rate": 2.8111683948750018e-05, + "loss": 0.6463, + "step": 95400 + }, + { + "epoch": 2.201344754536244, + "grad_norm": 3.2300426959991455, + "learning_rate": 2.8104412384927503e-05, + "loss": 0.6409, + "step": 95600 + }, + { + "epoch": 2.205950078290504, + "grad_norm": 3.7736730575561523, + "learning_rate": 2.8097140821104986e-05, + "loss": 0.6504, + "step": 95800 + }, + { + "epoch": 2.210555402044764, + "grad_norm": 3.110412120819092, + "learning_rate": 2.8089869257282472e-05, + "loss": 0.6378, + "step": 96000 + }, + { + "epoch": 2.2151607257990236, + "grad_norm": 4.121431350708008, + "learning_rate": 2.8082597693459958e-05, + "loss": 0.653, + "step": 96200 + }, + { + "epoch": 2.2197660495532836, + "grad_norm": 3.8190181255340576, + "learning_rate": 2.807532612963744e-05, + "loss": 0.6441, + "step": 96400 + }, + { + "epoch": 2.2243713733075436, + "grad_norm": 2.9528844356536865, + "learning_rate": 2.8068054565814923e-05, + "loss": 0.6276, + "step": 96600 + }, + { + "epoch": 2.2289766970618032, + "grad_norm": 3.1562676429748535, + "learning_rate": 2.806078300199241e-05, + "loss": 0.6367, + "step": 96800 + }, + { + "epoch": 2.2335820208160633, + "grad_norm": 3.0100133419036865, + "learning_rate": 2.8053511438169895e-05, + "loss": 0.6336, + "step": 97000 + }, + { + "epoch": 2.2381873445703233, + "grad_norm": 2.758850336074829, + "learning_rate": 2.8046239874347377e-05, + "loss": 0.6509, + "step": 97200 + }, + { + "epoch": 2.2427926683245833, + "grad_norm": 3.6952168941497803, + "learning_rate": 2.8038968310524863e-05, + "loss": 0.6601, + "step": 97400 + }, + { + "epoch": 2.247397992078843, + "grad_norm": 3.7031092643737793, + "learning_rate": 2.803173310452146e-05, + "loss": 0.6531, + "step": 97600 + }, + { + "epoch": 2.252003315833103, + "grad_norm": 3.341907024383545, + "learning_rate": 2.8024461540698944e-05, + "loss": 0.6614, + "step": 97800 + }, + { + "epoch": 2.256608639587363, + "grad_norm": 3.172600746154785, + "learning_rate": 2.8017189976876427e-05, + "loss": 0.6426, + "step": 98000 + }, + { + "epoch": 2.261213963341623, + "grad_norm": 3.3580822944641113, + "learning_rate": 2.800991841305391e-05, + "loss": 0.6453, + "step": 98200 + }, + { + "epoch": 2.2658192870958827, + "grad_norm": 4.135528087615967, + "learning_rate": 2.8002646849231395e-05, + "loss": 0.6386, + "step": 98400 + }, + { + "epoch": 2.2704246108501427, + "grad_norm": 3.5923011302948, + "learning_rate": 2.799537528540888e-05, + "loss": 0.6651, + "step": 98600 + }, + { + "epoch": 2.2750299346044027, + "grad_norm": 3.149178981781006, + "learning_rate": 2.7988103721586367e-05, + "loss": 0.6435, + "step": 98800 + }, + { + "epoch": 2.2796352583586628, + "grad_norm": 3.2348053455352783, + "learning_rate": 2.798083215776385e-05, + "loss": 0.6424, + "step": 99000 + }, + { + "epoch": 2.2842405821129224, + "grad_norm": 3.580576181411743, + "learning_rate": 2.7973560593941335e-05, + "loss": 0.6572, + "step": 99200 + }, + { + "epoch": 2.2888459058671824, + "grad_norm": 3.2298011779785156, + "learning_rate": 2.7966289030118818e-05, + "loss": 0.666, + "step": 99400 + }, + { + "epoch": 2.2934512296214424, + "grad_norm": 3.1280014514923096, + "learning_rate": 2.79590174662963e-05, + "loss": 0.6499, + "step": 99600 + }, + { + "epoch": 2.2980565533757025, + "grad_norm": 3.244581937789917, + "learning_rate": 2.79517822602929e-05, + "loss": 0.6428, + "step": 99800 + }, + { + "epoch": 2.302661877129962, + "grad_norm": 3.160811424255371, + "learning_rate": 2.7944510696470385e-05, + "loss": 0.639, + "step": 100000 + }, + { + "epoch": 2.307267200884222, + "grad_norm": 3.2800822257995605, + "learning_rate": 2.793727549046698e-05, + "loss": 0.6504, + "step": 100200 + }, + { + "epoch": 2.311872524638482, + "grad_norm": 2.948641300201416, + "learning_rate": 2.7930003926644466e-05, + "loss": 0.6455, + "step": 100400 + }, + { + "epoch": 2.316477848392742, + "grad_norm": 3.270315408706665, + "learning_rate": 2.792273236282195e-05, + "loss": 0.6498, + "step": 100600 + }, + { + "epoch": 2.3210831721470018, + "grad_norm": 3.389112710952759, + "learning_rate": 2.7915460798999435e-05, + "loss": 0.6392, + "step": 100800 + }, + { + "epoch": 2.325688495901262, + "grad_norm": 3.453878164291382, + "learning_rate": 2.7908189235176917e-05, + "loss": 0.6538, + "step": 101000 + }, + { + "epoch": 2.330293819655522, + "grad_norm": 2.7830543518066406, + "learning_rate": 2.79009176713544e-05, + "loss": 0.6402, + "step": 101200 + }, + { + "epoch": 2.334899143409782, + "grad_norm": 3.6506121158599854, + "learning_rate": 2.7893646107531886e-05, + "loss": 0.6448, + "step": 101400 + }, + { + "epoch": 2.3395044671640415, + "grad_norm": 2.926161050796509, + "learning_rate": 2.788637454370937e-05, + "loss": 0.6506, + "step": 101600 + }, + { + "epoch": 2.3441097909183015, + "grad_norm": 2.95794677734375, + "learning_rate": 2.7879102979886857e-05, + "loss": 0.6472, + "step": 101800 + }, + { + "epoch": 2.3487151146725616, + "grad_norm": 3.9702541828155518, + "learning_rate": 2.787183141606434e-05, + "loss": 0.645, + "step": 102000 + }, + { + "epoch": 2.3533204384268216, + "grad_norm": 2.927553176879883, + "learning_rate": 2.7864559852241826e-05, + "loss": 0.6354, + "step": 102200 + }, + { + "epoch": 2.357925762181081, + "grad_norm": 3.047414779663086, + "learning_rate": 2.785728828841931e-05, + "loss": 0.6475, + "step": 102400 + }, + { + "epoch": 2.3625310859353412, + "grad_norm": 2.788905382156372, + "learning_rate": 2.785001672459679e-05, + "loss": 0.6542, + "step": 102600 + }, + { + "epoch": 2.3671364096896013, + "grad_norm": 2.729799509048462, + "learning_rate": 2.7842745160774277e-05, + "loss": 0.6384, + "step": 102800 + }, + { + "epoch": 2.3717417334438613, + "grad_norm": 3.3632562160491943, + "learning_rate": 2.7835473596951763e-05, + "loss": 0.6314, + "step": 103000 + }, + { + "epoch": 2.376347057198121, + "grad_norm": 3.1274969577789307, + "learning_rate": 2.7828202033129245e-05, + "loss": 0.6456, + "step": 103200 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 3.8563055992126465, + "learning_rate": 2.782093046930673e-05, + "loss": 0.6393, + "step": 103400 + }, + { + "epoch": 2.385557704706641, + "grad_norm": 3.3758862018585205, + "learning_rate": 2.7813658905484214e-05, + "loss": 0.6386, + "step": 103600 + }, + { + "epoch": 2.3901630284609006, + "grad_norm": 3.2293407917022705, + "learning_rate": 2.7806387341661696e-05, + "loss": 0.6505, + "step": 103800 + }, + { + "epoch": 2.3947683522151606, + "grad_norm": 3.1914443969726562, + "learning_rate": 2.7799115777839182e-05, + "loss": 0.643, + "step": 104000 + }, + { + "epoch": 2.3993736759694206, + "grad_norm": 2.7224652767181396, + "learning_rate": 2.7791844214016668e-05, + "loss": 0.6374, + "step": 104200 + }, + { + "epoch": 2.4039789997236807, + "grad_norm": 3.057840347290039, + "learning_rate": 2.778457265019415e-05, + "loss": 0.6343, + "step": 104400 + }, + { + "epoch": 2.4085843234779407, + "grad_norm": 2.8306033611297607, + "learning_rate": 2.7777301086371636e-05, + "loss": 0.645, + "step": 104600 + }, + { + "epoch": 2.4131896472322003, + "grad_norm": 3.3138833045959473, + "learning_rate": 2.777002952254912e-05, + "loss": 0.6502, + "step": 104800 + }, + { + "epoch": 2.4177949709864603, + "grad_norm": 2.700263738632202, + "learning_rate": 2.7762757958726605e-05, + "loss": 0.6443, + "step": 105000 + }, + { + "epoch": 2.4224002947407204, + "grad_norm": 4.0957932472229, + "learning_rate": 2.7755486394904087e-05, + "loss": 0.6615, + "step": 105200 + }, + { + "epoch": 2.42700561849498, + "grad_norm": 2.5739316940307617, + "learning_rate": 2.7748214831081573e-05, + "loss": 0.6456, + "step": 105400 + }, + { + "epoch": 2.43161094224924, + "grad_norm": 3.833193302154541, + "learning_rate": 2.774094326725906e-05, + "loss": 0.664, + "step": 105600 + }, + { + "epoch": 2.4362162660035, + "grad_norm": 2.990009307861328, + "learning_rate": 2.7733671703436542e-05, + "loss": 0.6308, + "step": 105800 + }, + { + "epoch": 2.44082158975776, + "grad_norm": 3.57989501953125, + "learning_rate": 2.7726400139614028e-05, + "loss": 0.6434, + "step": 106000 + }, + { + "epoch": 2.44542691351202, + "grad_norm": 2.9661567211151123, + "learning_rate": 2.771912857579151e-05, + "loss": 0.6467, + "step": 106200 + }, + { + "epoch": 2.4500322372662797, + "grad_norm": 2.5463151931762695, + "learning_rate": 2.7711857011968993e-05, + "loss": 0.6579, + "step": 106400 + }, + { + "epoch": 2.4546375610205398, + "grad_norm": 3.589879274368286, + "learning_rate": 2.770462180596559e-05, + "loss": 0.6557, + "step": 106600 + }, + { + "epoch": 2.4592428847748, + "grad_norm": 2.7275781631469727, + "learning_rate": 2.7697386599962187e-05, + "loss": 0.6576, + "step": 106800 + }, + { + "epoch": 2.4638482085290594, + "grad_norm": 3.2831528186798096, + "learning_rate": 2.7690115036139672e-05, + "loss": 0.645, + "step": 107000 + }, + { + "epoch": 2.4684535322833194, + "grad_norm": 2.328716993331909, + "learning_rate": 2.768284347231716e-05, + "loss": 0.6388, + "step": 107200 + }, + { + "epoch": 2.4730588560375795, + "grad_norm": 3.319284439086914, + "learning_rate": 2.767557190849464e-05, + "loss": 0.6465, + "step": 107400 + }, + { + "epoch": 2.4776641797918395, + "grad_norm": 3.1933634281158447, + "learning_rate": 2.766833670249124e-05, + "loss": 0.6414, + "step": 107600 + }, + { + "epoch": 2.482269503546099, + "grad_norm": 3.2471187114715576, + "learning_rate": 2.7661065138668725e-05, + "loss": 0.6406, + "step": 107800 + }, + { + "epoch": 2.486874827300359, + "grad_norm": 3.0858373641967773, + "learning_rate": 2.7653793574846205e-05, + "loss": 0.6537, + "step": 108000 + }, + { + "epoch": 2.491480151054619, + "grad_norm": 2.9696667194366455, + "learning_rate": 2.7646558368842803e-05, + "loss": 0.6347, + "step": 108200 + }, + { + "epoch": 2.496085474808879, + "grad_norm": 3.244499683380127, + "learning_rate": 2.7639286805020286e-05, + "loss": 0.6431, + "step": 108400 + }, + { + "epoch": 2.500690798563139, + "grad_norm": 3.8631808757781982, + "learning_rate": 2.763201524119777e-05, + "loss": 0.6439, + "step": 108600 + }, + { + "epoch": 2.505296122317399, + "grad_norm": 3.0365819931030273, + "learning_rate": 2.7624743677375258e-05, + "loss": 0.6466, + "step": 108800 + }, + { + "epoch": 2.509901446071659, + "grad_norm": 3.1504974365234375, + "learning_rate": 2.7617472113552743e-05, + "loss": 0.6623, + "step": 109000 + }, + { + "epoch": 2.514506769825919, + "grad_norm": 2.797166109085083, + "learning_rate": 2.7610200549730226e-05, + "loss": 0.6378, + "step": 109200 + }, + { + "epoch": 2.519112093580179, + "grad_norm": 6.695431232452393, + "learning_rate": 2.7602928985907712e-05, + "loss": 0.6388, + "step": 109400 + }, + { + "epoch": 2.5237174173344386, + "grad_norm": 2.692798614501953, + "learning_rate": 2.7595657422085194e-05, + "loss": 0.6556, + "step": 109600 + }, + { + "epoch": 2.5283227410886986, + "grad_norm": 4.388221740722656, + "learning_rate": 2.7588385858262677e-05, + "loss": 0.6547, + "step": 109800 + }, + { + "epoch": 2.5329280648429586, + "grad_norm": 6.0723676681518555, + "learning_rate": 2.7581114294440163e-05, + "loss": 0.642, + "step": 110000 + }, + { + "epoch": 2.537533388597218, + "grad_norm": 3.7598865032196045, + "learning_rate": 2.757384273061765e-05, + "loss": 0.637, + "step": 110200 + }, + { + "epoch": 2.5421387123514783, + "grad_norm": 4.075303077697754, + "learning_rate": 2.756657116679513e-05, + "loss": 0.6458, + "step": 110400 + }, + { + "epoch": 2.5467440361057383, + "grad_norm": 2.6612720489501953, + "learning_rate": 2.7559299602972617e-05, + "loss": 0.6478, + "step": 110600 + }, + { + "epoch": 2.551349359859998, + "grad_norm": 3.044689416885376, + "learning_rate": 2.75520280391501e-05, + "loss": 0.6586, + "step": 110800 + }, + { + "epoch": 2.555954683614258, + "grad_norm": 3.982969284057617, + "learning_rate": 2.7544756475327582e-05, + "loss": 0.6457, + "step": 111000 + }, + { + "epoch": 2.560560007368518, + "grad_norm": 3.1461875438690186, + "learning_rate": 2.7537484911505068e-05, + "loss": 0.649, + "step": 111200 + }, + { + "epoch": 2.565165331122778, + "grad_norm": 3.544240951538086, + "learning_rate": 2.7530213347682554e-05, + "loss": 0.6479, + "step": 111400 + }, + { + "epoch": 2.569770654877038, + "grad_norm": 3.3684897422790527, + "learning_rate": 2.752294178386004e-05, + "loss": 0.6437, + "step": 111600 + }, + { + "epoch": 2.5743759786312976, + "grad_norm": 3.4466545581817627, + "learning_rate": 2.7515670220037523e-05, + "loss": 0.6518, + "step": 111800 + }, + { + "epoch": 2.5789813023855577, + "grad_norm": 3.449298858642578, + "learning_rate": 2.7508398656215005e-05, + "loss": 0.6504, + "step": 112000 + }, + { + "epoch": 2.5835866261398177, + "grad_norm": 2.8273544311523438, + "learning_rate": 2.750112709239249e-05, + "loss": 0.6395, + "step": 112200 + }, + { + "epoch": 2.5881919498940773, + "grad_norm": 2.8043863773345947, + "learning_rate": 2.7493855528569973e-05, + "loss": 0.6562, + "step": 112400 + }, + { + "epoch": 2.5927972736483373, + "grad_norm": 3.3060178756713867, + "learning_rate": 2.748658396474746e-05, + "loss": 0.637, + "step": 112600 + }, + { + "epoch": 2.5974025974025974, + "grad_norm": 3.410897970199585, + "learning_rate": 2.7479312400924945e-05, + "loss": 0.6376, + "step": 112800 + }, + { + "epoch": 2.6020079211568574, + "grad_norm": 3.279798984527588, + "learning_rate": 2.7472040837102428e-05, + "loss": 0.6449, + "step": 113000 + }, + { + "epoch": 2.6066132449111175, + "grad_norm": 3.1562256813049316, + "learning_rate": 2.7464769273279914e-05, + "loss": 0.6447, + "step": 113200 + }, + { + "epoch": 2.611218568665377, + "grad_norm": 2.973257303237915, + "learning_rate": 2.7457534067276512e-05, + "loss": 0.6406, + "step": 113400 + }, + { + "epoch": 2.615823892419637, + "grad_norm": 3.439020872116089, + "learning_rate": 2.745026250345399e-05, + "loss": 0.6391, + "step": 113600 + }, + { + "epoch": 2.620429216173897, + "grad_norm": 2.99761700630188, + "learning_rate": 2.7442990939631477e-05, + "loss": 0.6486, + "step": 113800 + }, + { + "epoch": 2.6250345399281567, + "grad_norm": 3.7341833114624023, + "learning_rate": 2.7435719375808963e-05, + "loss": 0.6371, + "step": 114000 + }, + { + "epoch": 2.6296398636824168, + "grad_norm": 3.088653326034546, + "learning_rate": 2.7428447811986446e-05, + "loss": 0.6513, + "step": 114200 + }, + { + "epoch": 2.634245187436677, + "grad_norm": 3.4004712104797363, + "learning_rate": 2.7421176248163932e-05, + "loss": 0.6321, + "step": 114400 + }, + { + "epoch": 2.638850511190937, + "grad_norm": 2.652799129486084, + "learning_rate": 2.7413904684341418e-05, + "loss": 0.6364, + "step": 114600 + }, + { + "epoch": 2.643455834945197, + "grad_norm": 3.6918208599090576, + "learning_rate": 2.7406633120518897e-05, + "loss": 0.6341, + "step": 114800 + }, + { + "epoch": 2.6480611586994565, + "grad_norm": 3.6925747394561768, + "learning_rate": 2.7399361556696383e-05, + "loss": 0.6422, + "step": 115000 + }, + { + "epoch": 2.6526664824537165, + "grad_norm": 3.1433355808258057, + "learning_rate": 2.739208999287387e-05, + "loss": 0.6372, + "step": 115200 + }, + { + "epoch": 2.6572718062079765, + "grad_norm": 2.9344465732574463, + "learning_rate": 2.738481842905135e-05, + "loss": 0.6478, + "step": 115400 + }, + { + "epoch": 2.661877129962236, + "grad_norm": 3.018880605697632, + "learning_rate": 2.7377546865228837e-05, + "loss": 0.6395, + "step": 115600 + }, + { + "epoch": 2.666482453716496, + "grad_norm": 2.8171603679656982, + "learning_rate": 2.7370275301406323e-05, + "loss": 0.6492, + "step": 115800 + }, + { + "epoch": 2.671087777470756, + "grad_norm": 2.8728506565093994, + "learning_rate": 2.7363003737583802e-05, + "loss": 0.6405, + "step": 116000 + }, + { + "epoch": 2.6756931012250162, + "grad_norm": 3.09126877784729, + "learning_rate": 2.7355732173761288e-05, + "loss": 0.6398, + "step": 116200 + }, + { + "epoch": 2.6802984249792763, + "grad_norm": 3.089311122894287, + "learning_rate": 2.7348460609938774e-05, + "loss": 0.6494, + "step": 116400 + }, + { + "epoch": 2.684903748733536, + "grad_norm": 2.3543293476104736, + "learning_rate": 2.734118904611626e-05, + "loss": 0.6376, + "step": 116600 + }, + { + "epoch": 2.689509072487796, + "grad_norm": 2.459444046020508, + "learning_rate": 2.7333917482293742e-05, + "loss": 0.6337, + "step": 116800 + }, + { + "epoch": 2.694114396242056, + "grad_norm": 3.3463871479034424, + "learning_rate": 2.7326645918471228e-05, + "loss": 0.6359, + "step": 117000 + }, + { + "epoch": 2.6987197199963155, + "grad_norm": 2.9327232837677, + "learning_rate": 2.7319374354648714e-05, + "loss": 0.6264, + "step": 117200 + }, + { + "epoch": 2.7033250437505756, + "grad_norm": 3.052452802658081, + "learning_rate": 2.7312102790826193e-05, + "loss": 0.6466, + "step": 117400 + }, + { + "epoch": 2.7079303675048356, + "grad_norm": 3.294747829437256, + "learning_rate": 2.730483122700368e-05, + "loss": 0.6525, + "step": 117600 + }, + { + "epoch": 2.7125356912590957, + "grad_norm": 3.948997735977173, + "learning_rate": 2.7297596021000278e-05, + "loss": 0.6457, + "step": 117800 + }, + { + "epoch": 2.7171410150133557, + "grad_norm": 2.7049851417541504, + "learning_rate": 2.729032445717776e-05, + "loss": 0.6435, + "step": 118000 + }, + { + "epoch": 2.7217463387676153, + "grad_norm": 3.9702322483062744, + "learning_rate": 2.7283052893355246e-05, + "loss": 0.6516, + "step": 118200 + }, + { + "epoch": 2.7263516625218753, + "grad_norm": 2.606865882873535, + "learning_rate": 2.7275781329532732e-05, + "loss": 0.6551, + "step": 118400 + }, + { + "epoch": 2.7309569862761354, + "grad_norm": 3.387070894241333, + "learning_rate": 2.7268509765710215e-05, + "loss": 0.6403, + "step": 118600 + }, + { + "epoch": 2.735562310030395, + "grad_norm": 3.625943899154663, + "learning_rate": 2.7261238201887697e-05, + "loss": 0.6269, + "step": 118800 + }, + { + "epoch": 2.740167633784655, + "grad_norm": 2.9141345024108887, + "learning_rate": 2.7254002995884292e-05, + "loss": 0.6501, + "step": 119000 + }, + { + "epoch": 2.744772957538915, + "grad_norm": 3.324512004852295, + "learning_rate": 2.724673143206178e-05, + "loss": 0.6393, + "step": 119200 + }, + { + "epoch": 2.749378281293175, + "grad_norm": 3.5145280361175537, + "learning_rate": 2.7239459868239264e-05, + "loss": 0.6319, + "step": 119400 + }, + { + "epoch": 2.7539836050474347, + "grad_norm": 2.751305103302002, + "learning_rate": 2.723218830441675e-05, + "loss": 0.6362, + "step": 119600 + }, + { + "epoch": 2.7585889288016947, + "grad_norm": 3.314880132675171, + "learning_rate": 2.7224916740594233e-05, + "loss": 0.635, + "step": 119800 + }, + { + "epoch": 2.7631942525559547, + "grad_norm": 3.0874931812286377, + "learning_rate": 2.721764517677172e-05, + "loss": 0.6568, + "step": 120000 + }, + { + "epoch": 2.767799576310215, + "grad_norm": 3.5240249633789062, + "learning_rate": 2.7210373612949205e-05, + "loss": 0.6494, + "step": 120200 + }, + { + "epoch": 2.7724049000644744, + "grad_norm": 2.875325918197632, + "learning_rate": 2.7203102049126684e-05, + "loss": 0.6355, + "step": 120400 + }, + { + "epoch": 2.7770102238187344, + "grad_norm": 3.846749782562256, + "learning_rate": 2.719583048530417e-05, + "loss": 0.6338, + "step": 120600 + }, + { + "epoch": 2.7816155475729945, + "grad_norm": 3.331397533416748, + "learning_rate": 2.7188558921481656e-05, + "loss": 0.6511, + "step": 120800 + }, + { + "epoch": 2.786220871327254, + "grad_norm": 3.6676218509674072, + "learning_rate": 2.7181287357659138e-05, + "loss": 0.6326, + "step": 121000 + }, + { + "epoch": 2.790826195081514, + "grad_norm": 3.7938387393951416, + "learning_rate": 2.7174015793836624e-05, + "loss": 0.6354, + "step": 121200 + }, + { + "epoch": 2.795431518835774, + "grad_norm": 3.1966028213500977, + "learning_rate": 2.716674423001411e-05, + "loss": 0.6457, + "step": 121400 + }, + { + "epoch": 2.800036842590034, + "grad_norm": 2.660951852798462, + "learning_rate": 2.715947266619159e-05, + "loss": 0.6536, + "step": 121600 + }, + { + "epoch": 2.804642166344294, + "grad_norm": 3.02982234954834, + "learning_rate": 2.7152201102369075e-05, + "loss": 0.6507, + "step": 121800 + }, + { + "epoch": 2.809247490098554, + "grad_norm": 3.0377113819122314, + "learning_rate": 2.714492953854656e-05, + "loss": 0.645, + "step": 122000 + }, + { + "epoch": 2.813852813852814, + "grad_norm": 3.3548905849456787, + "learning_rate": 2.7137657974724043e-05, + "loss": 0.636, + "step": 122200 + }, + { + "epoch": 2.818458137607074, + "grad_norm": 4.323446750640869, + "learning_rate": 2.713038641090153e-05, + "loss": 0.6529, + "step": 122400 + }, + { + "epoch": 2.8230634613613335, + "grad_norm": 3.245694637298584, + "learning_rate": 2.7123114847079015e-05, + "loss": 0.6364, + "step": 122600 + }, + { + "epoch": 2.8276687851155935, + "grad_norm": 2.8766958713531494, + "learning_rate": 2.7115843283256498e-05, + "loss": 0.6388, + "step": 122800 + }, + { + "epoch": 2.8322741088698535, + "grad_norm": 2.433682441711426, + "learning_rate": 2.7108608077253093e-05, + "loss": 0.6383, + "step": 123000 + }, + { + "epoch": 2.8368794326241136, + "grad_norm": 3.1630749702453613, + "learning_rate": 2.710137287124969e-05, + "loss": 0.6312, + "step": 123200 + }, + { + "epoch": 2.8414847563783736, + "grad_norm": 3.1013870239257812, + "learning_rate": 2.7094101307427174e-05, + "loss": 0.6377, + "step": 123400 + }, + { + "epoch": 2.846090080132633, + "grad_norm": 3.5772645473480225, + "learning_rate": 2.708682974360466e-05, + "loss": 0.6404, + "step": 123600 + }, + { + "epoch": 2.8506954038868932, + "grad_norm": 2.970681667327881, + "learning_rate": 2.7079558179782146e-05, + "loss": 0.6472, + "step": 123800 + }, + { + "epoch": 2.8553007276411533, + "grad_norm": 2.7734873294830322, + "learning_rate": 2.707228661595963e-05, + "loss": 0.6329, + "step": 124000 + }, + { + "epoch": 2.859906051395413, + "grad_norm": 3.264719247817993, + "learning_rate": 2.7065015052137114e-05, + "loss": 0.6518, + "step": 124200 + }, + { + "epoch": 2.864511375149673, + "grad_norm": 3.1306731700897217, + "learning_rate": 2.70577434883146e-05, + "loss": 0.652, + "step": 124400 + }, + { + "epoch": 2.869116698903933, + "grad_norm": 3.149149179458618, + "learning_rate": 2.705047192449208e-05, + "loss": 0.6429, + "step": 124600 + }, + { + "epoch": 2.873722022658193, + "grad_norm": 3.6126046180725098, + "learning_rate": 2.7043200360669565e-05, + "loss": 0.6355, + "step": 124800 + }, + { + "epoch": 2.878327346412453, + "grad_norm": 2.957371234893799, + "learning_rate": 2.703592879684705e-05, + "loss": 0.6425, + "step": 125000 + }, + { + "epoch": 2.8829326701667126, + "grad_norm": 2.859186887741089, + "learning_rate": 2.7028693590843646e-05, + "loss": 0.6445, + "step": 125200 + }, + { + "epoch": 2.8875379939209727, + "grad_norm": 3.0626325607299805, + "learning_rate": 2.7021422027021132e-05, + "loss": 0.6572, + "step": 125400 + }, + { + "epoch": 2.8921433176752327, + "grad_norm": 2.7346231937408447, + "learning_rate": 2.7014150463198618e-05, + "loss": 0.6421, + "step": 125600 + }, + { + "epoch": 2.8967486414294923, + "grad_norm": 2.984370470046997, + "learning_rate": 2.70068788993761e-05, + "loss": 0.6436, + "step": 125800 + }, + { + "epoch": 2.9013539651837523, + "grad_norm": 3.117213726043701, + "learning_rate": 2.6999607335553583e-05, + "loss": 0.6493, + "step": 126000 + }, + { + "epoch": 2.9059592889380124, + "grad_norm": 3.779122829437256, + "learning_rate": 2.699233577173107e-05, + "loss": 0.6394, + "step": 126200 + }, + { + "epoch": 2.9105646126922724, + "grad_norm": 3.12115740776062, + "learning_rate": 2.6985064207908552e-05, + "loss": 0.6344, + "step": 126400 + }, + { + "epoch": 2.9151699364465324, + "grad_norm": 3.0919904708862305, + "learning_rate": 2.6977792644086038e-05, + "loss": 0.6405, + "step": 126600 + }, + { + "epoch": 2.919775260200792, + "grad_norm": 2.8004214763641357, + "learning_rate": 2.6970521080263524e-05, + "loss": 0.6489, + "step": 126800 + }, + { + "epoch": 2.924380583955052, + "grad_norm": 2.920830726623535, + "learning_rate": 2.6963249516441006e-05, + "loss": 0.6394, + "step": 127000 + }, + { + "epoch": 2.928985907709312, + "grad_norm": 3.278106451034546, + "learning_rate": 2.6955977952618492e-05, + "loss": 0.6434, + "step": 127200 + }, + { + "epoch": 2.9335912314635717, + "grad_norm": 3.6461238861083984, + "learning_rate": 2.6948706388795975e-05, + "loss": 0.646, + "step": 127400 + }, + { + "epoch": 2.9381965552178317, + "grad_norm": 3.31779146194458, + "learning_rate": 2.694143482497346e-05, + "loss": 0.6361, + "step": 127600 + }, + { + "epoch": 2.942801878972092, + "grad_norm": 2.7438087463378906, + "learning_rate": 2.6934199618970056e-05, + "loss": 0.6437, + "step": 127800 + }, + { + "epoch": 2.947407202726352, + "grad_norm": 2.8473029136657715, + "learning_rate": 2.692692805514754e-05, + "loss": 0.6338, + "step": 128000 + }, + { + "epoch": 2.952012526480612, + "grad_norm": 2.5815622806549072, + "learning_rate": 2.6919656491325024e-05, + "loss": 0.6371, + "step": 128200 + }, + { + "epoch": 2.9566178502348714, + "grad_norm": 3.202380657196045, + "learning_rate": 2.691238492750251e-05, + "loss": 0.6469, + "step": 128400 + }, + { + "epoch": 2.9612231739891315, + "grad_norm": 3.2944741249084473, + "learning_rate": 2.6905113363679996e-05, + "loss": 0.6515, + "step": 128600 + }, + { + "epoch": 2.9658284977433915, + "grad_norm": 2.853166103363037, + "learning_rate": 2.6897841799857475e-05, + "loss": 0.6392, + "step": 128800 + }, + { + "epoch": 2.970433821497651, + "grad_norm": 3.1663033962249756, + "learning_rate": 2.689057023603496e-05, + "loss": 0.6454, + "step": 129000 + }, + { + "epoch": 2.975039145251911, + "grad_norm": 2.5806081295013428, + "learning_rate": 2.6883298672212447e-05, + "loss": 0.6411, + "step": 129200 + }, + { + "epoch": 2.979644469006171, + "grad_norm": 2.8585219383239746, + "learning_rate": 2.6876027108389933e-05, + "loss": 0.6475, + "step": 129400 + }, + { + "epoch": 2.984249792760431, + "grad_norm": 2.955064058303833, + "learning_rate": 2.6868755544567415e-05, + "loss": 0.6548, + "step": 129600 + }, + { + "epoch": 2.988855116514691, + "grad_norm": 4.072856426239014, + "learning_rate": 2.68614839807449e-05, + "loss": 0.6538, + "step": 129800 + }, + { + "epoch": 2.993460440268951, + "grad_norm": 3.6334667205810547, + "learning_rate": 2.6854212416922384e-05, + "loss": 0.6406, + "step": 130000 + }, + { + "epoch": 2.998065764023211, + "grad_norm": 2.8243021965026855, + "learning_rate": 2.6846940853099866e-05, + "loss": 0.6339, + "step": 130200 + }, + { + "epoch": 3.0, + "eval_loss": 0.5988962650299072, + "eval_runtime": 145.7025, + "eval_samples_per_second": 194.65, + "eval_steps_per_second": 12.169, + "step": 130284 + }, + { + "epoch": 3.002671087777471, + "grad_norm": 2.527531385421753, + "learning_rate": 2.6839669289277352e-05, + "loss": 0.6279, + "step": 130400 + }, + { + "epoch": 3.0072764115317305, + "grad_norm": 3.933335304260254, + "learning_rate": 2.6832397725454838e-05, + "loss": 0.6282, + "step": 130600 + }, + { + "epoch": 3.0118817352859906, + "grad_norm": 3.1796278953552246, + "learning_rate": 2.682512616163232e-05, + "loss": 0.6296, + "step": 130800 + }, + { + "epoch": 3.0164870590402506, + "grad_norm": 2.6977293491363525, + "learning_rate": 2.681789095562892e-05, + "loss": 0.6249, + "step": 131000 + }, + { + "epoch": 3.0210923827945106, + "grad_norm": 3.7220981121063232, + "learning_rate": 2.6810619391806405e-05, + "loss": 0.6298, + "step": 131200 + }, + { + "epoch": 3.0256977065487702, + "grad_norm": 3.126789093017578, + "learning_rate": 2.6803347827983888e-05, + "loss": 0.6473, + "step": 131400 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 2.9881398677825928, + "learning_rate": 2.679607626416137e-05, + "loss": 0.6445, + "step": 131600 + }, + { + "epoch": 3.0349083540572903, + "grad_norm": 4.632811069488525, + "learning_rate": 2.6788804700338856e-05, + "loss": 0.6415, + "step": 131800 + }, + { + "epoch": 3.0395136778115504, + "grad_norm": 3.6200356483459473, + "learning_rate": 2.678153313651634e-05, + "loss": 0.6242, + "step": 132000 + }, + { + "epoch": 3.04411900156581, + "grad_norm": 3.1809945106506348, + "learning_rate": 2.6774261572693825e-05, + "loss": 0.638, + "step": 132200 + }, + { + "epoch": 3.04872432532007, + "grad_norm": 4.0169806480407715, + "learning_rate": 2.676699000887131e-05, + "loss": 0.6335, + "step": 132400 + }, + { + "epoch": 3.05332964907433, + "grad_norm": 3.9464833736419678, + "learning_rate": 2.6759718445048793e-05, + "loss": 0.6376, + "step": 132600 + }, + { + "epoch": 3.05793497282859, + "grad_norm": 2.9559237957000732, + "learning_rate": 2.6752446881226276e-05, + "loss": 0.6343, + "step": 132800 + }, + { + "epoch": 3.0625402965828497, + "grad_norm": 3.0488362312316895, + "learning_rate": 2.674517531740376e-05, + "loss": 0.6426, + "step": 133000 + }, + { + "epoch": 3.0671456203371097, + "grad_norm": 3.531463861465454, + "learning_rate": 2.6737903753581244e-05, + "loss": 0.6438, + "step": 133200 + }, + { + "epoch": 3.0717509440913697, + "grad_norm": 3.7633442878723145, + "learning_rate": 2.673063218975873e-05, + "loss": 0.6383, + "step": 133400 + }, + { + "epoch": 3.0763562678456298, + "grad_norm": 2.790278196334839, + "learning_rate": 2.6723360625936216e-05, + "loss": 0.6376, + "step": 133600 + }, + { + "epoch": 3.0809615915998894, + "grad_norm": 3.4008514881134033, + "learning_rate": 2.67160890621137e-05, + "loss": 0.6293, + "step": 133800 + }, + { + "epoch": 3.0855669153541494, + "grad_norm": 2.998616933822632, + "learning_rate": 2.6708817498291184e-05, + "loss": 0.6387, + "step": 134000 + }, + { + "epoch": 3.0901722391084094, + "grad_norm": 3.212735891342163, + "learning_rate": 2.6701545934468667e-05, + "loss": 0.6208, + "step": 134200 + }, + { + "epoch": 3.094777562862669, + "grad_norm": 3.259173631668091, + "learning_rate": 2.6694310728465262e-05, + "loss": 0.6398, + "step": 134400 + }, + { + "epoch": 3.099382886616929, + "grad_norm": 3.346163511276245, + "learning_rate": 2.6687039164642748e-05, + "loss": 0.6488, + "step": 134600 + }, + { + "epoch": 3.103988210371189, + "grad_norm": 3.005929708480835, + "learning_rate": 2.6679767600820234e-05, + "loss": 0.6339, + "step": 134800 + }, + { + "epoch": 3.108593534125449, + "grad_norm": 3.2009568214416504, + "learning_rate": 2.6672496036997716e-05, + "loss": 0.63, + "step": 135000 + }, + { + "epoch": 3.1131988578797087, + "grad_norm": 2.799675226211548, + "learning_rate": 2.6665224473175202e-05, + "loss": 0.6275, + "step": 135200 + }, + { + "epoch": 3.1178041816339688, + "grad_norm": 3.211223840713501, + "learning_rate": 2.6657952909352688e-05, + "loss": 0.6436, + "step": 135400 + }, + { + "epoch": 3.122409505388229, + "grad_norm": 2.964587926864624, + "learning_rate": 2.665068134553017e-05, + "loss": 0.6512, + "step": 135600 + }, + { + "epoch": 3.127014829142489, + "grad_norm": 3.677194595336914, + "learning_rate": 2.6643409781707653e-05, + "loss": 0.6396, + "step": 135800 + }, + { + "epoch": 3.1316201528967484, + "grad_norm": 3.1306989192962646, + "learning_rate": 2.663613821788514e-05, + "loss": 0.6295, + "step": 136000 + }, + { + "epoch": 3.1362254766510085, + "grad_norm": 2.744112968444824, + "learning_rate": 2.6628866654062625e-05, + "loss": 0.6328, + "step": 136200 + }, + { + "epoch": 3.1408308004052685, + "grad_norm": 3.3055408000946045, + "learning_rate": 2.6621595090240108e-05, + "loss": 0.649, + "step": 136400 + }, + { + "epoch": 3.1454361241595286, + "grad_norm": 2.8039541244506836, + "learning_rate": 2.6614323526417593e-05, + "loss": 0.6382, + "step": 136600 + }, + { + "epoch": 3.150041447913788, + "grad_norm": 4.089864730834961, + "learning_rate": 2.6607051962595076e-05, + "loss": 0.642, + "step": 136800 + }, + { + "epoch": 3.154646771668048, + "grad_norm": 4.057522296905518, + "learning_rate": 2.659978039877256e-05, + "loss": 0.6349, + "step": 137000 + }, + { + "epoch": 3.1592520954223082, + "grad_norm": 3.3105990886688232, + "learning_rate": 2.6592508834950044e-05, + "loss": 0.6389, + "step": 137200 + }, + { + "epoch": 3.1638574191765683, + "grad_norm": 3.4372615814208984, + "learning_rate": 2.6585273628946643e-05, + "loss": 0.6244, + "step": 137400 + }, + { + "epoch": 3.168462742930828, + "grad_norm": 2.707204818725586, + "learning_rate": 2.6578002065124126e-05, + "loss": 0.6341, + "step": 137600 + }, + { + "epoch": 3.173068066685088, + "grad_norm": 2.621185064315796, + "learning_rate": 2.657073050130161e-05, + "loss": 0.6464, + "step": 137800 + }, + { + "epoch": 3.177673390439348, + "grad_norm": 3.090057611465454, + "learning_rate": 2.6563458937479097e-05, + "loss": 0.6257, + "step": 138000 + }, + { + "epoch": 3.182278714193608, + "grad_norm": 2.8256921768188477, + "learning_rate": 2.655618737365658e-05, + "loss": 0.6349, + "step": 138200 + }, + { + "epoch": 3.1868840379478676, + "grad_norm": 4.003274917602539, + "learning_rate": 2.6548915809834062e-05, + "loss": 0.6321, + "step": 138400 + }, + { + "epoch": 3.1914893617021276, + "grad_norm": 3.346566677093506, + "learning_rate": 2.654164424601155e-05, + "loss": 0.636, + "step": 138600 + }, + { + "epoch": 3.1960946854563876, + "grad_norm": 3.386808156967163, + "learning_rate": 2.653437268218903e-05, + "loss": 0.6287, + "step": 138800 + }, + { + "epoch": 3.2007000092106477, + "grad_norm": 2.970651149749756, + "learning_rate": 2.6527101118366517e-05, + "loss": 0.6299, + "step": 139000 + }, + { + "epoch": 3.2053053329649073, + "grad_norm": 3.1046876907348633, + "learning_rate": 2.6519865912363115e-05, + "loss": 0.6245, + "step": 139200 + }, + { + "epoch": 3.2099106567191673, + "grad_norm": 3.6521196365356445, + "learning_rate": 2.6512594348540598e-05, + "loss": 0.6315, + "step": 139400 + }, + { + "epoch": 3.2145159804734273, + "grad_norm": 2.994075298309326, + "learning_rate": 2.6505322784718084e-05, + "loss": 0.6459, + "step": 139600 + }, + { + "epoch": 3.2191213042276874, + "grad_norm": 3.732825517654419, + "learning_rate": 2.6498051220895566e-05, + "loss": 0.6445, + "step": 139800 + }, + { + "epoch": 3.223726627981947, + "grad_norm": 3.046517848968506, + "learning_rate": 2.649077965707305e-05, + "loss": 0.6352, + "step": 140000 + }, + { + "epoch": 3.228331951736207, + "grad_norm": 3.441150426864624, + "learning_rate": 2.6483508093250535e-05, + "loss": 0.6337, + "step": 140200 + }, + { + "epoch": 3.232937275490467, + "grad_norm": 4.240359306335449, + "learning_rate": 2.647623652942802e-05, + "loss": 0.6317, + "step": 140400 + }, + { + "epoch": 3.237542599244727, + "grad_norm": 3.5375964641571045, + "learning_rate": 2.6468964965605503e-05, + "loss": 0.6309, + "step": 140600 + }, + { + "epoch": 3.2421479229989867, + "grad_norm": 3.648287773132324, + "learning_rate": 2.646169340178299e-05, + "loss": 0.6336, + "step": 140800 + }, + { + "epoch": 3.2467532467532467, + "grad_norm": 3.4129178524017334, + "learning_rate": 2.6454421837960475e-05, + "loss": 0.6226, + "step": 141000 + }, + { + "epoch": 3.2513585705075068, + "grad_norm": 3.6263647079467773, + "learning_rate": 2.6447150274137954e-05, + "loss": 0.6289, + "step": 141200 + }, + { + "epoch": 3.2559638942617664, + "grad_norm": 2.798513889312744, + "learning_rate": 2.643987871031544e-05, + "loss": 0.631, + "step": 141400 + }, + { + "epoch": 3.2605692180160264, + "grad_norm": 3.113395929336548, + "learning_rate": 2.6432607146492926e-05, + "loss": 0.6316, + "step": 141600 + }, + { + "epoch": 3.2651745417702864, + "grad_norm": 2.998884916305542, + "learning_rate": 2.642533558267041e-05, + "loss": 0.6218, + "step": 141800 + }, + { + "epoch": 3.2697798655245465, + "grad_norm": 3.264549493789673, + "learning_rate": 2.6418064018847894e-05, + "loss": 0.6306, + "step": 142000 + }, + { + "epoch": 3.2743851892788065, + "grad_norm": 2.9390909671783447, + "learning_rate": 2.641079245502538e-05, + "loss": 0.6357, + "step": 142200 + }, + { + "epoch": 3.278990513033066, + "grad_norm": 2.7646069526672363, + "learning_rate": 2.6403520891202863e-05, + "loss": 0.6232, + "step": 142400 + }, + { + "epoch": 3.283595836787326, + "grad_norm": 3.1988773345947266, + "learning_rate": 2.6396249327380345e-05, + "loss": 0.6246, + "step": 142600 + }, + { + "epoch": 3.288201160541586, + "grad_norm": 2.912640333175659, + "learning_rate": 2.638897776355783e-05, + "loss": 0.6361, + "step": 142800 + }, + { + "epoch": 3.2928064842958458, + "grad_norm": 3.98209810256958, + "learning_rate": 2.6381706199735317e-05, + "loss": 0.6234, + "step": 143000 + }, + { + "epoch": 3.297411808050106, + "grad_norm": 3.3420772552490234, + "learning_rate": 2.63744346359128e-05, + "loss": 0.6344, + "step": 143200 + }, + { + "epoch": 3.302017131804366, + "grad_norm": 4.18234395980835, + "learning_rate": 2.6367163072090286e-05, + "loss": 0.6178, + "step": 143400 + }, + { + "epoch": 3.306622455558626, + "grad_norm": 2.9908995628356934, + "learning_rate": 2.6359891508267768e-05, + "loss": 0.6181, + "step": 143600 + }, + { + "epoch": 3.311227779312886, + "grad_norm": 3.504472255706787, + "learning_rate": 2.635261994444525e-05, + "loss": 0.6403, + "step": 143800 + }, + { + "epoch": 3.3158331030671455, + "grad_norm": 3.0107550621032715, + "learning_rate": 2.6345348380622737e-05, + "loss": 0.6338, + "step": 144000 + }, + { + "epoch": 3.3204384268214056, + "grad_norm": 3.5131990909576416, + "learning_rate": 2.6338076816800223e-05, + "loss": 0.6412, + "step": 144200 + }, + { + "epoch": 3.3250437505756656, + "grad_norm": 3.315256357192993, + "learning_rate": 2.6330805252977705e-05, + "loss": 0.6329, + "step": 144400 + }, + { + "epoch": 3.329649074329925, + "grad_norm": 3.0303072929382324, + "learning_rate": 2.632353368915519e-05, + "loss": 0.6312, + "step": 144600 + }, + { + "epoch": 3.3342543980841852, + "grad_norm": 3.3184735774993896, + "learning_rate": 2.6316262125332677e-05, + "loss": 0.647, + "step": 144800 + }, + { + "epoch": 3.3388597218384453, + "grad_norm": 2.842087745666504, + "learning_rate": 2.630899056151016e-05, + "loss": 0.622, + "step": 145000 + }, + { + "epoch": 3.3434650455927053, + "grad_norm": 2.741712808609009, + "learning_rate": 2.6301718997687642e-05, + "loss": 0.635, + "step": 145200 + }, + { + "epoch": 3.3480703693469653, + "grad_norm": 2.9912500381469727, + "learning_rate": 2.6294447433865128e-05, + "loss": 0.6426, + "step": 145400 + }, + { + "epoch": 3.352675693101225, + "grad_norm": 3.2431087493896484, + "learning_rate": 2.6287175870042614e-05, + "loss": 0.6314, + "step": 145600 + }, + { + "epoch": 3.357281016855485, + "grad_norm": 2.5913467407226562, + "learning_rate": 2.6279904306220096e-05, + "loss": 0.6282, + "step": 145800 + }, + { + "epoch": 3.361886340609745, + "grad_norm": 2.8884477615356445, + "learning_rate": 2.6272632742397582e-05, + "loss": 0.6446, + "step": 146000 + }, + { + "epoch": 3.3664916643640046, + "grad_norm": 3.9637069702148438, + "learning_rate": 2.6265361178575065e-05, + "loss": 0.6327, + "step": 146200 + }, + { + "epoch": 3.3710969881182646, + "grad_norm": 3.9412338733673096, + "learning_rate": 2.6258089614752547e-05, + "loss": 0.6294, + "step": 146400 + }, + { + "epoch": 3.3757023118725247, + "grad_norm": 3.3022706508636475, + "learning_rate": 2.6250818050930033e-05, + "loss": 0.6326, + "step": 146600 + }, + { + "epoch": 3.3803076356267847, + "grad_norm": 2.921814441680908, + "learning_rate": 2.624354648710752e-05, + "loss": 0.6312, + "step": 146800 + }, + { + "epoch": 3.3849129593810443, + "grad_norm": 3.0949013233184814, + "learning_rate": 2.6236274923285e-05, + "loss": 0.6192, + "step": 147000 + }, + { + "epoch": 3.3895182831353043, + "grad_norm": 4.004492282867432, + "learning_rate": 2.6229003359462488e-05, + "loss": 0.6298, + "step": 147200 + }, + { + "epoch": 3.3941236068895644, + "grad_norm": 3.4825210571289062, + "learning_rate": 2.622173179563997e-05, + "loss": 0.6333, + "step": 147400 + }, + { + "epoch": 3.3987289306438244, + "grad_norm": 3.0541634559631348, + "learning_rate": 2.6214460231817453e-05, + "loss": 0.6347, + "step": 147600 + }, + { + "epoch": 3.403334254398084, + "grad_norm": 4.5528693199157715, + "learning_rate": 2.6207261383633164e-05, + "loss": 0.6381, + "step": 147800 + }, + { + "epoch": 3.407939578152344, + "grad_norm": 3.09186053276062, + "learning_rate": 2.6199989819810646e-05, + "loss": 0.6424, + "step": 148000 + }, + { + "epoch": 3.412544901906604, + "grad_norm": 4.061122894287109, + "learning_rate": 2.6192718255988132e-05, + "loss": 0.6292, + "step": 148200 + }, + { + "epoch": 3.417150225660864, + "grad_norm": 3.7221198081970215, + "learning_rate": 2.6185446692165618e-05, + "loss": 0.6446, + "step": 148400 + }, + { + "epoch": 3.4217555494151237, + "grad_norm": 2.8360812664031982, + "learning_rate": 2.6178175128343104e-05, + "loss": 0.6492, + "step": 148600 + }, + { + "epoch": 3.4263608731693838, + "grad_norm": 3.033463954925537, + "learning_rate": 2.6170903564520587e-05, + "loss": 0.6273, + "step": 148800 + }, + { + "epoch": 3.430966196923644, + "grad_norm": 5.270578384399414, + "learning_rate": 2.6163632000698073e-05, + "loss": 0.6235, + "step": 149000 + }, + { + "epoch": 3.435571520677904, + "grad_norm": 2.8622264862060547, + "learning_rate": 2.6156360436875555e-05, + "loss": 0.6325, + "step": 149200 + }, + { + "epoch": 3.4401768444321634, + "grad_norm": 3.450784206390381, + "learning_rate": 2.6149088873053038e-05, + "loss": 0.6373, + "step": 149400 + }, + { + "epoch": 3.4447821681864235, + "grad_norm": 3.269690990447998, + "learning_rate": 2.6141817309230524e-05, + "loss": 0.6274, + "step": 149600 + }, + { + "epoch": 3.4493874919406835, + "grad_norm": 3.7646055221557617, + "learning_rate": 2.613454574540801e-05, + "loss": 0.6287, + "step": 149800 + }, + { + "epoch": 3.4539928156949435, + "grad_norm": 3.5802409648895264, + "learning_rate": 2.6127274181585492e-05, + "loss": 0.6322, + "step": 150000 + }, + { + "epoch": 3.458598139449203, + "grad_norm": 3.1698813438415527, + "learning_rate": 2.6120002617762978e-05, + "loss": 0.6339, + "step": 150200 + }, + { + "epoch": 3.463203463203463, + "grad_norm": 2.7178969383239746, + "learning_rate": 2.611273105394046e-05, + "loss": 0.6382, + "step": 150400 + }, + { + "epoch": 3.467808786957723, + "grad_norm": 3.088505506515503, + "learning_rate": 2.6105459490117943e-05, + "loss": 0.6256, + "step": 150600 + }, + { + "epoch": 3.4724141107119832, + "grad_norm": 2.7076618671417236, + "learning_rate": 2.609818792629543e-05, + "loss": 0.6312, + "step": 150800 + }, + { + "epoch": 3.477019434466243, + "grad_norm": 2.320732831954956, + "learning_rate": 2.6090916362472915e-05, + "loss": 0.6289, + "step": 151000 + }, + { + "epoch": 3.481624758220503, + "grad_norm": 3.147030830383301, + "learning_rate": 2.6083644798650397e-05, + "loss": 0.641, + "step": 151200 + }, + { + "epoch": 3.486230081974763, + "grad_norm": 3.3665804862976074, + "learning_rate": 2.6076373234827883e-05, + "loss": 0.633, + "step": 151400 + }, + { + "epoch": 3.4908354057290225, + "grad_norm": 3.705167531967163, + "learning_rate": 2.606910167100537e-05, + "loss": 0.6372, + "step": 151600 + }, + { + "epoch": 3.4954407294832825, + "grad_norm": 2.7983779907226562, + "learning_rate": 2.606183010718285e-05, + "loss": 0.6277, + "step": 151800 + }, + { + "epoch": 3.5000460532375426, + "grad_norm": 3.0177760124206543, + "learning_rate": 2.6054594901179447e-05, + "loss": 0.6277, + "step": 152000 + }, + { + "epoch": 3.5046513769918026, + "grad_norm": 3.7722854614257812, + "learning_rate": 2.6047323337356933e-05, + "loss": 0.6278, + "step": 152200 + }, + { + "epoch": 3.5092567007460627, + "grad_norm": 2.6710495948791504, + "learning_rate": 2.6040051773534415e-05, + "loss": 0.6202, + "step": 152400 + }, + { + "epoch": 3.5138620245003223, + "grad_norm": 3.0754504203796387, + "learning_rate": 2.60327802097119e-05, + "loss": 0.6317, + "step": 152600 + }, + { + "epoch": 3.5184673482545823, + "grad_norm": 3.376631498336792, + "learning_rate": 2.60255450037085e-05, + "loss": 0.6422, + "step": 152800 + }, + { + "epoch": 3.5230726720088423, + "grad_norm": 2.970846652984619, + "learning_rate": 2.6018273439885982e-05, + "loss": 0.6315, + "step": 153000 + }, + { + "epoch": 3.527677995763102, + "grad_norm": 2.7988736629486084, + "learning_rate": 2.6011001876063468e-05, + "loss": 0.6321, + "step": 153200 + }, + { + "epoch": 3.532283319517362, + "grad_norm": 2.94100022315979, + "learning_rate": 2.600373031224095e-05, + "loss": 0.6253, + "step": 153400 + }, + { + "epoch": 3.536888643271622, + "grad_norm": 3.08990216255188, + "learning_rate": 2.5996458748418433e-05, + "loss": 0.6337, + "step": 153600 + }, + { + "epoch": 3.541493967025882, + "grad_norm": 2.9995460510253906, + "learning_rate": 2.598918718459592e-05, + "loss": 0.634, + "step": 153800 + }, + { + "epoch": 3.546099290780142, + "grad_norm": 3.2981879711151123, + "learning_rate": 2.5981915620773405e-05, + "loss": 0.6263, + "step": 154000 + }, + { + "epoch": 3.5507046145344017, + "grad_norm": 3.200422525405884, + "learning_rate": 2.5974644056950888e-05, + "loss": 0.6185, + "step": 154200 + }, + { + "epoch": 3.5553099382886617, + "grad_norm": 2.9688191413879395, + "learning_rate": 2.5967372493128374e-05, + "loss": 0.6213, + "step": 154400 + }, + { + "epoch": 3.5599152620429217, + "grad_norm": 2.5572378635406494, + "learning_rate": 2.5960100929305856e-05, + "loss": 0.6365, + "step": 154600 + }, + { + "epoch": 3.5645205857971813, + "grad_norm": 2.690509557723999, + "learning_rate": 2.5952829365483342e-05, + "loss": 0.6428, + "step": 154800 + }, + { + "epoch": 3.5691259095514414, + "grad_norm": 5.786709785461426, + "learning_rate": 2.5945594159479937e-05, + "loss": 0.6448, + "step": 155000 + }, + { + "epoch": 3.5737312333057014, + "grad_norm": 2.30430269241333, + "learning_rate": 2.5938322595657423e-05, + "loss": 0.6192, + "step": 155200 + }, + { + "epoch": 3.5783365570599615, + "grad_norm": 3.188671588897705, + "learning_rate": 2.5931051031834906e-05, + "loss": 0.6335, + "step": 155400 + }, + { + "epoch": 3.5829418808142215, + "grad_norm": 2.945617198944092, + "learning_rate": 2.592377946801239e-05, + "loss": 0.6265, + "step": 155600 + }, + { + "epoch": 3.587547204568481, + "grad_norm": 3.1751980781555176, + "learning_rate": 2.5916507904189878e-05, + "loss": 0.6199, + "step": 155800 + }, + { + "epoch": 3.592152528322741, + "grad_norm": 2.6950361728668213, + "learning_rate": 2.590923634036736e-05, + "loss": 0.6283, + "step": 156000 + }, + { + "epoch": 3.596757852077001, + "grad_norm": 3.544735908508301, + "learning_rate": 2.5901964776544843e-05, + "loss": 0.6281, + "step": 156200 + }, + { + "epoch": 3.6013631758312608, + "grad_norm": 2.9812283515930176, + "learning_rate": 2.589469321272233e-05, + "loss": 0.6275, + "step": 156400 + }, + { + "epoch": 3.605968499585521, + "grad_norm": 3.3013553619384766, + "learning_rate": 2.5887421648899814e-05, + "loss": 0.6292, + "step": 156600 + }, + { + "epoch": 3.610573823339781, + "grad_norm": 3.0218076705932617, + "learning_rate": 2.5880150085077297e-05, + "loss": 0.6279, + "step": 156800 + }, + { + "epoch": 3.615179147094041, + "grad_norm": 2.9628562927246094, + "learning_rate": 2.5872878521254783e-05, + "loss": 0.6444, + "step": 157000 + }, + { + "epoch": 3.619784470848301, + "grad_norm": 3.080231189727783, + "learning_rate": 2.586560695743227e-05, + "loss": 0.6305, + "step": 157200 + }, + { + "epoch": 3.6243897946025605, + "grad_norm": 3.3439502716064453, + "learning_rate": 2.5858335393609748e-05, + "loss": 0.6354, + "step": 157400 + }, + { + "epoch": 3.6289951183568205, + "grad_norm": 3.6231765747070312, + "learning_rate": 2.5851063829787234e-05, + "loss": 0.622, + "step": 157600 + }, + { + "epoch": 3.6336004421110806, + "grad_norm": 4.266796588897705, + "learning_rate": 2.584379226596472e-05, + "loss": 0.6385, + "step": 157800 + }, + { + "epoch": 3.63820576586534, + "grad_norm": 2.9517550468444824, + "learning_rate": 2.5836520702142202e-05, + "loss": 0.6319, + "step": 158000 + }, + { + "epoch": 3.6428110896196, + "grad_norm": 3.549234390258789, + "learning_rate": 2.5829249138319688e-05, + "loss": 0.6233, + "step": 158200 + }, + { + "epoch": 3.6474164133738602, + "grad_norm": 2.957629442214966, + "learning_rate": 2.5821977574497174e-05, + "loss": 0.628, + "step": 158400 + }, + { + "epoch": 3.65202173712812, + "grad_norm": 2.4832262992858887, + "learning_rate": 2.581474236849377e-05, + "loss": 0.6252, + "step": 158600 + }, + { + "epoch": 3.65662706088238, + "grad_norm": 3.6070032119750977, + "learning_rate": 2.5807470804671255e-05, + "loss": 0.6317, + "step": 158800 + }, + { + "epoch": 3.66123238463664, + "grad_norm": 3.4381484985351562, + "learning_rate": 2.5800199240848738e-05, + "loss": 0.6174, + "step": 159000 + }, + { + "epoch": 3.6658377083909, + "grad_norm": 2.8952486515045166, + "learning_rate": 2.579292767702622e-05, + "loss": 0.6334, + "step": 159200 + }, + { + "epoch": 3.67044303214516, + "grad_norm": 3.704712390899658, + "learning_rate": 2.5785656113203706e-05, + "loss": 0.6371, + "step": 159400 + }, + { + "epoch": 3.6750483558994196, + "grad_norm": 3.430786609649658, + "learning_rate": 2.5778384549381192e-05, + "loss": 0.6369, + "step": 159600 + }, + { + "epoch": 3.6796536796536796, + "grad_norm": 3.201442003250122, + "learning_rate": 2.5771112985558675e-05, + "loss": 0.6274, + "step": 159800 + }, + { + "epoch": 3.6842590034079397, + "grad_norm": 3.125777244567871, + "learning_rate": 2.576384142173616e-05, + "loss": 0.6318, + "step": 160000 + }, + { + "epoch": 3.6888643271621993, + "grad_norm": 2.6248064041137695, + "learning_rate": 2.5756569857913643e-05, + "loss": 0.6344, + "step": 160200 + }, + { + "epoch": 3.6934696509164593, + "grad_norm": 3.5570108890533447, + "learning_rate": 2.5749298294091126e-05, + "loss": 0.6293, + "step": 160400 + }, + { + "epoch": 3.6980749746707193, + "grad_norm": 3.13261342048645, + "learning_rate": 2.5742063088087724e-05, + "loss": 0.634, + "step": 160600 + }, + { + "epoch": 3.7026802984249794, + "grad_norm": 3.070614814758301, + "learning_rate": 2.573479152426521e-05, + "loss": 0.6255, + "step": 160800 + }, + { + "epoch": 3.7072856221792394, + "grad_norm": 2.835911750793457, + "learning_rate": 2.5727519960442693e-05, + "loss": 0.6224, + "step": 161000 + }, + { + "epoch": 3.711890945933499, + "grad_norm": 2.9913852214813232, + "learning_rate": 2.572024839662018e-05, + "loss": 0.6335, + "step": 161200 + }, + { + "epoch": 3.716496269687759, + "grad_norm": 3.0731775760650635, + "learning_rate": 2.5712976832797664e-05, + "loss": 0.6479, + "step": 161400 + }, + { + "epoch": 3.721101593442019, + "grad_norm": 3.5844905376434326, + "learning_rate": 2.5705705268975144e-05, + "loss": 0.6201, + "step": 161600 + }, + { + "epoch": 3.7257069171962787, + "grad_norm": 3.9543886184692383, + "learning_rate": 2.569843370515263e-05, + "loss": 0.6318, + "step": 161800 + }, + { + "epoch": 3.7303122409505387, + "grad_norm": 2.6492578983306885, + "learning_rate": 2.5691162141330115e-05, + "loss": 0.6319, + "step": 162000 + }, + { + "epoch": 3.7349175647047987, + "grad_norm": 3.418893814086914, + "learning_rate": 2.5683890577507598e-05, + "loss": 0.637, + "step": 162200 + }, + { + "epoch": 3.739522888459059, + "grad_norm": 2.8401198387145996, + "learning_rate": 2.5676619013685084e-05, + "loss": 0.6223, + "step": 162400 + }, + { + "epoch": 3.744128212213319, + "grad_norm": 3.19801664352417, + "learning_rate": 2.566934744986257e-05, + "loss": 0.6277, + "step": 162600 + }, + { + "epoch": 3.7487335359675784, + "grad_norm": 2.9500930309295654, + "learning_rate": 2.5662075886040052e-05, + "loss": 0.6275, + "step": 162800 + }, + { + "epoch": 3.7533388597218384, + "grad_norm": 2.7662055492401123, + "learning_rate": 2.5654804322217535e-05, + "loss": 0.619, + "step": 163000 + }, + { + "epoch": 3.7579441834760985, + "grad_norm": 2.61580753326416, + "learning_rate": 2.564753275839502e-05, + "loss": 0.6305, + "step": 163200 + }, + { + "epoch": 3.762549507230358, + "grad_norm": 3.7837417125701904, + "learning_rate": 2.5640261194572507e-05, + "loss": 0.6273, + "step": 163400 + }, + { + "epoch": 3.767154830984618, + "grad_norm": 3.516418695449829, + "learning_rate": 2.563298963074999e-05, + "loss": 0.638, + "step": 163600 + }, + { + "epoch": 3.771760154738878, + "grad_norm": 2.6570441722869873, + "learning_rate": 2.5625718066927475e-05, + "loss": 0.6319, + "step": 163800 + }, + { + "epoch": 3.776365478493138, + "grad_norm": 2.759058952331543, + "learning_rate": 2.561844650310496e-05, + "loss": 0.6406, + "step": 164000 + }, + { + "epoch": 3.7809708022473982, + "grad_norm": 2.742692232131958, + "learning_rate": 2.561117493928244e-05, + "loss": 0.6278, + "step": 164200 + }, + { + "epoch": 3.785576126001658, + "grad_norm": 3.729440212249756, + "learning_rate": 2.5603903375459926e-05, + "loss": 0.6289, + "step": 164400 + }, + { + "epoch": 3.790181449755918, + "grad_norm": 3.219895601272583, + "learning_rate": 2.5596631811637412e-05, + "loss": 0.6275, + "step": 164600 + }, + { + "epoch": 3.794786773510178, + "grad_norm": 3.222836971282959, + "learning_rate": 2.5589360247814894e-05, + "loss": 0.6331, + "step": 164800 + }, + { + "epoch": 3.7993920972644375, + "grad_norm": 2.4223623275756836, + "learning_rate": 2.558208868399238e-05, + "loss": 0.623, + "step": 165000 + }, + { + "epoch": 3.8039974210186975, + "grad_norm": 3.0702037811279297, + "learning_rate": 2.5574817120169866e-05, + "loss": 0.6285, + "step": 165200 + }, + { + "epoch": 3.8086027447729576, + "grad_norm": 2.8104872703552246, + "learning_rate": 2.556758191416646e-05, + "loss": 0.6353, + "step": 165400 + }, + { + "epoch": 3.8132080685272176, + "grad_norm": 3.1389353275299072, + "learning_rate": 2.556034670816306e-05, + "loss": 0.6363, + "step": 165600 + }, + { + "epoch": 3.8178133922814776, + "grad_norm": 3.318134307861328, + "learning_rate": 2.5553111502159655e-05, + "loss": 0.6259, + "step": 165800 + }, + { + "epoch": 3.8224187160357372, + "grad_norm": 3.075765609741211, + "learning_rate": 2.554583993833714e-05, + "loss": 0.6325, + "step": 166000 + }, + { + "epoch": 3.8270240397899973, + "grad_norm": 3.2406067848205566, + "learning_rate": 2.5538568374514624e-05, + "loss": 0.6303, + "step": 166200 + }, + { + "epoch": 3.8316293635442573, + "grad_norm": 3.4027702808380127, + "learning_rate": 2.5531296810692106e-05, + "loss": 0.634, + "step": 166400 + }, + { + "epoch": 3.836234687298517, + "grad_norm": 3.2992939949035645, + "learning_rate": 2.5524025246869592e-05, + "loss": 0.6285, + "step": 166600 + }, + { + "epoch": 3.840840011052777, + "grad_norm": 3.0388951301574707, + "learning_rate": 2.5516753683047078e-05, + "loss": 0.631, + "step": 166800 + }, + { + "epoch": 3.845445334807037, + "grad_norm": 3.0130348205566406, + "learning_rate": 2.550948211922456e-05, + "loss": 0.6239, + "step": 167000 + }, + { + "epoch": 3.850050658561297, + "grad_norm": 3.0870108604431152, + "learning_rate": 2.5502210555402047e-05, + "loss": 0.6318, + "step": 167200 + }, + { + "epoch": 3.8546559823155566, + "grad_norm": 3.3640689849853516, + "learning_rate": 2.549493899157953e-05, + "loss": 0.6381, + "step": 167400 + }, + { + "epoch": 3.8592613060698167, + "grad_norm": 3.3157622814178467, + "learning_rate": 2.548766742775701e-05, + "loss": 0.6208, + "step": 167600 + }, + { + "epoch": 3.8638666298240767, + "grad_norm": 3.0967700481414795, + "learning_rate": 2.5480395863934498e-05, + "loss": 0.6294, + "step": 167800 + }, + { + "epoch": 3.8684719535783367, + "grad_norm": 2.9279990196228027, + "learning_rate": 2.5473124300111983e-05, + "loss": 0.629, + "step": 168000 + }, + { + "epoch": 3.8730772773325963, + "grad_norm": 3.182149648666382, + "learning_rate": 2.546585273628947e-05, + "loss": 0.62, + "step": 168200 + }, + { + "epoch": 3.8776826010868564, + "grad_norm": 2.424578905105591, + "learning_rate": 2.5458581172466952e-05, + "loss": 0.6178, + "step": 168400 + }, + { + "epoch": 3.8822879248411164, + "grad_norm": 3.4072725772857666, + "learning_rate": 2.5451309608644434e-05, + "loss": 0.6325, + "step": 168600 + }, + { + "epoch": 3.886893248595376, + "grad_norm": 3.33803653717041, + "learning_rate": 2.544407440264103e-05, + "loss": 0.6347, + "step": 168800 + }, + { + "epoch": 3.891498572349636, + "grad_norm": 3.434117555618286, + "learning_rate": 2.5436802838818516e-05, + "loss": 0.6278, + "step": 169000 + }, + { + "epoch": 3.896103896103896, + "grad_norm": 3.191801071166992, + "learning_rate": 2.5429531274996e-05, + "loss": 0.6292, + "step": 169200 + }, + { + "epoch": 3.900709219858156, + "grad_norm": 3.666332483291626, + "learning_rate": 2.5422259711173484e-05, + "loss": 0.6279, + "step": 169400 + }, + { + "epoch": 3.905314543612416, + "grad_norm": 4.157709121704102, + "learning_rate": 2.541498814735097e-05, + "loss": 0.6244, + "step": 169600 + }, + { + "epoch": 3.9099198673666757, + "grad_norm": 2.783106565475464, + "learning_rate": 2.5407716583528456e-05, + "loss": 0.6278, + "step": 169800 + }, + { + "epoch": 3.9145251911209358, + "grad_norm": 3.1697680950164795, + "learning_rate": 2.5400445019705942e-05, + "loss": 0.6312, + "step": 170000 + }, + { + "epoch": 3.919130514875196, + "grad_norm": 2.8394782543182373, + "learning_rate": 2.539317345588342e-05, + "loss": 0.6282, + "step": 170200 + }, + { + "epoch": 3.9237358386294554, + "grad_norm": 2.653749704360962, + "learning_rate": 2.5385901892060907e-05, + "loss": 0.6185, + "step": 170400 + }, + { + "epoch": 3.9283411623837154, + "grad_norm": 2.6118738651275635, + "learning_rate": 2.5378630328238393e-05, + "loss": 0.6259, + "step": 170600 + }, + { + "epoch": 3.9329464861379755, + "grad_norm": 4.355154991149902, + "learning_rate": 2.5371358764415875e-05, + "loss": 0.6394, + "step": 170800 + }, + { + "epoch": 3.9375518098922355, + "grad_norm": 2.658700466156006, + "learning_rate": 2.536408720059336e-05, + "loss": 0.6277, + "step": 171000 + }, + { + "epoch": 3.9421571336464956, + "grad_norm": 3.9519739151000977, + "learning_rate": 2.5356815636770847e-05, + "loss": 0.6268, + "step": 171200 + }, + { + "epoch": 3.946762457400755, + "grad_norm": 2.6966614723205566, + "learning_rate": 2.5349544072948326e-05, + "loss": 0.6368, + "step": 171400 + }, + { + "epoch": 3.951367781155015, + "grad_norm": 2.29526686668396, + "learning_rate": 2.5342272509125812e-05, + "loss": 0.6327, + "step": 171600 + }, + { + "epoch": 3.9559731049092752, + "grad_norm": 2.904367208480835, + "learning_rate": 2.5335000945303298e-05, + "loss": 0.6201, + "step": 171800 + }, + { + "epoch": 3.960578428663535, + "grad_norm": 3.2330174446105957, + "learning_rate": 2.532772938148078e-05, + "loss": 0.6297, + "step": 172000 + }, + { + "epoch": 3.965183752417795, + "grad_norm": 2.654149293899536, + "learning_rate": 2.5320457817658266e-05, + "loss": 0.6334, + "step": 172200 + }, + { + "epoch": 3.969789076172055, + "grad_norm": 2.883559465408325, + "learning_rate": 2.5313186253835752e-05, + "loss": 0.6175, + "step": 172400 + }, + { + "epoch": 3.974394399926315, + "grad_norm": 2.8193156719207764, + "learning_rate": 2.5305914690013235e-05, + "loss": 0.6293, + "step": 172600 + }, + { + "epoch": 3.978999723680575, + "grad_norm": 3.1371946334838867, + "learning_rate": 2.5298643126190717e-05, + "loss": 0.6256, + "step": 172800 + }, + { + "epoch": 3.9836050474348346, + "grad_norm": 3.378344774246216, + "learning_rate": 2.5291371562368203e-05, + "loss": 0.6241, + "step": 173000 + }, + { + "epoch": 3.9882103711890946, + "grad_norm": 3.4124484062194824, + "learning_rate": 2.52841363563648e-05, + "loss": 0.6255, + "step": 173200 + }, + { + "epoch": 3.9928156949433546, + "grad_norm": 2.8864808082580566, + "learning_rate": 2.5276901150361397e-05, + "loss": 0.6253, + "step": 173400 + }, + { + "epoch": 3.9974210186976142, + "grad_norm": 2.594573736190796, + "learning_rate": 2.5269629586538883e-05, + "loss": 0.6265, + "step": 173600 + }, + { + "epoch": 4.0, + "eval_loss": 0.5913488864898682, + "eval_runtime": 146.1973, + "eval_samples_per_second": 193.991, + "eval_steps_per_second": 12.127, + "step": 173712 + }, + { + "epoch": 4.002026342451875, + "grad_norm": 4.0110249519348145, + "learning_rate": 2.5262358022716366e-05, + "loss": 0.6277, + "step": 173800 + }, + { + "epoch": 4.006631666206134, + "grad_norm": 3.5719571113586426, + "learning_rate": 2.525508645889385e-05, + "loss": 0.6222, + "step": 174000 + }, + { + "epoch": 4.011236989960394, + "grad_norm": 3.5779929161071777, + "learning_rate": 2.5247814895071337e-05, + "loss": 0.6147, + "step": 174200 + }, + { + "epoch": 4.015842313714654, + "grad_norm": 2.7468457221984863, + "learning_rate": 2.5240543331248817e-05, + "loss": 0.6193, + "step": 174400 + }, + { + "epoch": 4.020447637468914, + "grad_norm": 3.6863133907318115, + "learning_rate": 2.5233271767426302e-05, + "loss": 0.627, + "step": 174600 + }, + { + "epoch": 4.025052961223174, + "grad_norm": 3.8438150882720947, + "learning_rate": 2.522600020360379e-05, + "loss": 0.6304, + "step": 174800 + }, + { + "epoch": 4.029658284977434, + "grad_norm": 3.4153921604156494, + "learning_rate": 2.521872863978127e-05, + "loss": 0.6209, + "step": 175000 + }, + { + "epoch": 4.034263608731694, + "grad_norm": 3.0748400688171387, + "learning_rate": 2.5211457075958757e-05, + "loss": 0.6252, + "step": 175200 + }, + { + "epoch": 4.038868932485954, + "grad_norm": 4.052780628204346, + "learning_rate": 2.5204185512136243e-05, + "loss": 0.6123, + "step": 175400 + }, + { + "epoch": 4.043474256240214, + "grad_norm": 3.1808793544769287, + "learning_rate": 2.5196913948313722e-05, + "loss": 0.6148, + "step": 175600 + }, + { + "epoch": 4.048079579994473, + "grad_norm": 2.881800413131714, + "learning_rate": 2.5189642384491208e-05, + "loss": 0.6156, + "step": 175800 + }, + { + "epoch": 4.052684903748734, + "grad_norm": 2.8981564044952393, + "learning_rate": 2.5182370820668694e-05, + "loss": 0.6211, + "step": 176000 + }, + { + "epoch": 4.057290227502993, + "grad_norm": 3.5495710372924805, + "learning_rate": 2.517509925684618e-05, + "loss": 0.617, + "step": 176200 + }, + { + "epoch": 4.061895551257253, + "grad_norm": 2.4835290908813477, + "learning_rate": 2.5167827693023662e-05, + "loss": 0.6297, + "step": 176400 + }, + { + "epoch": 4.0665008750115135, + "grad_norm": 2.574183225631714, + "learning_rate": 2.5160556129201148e-05, + "loss": 0.6176, + "step": 176600 + }, + { + "epoch": 4.071106198765773, + "grad_norm": 3.0550966262817383, + "learning_rate": 2.5153284565378634e-05, + "loss": 0.6242, + "step": 176800 + }, + { + "epoch": 4.0757115225200335, + "grad_norm": 3.120473861694336, + "learning_rate": 2.5146013001556113e-05, + "loss": 0.6201, + "step": 177000 + }, + { + "epoch": 4.080316846274293, + "grad_norm": 2.9657979011535645, + "learning_rate": 2.51387414377336e-05, + "loss": 0.6219, + "step": 177200 + }, + { + "epoch": 4.084922170028553, + "grad_norm": 2.9161338806152344, + "learning_rate": 2.5131469873911085e-05, + "loss": 0.623, + "step": 177400 + }, + { + "epoch": 4.089527493782813, + "grad_norm": 2.9412291049957275, + "learning_rate": 2.5124198310088567e-05, + "loss": 0.6353, + "step": 177600 + }, + { + "epoch": 4.094132817537073, + "grad_norm": 3.1718437671661377, + "learning_rate": 2.5116926746266053e-05, + "loss": 0.6205, + "step": 177800 + }, + { + "epoch": 4.098738141291332, + "grad_norm": 4.523801326751709, + "learning_rate": 2.510965518244354e-05, + "loss": 0.6243, + "step": 178000 + }, + { + "epoch": 4.103343465045593, + "grad_norm": 2.9366707801818848, + "learning_rate": 2.510238361862102e-05, + "loss": 0.619, + "step": 178200 + }, + { + "epoch": 4.1079487887998525, + "grad_norm": 2.822934865951538, + "learning_rate": 2.5095148412617617e-05, + "loss": 0.6183, + "step": 178400 + }, + { + "epoch": 4.112554112554113, + "grad_norm": 4.115916728973389, + "learning_rate": 2.5087876848795103e-05, + "loss": 0.6153, + "step": 178600 + }, + { + "epoch": 4.1171594363083726, + "grad_norm": 3.5023276805877686, + "learning_rate": 2.5080605284972585e-05, + "loss": 0.6262, + "step": 178800 + }, + { + "epoch": 4.121764760062632, + "grad_norm": 2.906297206878662, + "learning_rate": 2.507333372115007e-05, + "loss": 0.6307, + "step": 179000 + }, + { + "epoch": 4.126370083816893, + "grad_norm": 2.6169967651367188, + "learning_rate": 2.5066062157327557e-05, + "loss": 0.625, + "step": 179200 + }, + { + "epoch": 4.130975407571152, + "grad_norm": 3.4697892665863037, + "learning_rate": 2.505879059350504e-05, + "loss": 0.631, + "step": 179400 + }, + { + "epoch": 4.135580731325412, + "grad_norm": 3.589235305786133, + "learning_rate": 2.5051519029682522e-05, + "loss": 0.6238, + "step": 179600 + }, + { + "epoch": 4.140186055079672, + "grad_norm": 3.2380635738372803, + "learning_rate": 2.5044247465860008e-05, + "loss": 0.6222, + "step": 179800 + }, + { + "epoch": 4.144791378833932, + "grad_norm": 2.568429946899414, + "learning_rate": 2.503697590203749e-05, + "loss": 0.6218, + "step": 180000 + }, + { + "epoch": 4.149396702588192, + "grad_norm": 3.3458428382873535, + "learning_rate": 2.5029704338214977e-05, + "loss": 0.6222, + "step": 180200 + }, + { + "epoch": 4.154002026342452, + "grad_norm": 3.1617233753204346, + "learning_rate": 2.5022432774392463e-05, + "loss": 0.6284, + "step": 180400 + }, + { + "epoch": 4.158607350096712, + "grad_norm": 2.6967251300811768, + "learning_rate": 2.5015197568389058e-05, + "loss": 0.6211, + "step": 180600 + }, + { + "epoch": 4.163212673850972, + "grad_norm": 4.2213568687438965, + "learning_rate": 2.5007926004566544e-05, + "loss": 0.6242, + "step": 180800 + }, + { + "epoch": 4.167817997605232, + "grad_norm": 3.8117053508758545, + "learning_rate": 2.500065444074403e-05, + "loss": 0.6182, + "step": 181000 + }, + { + "epoch": 4.172423321359491, + "grad_norm": 3.316926956176758, + "learning_rate": 2.499338287692151e-05, + "loss": 0.6174, + "step": 181200 + }, + { + "epoch": 4.177028645113752, + "grad_norm": 3.363097667694092, + "learning_rate": 2.4986111313098995e-05, + "loss": 0.6207, + "step": 181400 + }, + { + "epoch": 4.181633968868011, + "grad_norm": 3.064410924911499, + "learning_rate": 2.497883974927648e-05, + "loss": 0.6218, + "step": 181600 + }, + { + "epoch": 4.186239292622272, + "grad_norm": 2.6976685523986816, + "learning_rate": 2.4971568185453963e-05, + "loss": 0.6128, + "step": 181800 + }, + { + "epoch": 4.190844616376531, + "grad_norm": 3.532400608062744, + "learning_rate": 2.496429662163145e-05, + "loss": 0.6185, + "step": 182000 + }, + { + "epoch": 4.195449940130791, + "grad_norm": 2.791353702545166, + "learning_rate": 2.4957025057808935e-05, + "loss": 0.6279, + "step": 182200 + }, + { + "epoch": 4.2000552638850515, + "grad_norm": 2.433746814727783, + "learning_rate": 2.4949753493986417e-05, + "loss": 0.6216, + "step": 182400 + }, + { + "epoch": 4.204660587639311, + "grad_norm": 3.122479200363159, + "learning_rate": 2.49424819301639e-05, + "loss": 0.6206, + "step": 182600 + }, + { + "epoch": 4.209265911393571, + "grad_norm": 3.7672858238220215, + "learning_rate": 2.4935210366341386e-05, + "loss": 0.6327, + "step": 182800 + }, + { + "epoch": 4.213871235147831, + "grad_norm": 2.832796812057495, + "learning_rate": 2.4927938802518872e-05, + "loss": 0.6363, + "step": 183000 + }, + { + "epoch": 4.218476558902091, + "grad_norm": 2.60891056060791, + "learning_rate": 2.4920667238696354e-05, + "loss": 0.6279, + "step": 183200 + }, + { + "epoch": 4.22308188265635, + "grad_norm": 2.9585559368133545, + "learning_rate": 2.491339567487384e-05, + "loss": 0.6306, + "step": 183400 + }, + { + "epoch": 4.227687206410611, + "grad_norm": 2.4493677616119385, + "learning_rate": 2.4906124111051326e-05, + "loss": 0.6151, + "step": 183600 + }, + { + "epoch": 4.23229253016487, + "grad_norm": 3.008601188659668, + "learning_rate": 2.4898852547228805e-05, + "loss": 0.61, + "step": 183800 + }, + { + "epoch": 4.236897853919131, + "grad_norm": 3.0694305896759033, + "learning_rate": 2.489158098340629e-05, + "loss": 0.6369, + "step": 184000 + }, + { + "epoch": 4.2415031776733905, + "grad_norm": 3.306530714035034, + "learning_rate": 2.4884309419583777e-05, + "loss": 0.6271, + "step": 184200 + }, + { + "epoch": 4.24610850142765, + "grad_norm": 3.441470146179199, + "learning_rate": 2.487703785576126e-05, + "loss": 0.6163, + "step": 184400 + }, + { + "epoch": 4.2507138251819105, + "grad_norm": 2.845263957977295, + "learning_rate": 2.4869766291938746e-05, + "loss": 0.6207, + "step": 184600 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 3.332775354385376, + "learning_rate": 2.4862531085935344e-05, + "loss": 0.6221, + "step": 184800 + }, + { + "epoch": 4.25992447269043, + "grad_norm": 2.332599639892578, + "learning_rate": 2.4855259522112827e-05, + "loss": 0.6207, + "step": 185000 + }, + { + "epoch": 4.26452979644469, + "grad_norm": 4.441643238067627, + "learning_rate": 2.4848024316109425e-05, + "loss": 0.6276, + "step": 185200 + }, + { + "epoch": 4.26913512019895, + "grad_norm": 3.2958297729492188, + "learning_rate": 2.4840752752286904e-05, + "loss": 0.6209, + "step": 185400 + }, + { + "epoch": 4.27374044395321, + "grad_norm": 2.6615874767303467, + "learning_rate": 2.483348118846439e-05, + "loss": 0.6227, + "step": 185600 + }, + { + "epoch": 4.27834576770747, + "grad_norm": 2.7402963638305664, + "learning_rate": 2.4826209624641876e-05, + "loss": 0.613, + "step": 185800 + }, + { + "epoch": 4.2829510914617295, + "grad_norm": 3.0777735710144043, + "learning_rate": 2.4818938060819362e-05, + "loss": 0.6105, + "step": 186000 + }, + { + "epoch": 4.28755641521599, + "grad_norm": 3.108518123626709, + "learning_rate": 2.4811702854815957e-05, + "loss": 0.6155, + "step": 186200 + }, + { + "epoch": 4.2921617389702496, + "grad_norm": 2.7684409618377686, + "learning_rate": 2.4804431290993443e-05, + "loss": 0.6132, + "step": 186400 + }, + { + "epoch": 4.296767062724509, + "grad_norm": 3.1115176677703857, + "learning_rate": 2.4797159727170926e-05, + "loss": 0.6306, + "step": 186600 + }, + { + "epoch": 4.30137238647877, + "grad_norm": 3.601163387298584, + "learning_rate": 2.478988816334841e-05, + "loss": 0.6142, + "step": 186800 + }, + { + "epoch": 4.305977710233029, + "grad_norm": 3.0134575366973877, + "learning_rate": 2.4782616599525894e-05, + "loss": 0.6248, + "step": 187000 + }, + { + "epoch": 4.31058303398729, + "grad_norm": 2.950279951095581, + "learning_rate": 2.4775345035703377e-05, + "loss": 0.6184, + "step": 187200 + }, + { + "epoch": 4.315188357741549, + "grad_norm": 2.8289947509765625, + "learning_rate": 2.4768073471880863e-05, + "loss": 0.6292, + "step": 187400 + }, + { + "epoch": 4.319793681495809, + "grad_norm": 3.4812533855438232, + "learning_rate": 2.476080190805835e-05, + "loss": 0.6284, + "step": 187600 + }, + { + "epoch": 4.324399005250069, + "grad_norm": 4.80519437789917, + "learning_rate": 2.4753530344235835e-05, + "loss": 0.6278, + "step": 187800 + }, + { + "epoch": 4.329004329004329, + "grad_norm": 2.8514554500579834, + "learning_rate": 2.4746258780413317e-05, + "loss": 0.6139, + "step": 188000 + }, + { + "epoch": 4.333609652758589, + "grad_norm": 2.441417694091797, + "learning_rate": 2.47389872165908e-05, + "loss": 0.6142, + "step": 188200 + }, + { + "epoch": 4.338214976512849, + "grad_norm": 3.1779348850250244, + "learning_rate": 2.4731715652768285e-05, + "loss": 0.6245, + "step": 188400 + }, + { + "epoch": 4.342820300267109, + "grad_norm": 3.2313997745513916, + "learning_rate": 2.4724444088945768e-05, + "loss": 0.631, + "step": 188600 + }, + { + "epoch": 4.347425624021369, + "grad_norm": 3.2089149951934814, + "learning_rate": 2.4717172525123254e-05, + "loss": 0.6131, + "step": 188800 + }, + { + "epoch": 4.352030947775629, + "grad_norm": 3.3893136978149414, + "learning_rate": 2.470990096130074e-05, + "loss": 0.6144, + "step": 189000 + }, + { + "epoch": 4.356636271529888, + "grad_norm": 3.192901611328125, + "learning_rate": 2.4702629397478222e-05, + "loss": 0.62, + "step": 189200 + }, + { + "epoch": 4.361241595284149, + "grad_norm": 3.3176310062408447, + "learning_rate": 2.4695357833655705e-05, + "loss": 0.623, + "step": 189400 + }, + { + "epoch": 4.365846919038408, + "grad_norm": 3.1651062965393066, + "learning_rate": 2.468808626983319e-05, + "loss": 0.6313, + "step": 189600 + }, + { + "epoch": 4.370452242792668, + "grad_norm": 4.049862861633301, + "learning_rate": 2.4680814706010673e-05, + "loss": 0.6236, + "step": 189800 + }, + { + "epoch": 4.3750575665469285, + "grad_norm": 3.2649123668670654, + "learning_rate": 2.467354314218816e-05, + "loss": 0.614, + "step": 190000 + }, + { + "epoch": 4.379662890301188, + "grad_norm": 2.824345350265503, + "learning_rate": 2.4666271578365645e-05, + "loss": 0.6238, + "step": 190200 + }, + { + "epoch": 4.384268214055448, + "grad_norm": 3.1499876976013184, + "learning_rate": 2.4659000014543128e-05, + "loss": 0.6226, + "step": 190400 + }, + { + "epoch": 4.388873537809708, + "grad_norm": 3.3163814544677734, + "learning_rate": 2.4651728450720614e-05, + "loss": 0.6126, + "step": 190600 + }, + { + "epoch": 4.393478861563968, + "grad_norm": 2.8196558952331543, + "learning_rate": 2.4644493244717212e-05, + "loss": 0.6226, + "step": 190800 + }, + { + "epoch": 4.398084185318228, + "grad_norm": 2.6953351497650146, + "learning_rate": 2.463722168089469e-05, + "loss": 0.62, + "step": 191000 + }, + { + "epoch": 4.402689509072488, + "grad_norm": 2.942456007003784, + "learning_rate": 2.4629950117072177e-05, + "loss": 0.6265, + "step": 191200 + }, + { + "epoch": 4.407294832826747, + "grad_norm": 2.7047855854034424, + "learning_rate": 2.4622678553249663e-05, + "loss": 0.6177, + "step": 191400 + }, + { + "epoch": 4.411900156581008, + "grad_norm": 3.093517541885376, + "learning_rate": 2.4615406989427146e-05, + "loss": 0.6225, + "step": 191600 + }, + { + "epoch": 4.4165054803352675, + "grad_norm": 3.8072400093078613, + "learning_rate": 2.4608171783423744e-05, + "loss": 0.6177, + "step": 191800 + }, + { + "epoch": 4.421110804089528, + "grad_norm": 2.6595051288604736, + "learning_rate": 2.460090021960123e-05, + "loss": 0.6163, + "step": 192000 + }, + { + "epoch": 4.4257161278437875, + "grad_norm": 3.101194143295288, + "learning_rate": 2.4593628655778713e-05, + "loss": 0.6157, + "step": 192200 + }, + { + "epoch": 4.430321451598047, + "grad_norm": 2.7680416107177734, + "learning_rate": 2.4586357091956195e-05, + "loss": 0.6209, + "step": 192400 + }, + { + "epoch": 4.434926775352308, + "grad_norm": 3.988497734069824, + "learning_rate": 2.457908552813368e-05, + "loss": 0.6246, + "step": 192600 + }, + { + "epoch": 4.439532099106567, + "grad_norm": 3.0158700942993164, + "learning_rate": 2.4571813964311164e-05, + "loss": 0.6225, + "step": 192800 + }, + { + "epoch": 4.444137422860827, + "grad_norm": 3.032444953918457, + "learning_rate": 2.456454240048865e-05, + "loss": 0.6229, + "step": 193000 + }, + { + "epoch": 4.448742746615087, + "grad_norm": 2.915055274963379, + "learning_rate": 2.4557270836666136e-05, + "loss": 0.629, + "step": 193200 + }, + { + "epoch": 4.453348070369347, + "grad_norm": 3.404008388519287, + "learning_rate": 2.4549999272843618e-05, + "loss": 0.6233, + "step": 193400 + }, + { + "epoch": 4.4579533941236065, + "grad_norm": 2.661874294281006, + "learning_rate": 2.45427277090211e-05, + "loss": 0.6216, + "step": 193600 + }, + { + "epoch": 4.462558717877867, + "grad_norm": 3.4487946033477783, + "learning_rate": 2.4535456145198586e-05, + "loss": 0.6312, + "step": 193800 + }, + { + "epoch": 4.4671640416321265, + "grad_norm": 2.926490068435669, + "learning_rate": 2.4528184581376072e-05, + "loss": 0.6134, + "step": 194000 + }, + { + "epoch": 4.471769365386387, + "grad_norm": 3.7607150077819824, + "learning_rate": 2.4520913017553555e-05, + "loss": 0.621, + "step": 194200 + }, + { + "epoch": 4.476374689140647, + "grad_norm": 4.96255350112915, + "learning_rate": 2.451364145373104e-05, + "loss": 0.622, + "step": 194400 + }, + { + "epoch": 4.480980012894906, + "grad_norm": 3.3920106887817383, + "learning_rate": 2.4506369889908527e-05, + "loss": 0.6184, + "step": 194600 + }, + { + "epoch": 4.485585336649167, + "grad_norm": 3.106212854385376, + "learning_rate": 2.449909832608601e-05, + "loss": 0.624, + "step": 194800 + }, + { + "epoch": 4.490190660403426, + "grad_norm": 3.0986621379852295, + "learning_rate": 2.4491826762263492e-05, + "loss": 0.6199, + "step": 195000 + }, + { + "epoch": 4.494795984157686, + "grad_norm": 2.7702393531799316, + "learning_rate": 2.4484555198440978e-05, + "loss": 0.6219, + "step": 195200 + }, + { + "epoch": 4.499401307911946, + "grad_norm": 2.7158069610595703, + "learning_rate": 2.447728363461846e-05, + "loss": 0.6228, + "step": 195400 + }, + { + "epoch": 4.504006631666206, + "grad_norm": 2.6804583072662354, + "learning_rate": 2.4470012070795946e-05, + "loss": 0.6264, + "step": 195600 + }, + { + "epoch": 4.508611955420466, + "grad_norm": 3.254936695098877, + "learning_rate": 2.4462740506973432e-05, + "loss": 0.6178, + "step": 195800 + }, + { + "epoch": 4.513217279174726, + "grad_norm": 2.8413448333740234, + "learning_rate": 2.4455468943150915e-05, + "loss": 0.6221, + "step": 196000 + }, + { + "epoch": 4.517822602928986, + "grad_norm": 3.7565321922302246, + "learning_rate": 2.4448197379328397e-05, + "loss": 0.6167, + "step": 196200 + }, + { + "epoch": 4.522427926683246, + "grad_norm": 2.5034019947052, + "learning_rate": 2.4440925815505883e-05, + "loss": 0.6137, + "step": 196400 + }, + { + "epoch": 4.527033250437506, + "grad_norm": 3.1947107315063477, + "learning_rate": 2.4433654251683366e-05, + "loss": 0.621, + "step": 196600 + }, + { + "epoch": 4.531638574191765, + "grad_norm": 2.7699062824249268, + "learning_rate": 2.4426419045679964e-05, + "loss": 0.6192, + "step": 196800 + }, + { + "epoch": 4.536243897946026, + "grad_norm": 3.791555166244507, + "learning_rate": 2.441914748185745e-05, + "loss": 0.633, + "step": 197000 + }, + { + "epoch": 4.540849221700285, + "grad_norm": 2.6840012073516846, + "learning_rate": 2.4411875918034933e-05, + "loss": 0.6252, + "step": 197200 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 3.7765004634857178, + "learning_rate": 2.440460435421242e-05, + "loss": 0.6171, + "step": 197400 + }, + { + "epoch": 4.5500598692088055, + "grad_norm": 2.8564860820770264, + "learning_rate": 2.4397332790389904e-05, + "loss": 0.6235, + "step": 197600 + }, + { + "epoch": 4.554665192963065, + "grad_norm": 3.1641488075256348, + "learning_rate": 2.4390061226567384e-05, + "loss": 0.6197, + "step": 197800 + }, + { + "epoch": 4.5592705167173255, + "grad_norm": 3.0477914810180664, + "learning_rate": 2.438278966274487e-05, + "loss": 0.6269, + "step": 198000 + }, + { + "epoch": 4.563875840471585, + "grad_norm": 3.0091304779052734, + "learning_rate": 2.4375518098922355e-05, + "loss": 0.622, + "step": 198200 + }, + { + "epoch": 4.568481164225845, + "grad_norm": 3.7036385536193848, + "learning_rate": 2.4368246535099838e-05, + "loss": 0.6151, + "step": 198400 + }, + { + "epoch": 4.573086487980105, + "grad_norm": 2.7939934730529785, + "learning_rate": 2.4360974971277324e-05, + "loss": 0.6295, + "step": 198600 + }, + { + "epoch": 4.577691811734365, + "grad_norm": 3.725222110748291, + "learning_rate": 2.435370340745481e-05, + "loss": 0.6212, + "step": 198800 + }, + { + "epoch": 4.582297135488625, + "grad_norm": 3.55149245262146, + "learning_rate": 2.4346431843632292e-05, + "loss": 0.6143, + "step": 199000 + }, + { + "epoch": 4.586902459242885, + "grad_norm": 3.657155990600586, + "learning_rate": 2.4339160279809775e-05, + "loss": 0.6245, + "step": 199200 + }, + { + "epoch": 4.5915077829971445, + "grad_norm": 2.774144172668457, + "learning_rate": 2.433188871598726e-05, + "loss": 0.6263, + "step": 199400 + }, + { + "epoch": 4.596113106751405, + "grad_norm": 2.516934633255005, + "learning_rate": 2.4324617152164747e-05, + "loss": 0.6096, + "step": 199600 + }, + { + "epoch": 4.6007184305056645, + "grad_norm": 3.243980646133423, + "learning_rate": 2.431734558834223e-05, + "loss": 0.6136, + "step": 199800 + }, + { + "epoch": 4.605323754259924, + "grad_norm": 3.0042223930358887, + "learning_rate": 2.4310110382338828e-05, + "loss": 0.6219, + "step": 200000 + }, + { + "epoch": 4.609929078014185, + "grad_norm": 2.626081943511963, + "learning_rate": 2.430283881851631e-05, + "loss": 0.6093, + "step": 200200 + }, + { + "epoch": 4.614534401768444, + "grad_norm": 3.4706902503967285, + "learning_rate": 2.4295567254693793e-05, + "loss": 0.6217, + "step": 200400 + }, + { + "epoch": 4.619139725522704, + "grad_norm": 3.2710297107696533, + "learning_rate": 2.428829569087128e-05, + "loss": 0.617, + "step": 200600 + }, + { + "epoch": 4.623745049276964, + "grad_norm": 2.6531553268432617, + "learning_rate": 2.4281024127048765e-05, + "loss": 0.636, + "step": 200800 + }, + { + "epoch": 4.628350373031224, + "grad_norm": 2.9135398864746094, + "learning_rate": 2.4273752563226247e-05, + "loss": 0.6269, + "step": 201000 + }, + { + "epoch": 4.632955696785484, + "grad_norm": 2.918564558029175, + "learning_rate": 2.4266480999403733e-05, + "loss": 0.6139, + "step": 201200 + }, + { + "epoch": 4.637561020539744, + "grad_norm": 3.0329713821411133, + "learning_rate": 2.425920943558122e-05, + "loss": 0.619, + "step": 201400 + }, + { + "epoch": 4.6421663442940035, + "grad_norm": 2.717499017715454, + "learning_rate": 2.42519378717587e-05, + "loss": 0.6141, + "step": 201600 + }, + { + "epoch": 4.646771668048264, + "grad_norm": 3.2214903831481934, + "learning_rate": 2.4244666307936184e-05, + "loss": 0.614, + "step": 201800 + }, + { + "epoch": 4.651376991802524, + "grad_norm": 2.94065260887146, + "learning_rate": 2.423739474411367e-05, + "loss": 0.6206, + "step": 202000 + }, + { + "epoch": 4.655982315556784, + "grad_norm": 3.6057605743408203, + "learning_rate": 2.4230123180291152e-05, + "loss": 0.6097, + "step": 202200 + }, + { + "epoch": 4.660587639311044, + "grad_norm": 3.165396213531494, + "learning_rate": 2.422285161646864e-05, + "loss": 0.6283, + "step": 202400 + }, + { + "epoch": 4.665192963065303, + "grad_norm": 3.9540810585021973, + "learning_rate": 2.4215580052646124e-05, + "loss": 0.6247, + "step": 202600 + }, + { + "epoch": 4.669798286819564, + "grad_norm": 2.56862735748291, + "learning_rate": 2.4208308488823607e-05, + "loss": 0.6188, + "step": 202800 + }, + { + "epoch": 4.674403610573823, + "grad_norm": 2.8995814323425293, + "learning_rate": 2.420103692500109e-05, + "loss": 0.6152, + "step": 203000 + }, + { + "epoch": 4.679008934328083, + "grad_norm": 3.891038656234741, + "learning_rate": 2.4193765361178575e-05, + "loss": 0.6129, + "step": 203200 + }, + { + "epoch": 4.683614258082343, + "grad_norm": 3.055830478668213, + "learning_rate": 2.418649379735606e-05, + "loss": 0.6204, + "step": 203400 + }, + { + "epoch": 4.688219581836603, + "grad_norm": 2.924553394317627, + "learning_rate": 2.4179222233533544e-05, + "loss": 0.6245, + "step": 203600 + }, + { + "epoch": 4.692824905590863, + "grad_norm": 3.2216031551361084, + "learning_rate": 2.4171987027530142e-05, + "loss": 0.6289, + "step": 203800 + }, + { + "epoch": 4.697430229345123, + "grad_norm": 3.0679335594177246, + "learning_rate": 2.4164715463707625e-05, + "loss": 0.6114, + "step": 204000 + }, + { + "epoch": 4.702035553099383, + "grad_norm": 2.5067596435546875, + "learning_rate": 2.415744389988511e-05, + "loss": 0.6313, + "step": 204200 + }, + { + "epoch": 4.706640876853643, + "grad_norm": 3.141855239868164, + "learning_rate": 2.4150172336062597e-05, + "loss": 0.6201, + "step": 204400 + }, + { + "epoch": 4.711246200607903, + "grad_norm": 2.854891300201416, + "learning_rate": 2.4142900772240076e-05, + "loss": 0.6298, + "step": 204600 + }, + { + "epoch": 4.715851524362162, + "grad_norm": 3.0763156414031982, + "learning_rate": 2.413562920841756e-05, + "loss": 0.6246, + "step": 204800 + }, + { + "epoch": 4.720456848116423, + "grad_norm": 3.858271837234497, + "learning_rate": 2.4128357644595048e-05, + "loss": 0.6288, + "step": 205000 + }, + { + "epoch": 4.7250621718706824, + "grad_norm": 3.183344602584839, + "learning_rate": 2.4121086080772534e-05, + "loss": 0.614, + "step": 205200 + }, + { + "epoch": 4.729667495624943, + "grad_norm": 3.646907329559326, + "learning_rate": 2.4113814516950016e-05, + "loss": 0.6259, + "step": 205400 + }, + { + "epoch": 4.7342728193792025, + "grad_norm": 3.3939154148101807, + "learning_rate": 2.4106542953127502e-05, + "loss": 0.6315, + "step": 205600 + }, + { + "epoch": 4.738878143133462, + "grad_norm": 3.364961862564087, + "learning_rate": 2.4099271389304984e-05, + "loss": 0.6232, + "step": 205800 + }, + { + "epoch": 4.743483466887723, + "grad_norm": 2.7885661125183105, + "learning_rate": 2.4091999825482467e-05, + "loss": 0.6219, + "step": 206000 + }, + { + "epoch": 4.748088790641982, + "grad_norm": 3.6182515621185303, + "learning_rate": 2.4084728261659953e-05, + "loss": 0.6302, + "step": 206200 + }, + { + "epoch": 4.752694114396242, + "grad_norm": 2.98297381401062, + "learning_rate": 2.407745669783744e-05, + "loss": 0.6217, + "step": 206400 + }, + { + "epoch": 4.757299438150502, + "grad_norm": 4.925945281982422, + "learning_rate": 2.407018513401492e-05, + "loss": 0.6299, + "step": 206600 + }, + { + "epoch": 4.761904761904762, + "grad_norm": 2.90513277053833, + "learning_rate": 2.406294992801152e-05, + "loss": 0.6254, + "step": 206800 + }, + { + "epoch": 4.7665100856590215, + "grad_norm": 2.9922261238098145, + "learning_rate": 2.4055714722008115e-05, + "loss": 0.6204, + "step": 207000 + }, + { + "epoch": 4.771115409413282, + "grad_norm": 3.269758939743042, + "learning_rate": 2.40484431581856e-05, + "loss": 0.6091, + "step": 207200 + }, + { + "epoch": 4.7757207331675415, + "grad_norm": 2.7046399116516113, + "learning_rate": 2.4041171594363084e-05, + "loss": 0.6204, + "step": 207400 + }, + { + "epoch": 4.780326056921801, + "grad_norm": 3.494405508041382, + "learning_rate": 2.4033900030540566e-05, + "loss": 0.6361, + "step": 207600 + }, + { + "epoch": 4.784931380676062, + "grad_norm": 2.91849684715271, + "learning_rate": 2.4026628466718052e-05, + "loss": 0.623, + "step": 207800 + }, + { + "epoch": 4.789536704430321, + "grad_norm": 2.500559091567993, + "learning_rate": 2.4019356902895538e-05, + "loss": 0.615, + "step": 208000 + }, + { + "epoch": 4.794142028184582, + "grad_norm": 3.2106189727783203, + "learning_rate": 2.401208533907302e-05, + "loss": 0.6345, + "step": 208200 + }, + { + "epoch": 4.798747351938841, + "grad_norm": 3.0638959407806396, + "learning_rate": 2.4004813775250506e-05, + "loss": 0.6176, + "step": 208400 + }, + { + "epoch": 4.803352675693101, + "grad_norm": 3.2531685829162598, + "learning_rate": 2.3997542211427992e-05, + "loss": 0.628, + "step": 208600 + }, + { + "epoch": 4.807957999447361, + "grad_norm": 2.51589035987854, + "learning_rate": 2.3990270647605475e-05, + "loss": 0.6179, + "step": 208800 + }, + { + "epoch": 4.812563323201621, + "grad_norm": 4.157649040222168, + "learning_rate": 2.3982999083782957e-05, + "loss": 0.6069, + "step": 209000 + }, + { + "epoch": 4.817168646955881, + "grad_norm": 2.5439136028289795, + "learning_rate": 2.3975727519960443e-05, + "loss": 0.6243, + "step": 209200 + }, + { + "epoch": 4.821773970710141, + "grad_norm": 2.936598300933838, + "learning_rate": 2.396845595613793e-05, + "loss": 0.6232, + "step": 209400 + }, + { + "epoch": 4.826379294464401, + "grad_norm": 3.0496158599853516, + "learning_rate": 2.3961184392315412e-05, + "loss": 0.6308, + "step": 209600 + }, + { + "epoch": 4.830984618218661, + "grad_norm": 3.6222925186157227, + "learning_rate": 2.3953912828492898e-05, + "loss": 0.6168, + "step": 209800 + }, + { + "epoch": 4.835589941972921, + "grad_norm": 3.679816722869873, + "learning_rate": 2.394664126467038e-05, + "loss": 0.6095, + "step": 210000 + }, + { + "epoch": 4.84019526572718, + "grad_norm": 3.8149142265319824, + "learning_rate": 2.3939369700847863e-05, + "loss": 0.616, + "step": 210200 + }, + { + "epoch": 4.844800589481441, + "grad_norm": 3.1322624683380127, + "learning_rate": 2.393209813702535e-05, + "loss": 0.6137, + "step": 210400 + }, + { + "epoch": 4.8494059132357, + "grad_norm": 2.972074031829834, + "learning_rate": 2.3924826573202835e-05, + "loss": 0.6372, + "step": 210600 + }, + { + "epoch": 4.85401123698996, + "grad_norm": 2.327326774597168, + "learning_rate": 2.3917555009380317e-05, + "loss": 0.6182, + "step": 210800 + }, + { + "epoch": 4.85861656074422, + "grad_norm": 3.573911666870117, + "learning_rate": 2.3910283445557803e-05, + "loss": 0.6278, + "step": 211000 + }, + { + "epoch": 4.86322188449848, + "grad_norm": 3.1551811695098877, + "learning_rate": 2.39030482395544e-05, + "loss": 0.6174, + "step": 211200 + }, + { + "epoch": 4.8678272082527405, + "grad_norm": 3.1177220344543457, + "learning_rate": 2.389577667573188e-05, + "loss": 0.6203, + "step": 211400 + }, + { + "epoch": 4.872432532007, + "grad_norm": 3.517178773880005, + "learning_rate": 2.3888505111909367e-05, + "loss": 0.6165, + "step": 211600 + }, + { + "epoch": 4.87703785576126, + "grad_norm": 2.932919502258301, + "learning_rate": 2.3881233548086853e-05, + "loss": 0.6154, + "step": 211800 + }, + { + "epoch": 4.88164317951552, + "grad_norm": 3.162429094314575, + "learning_rate": 2.3873961984264335e-05, + "loss": 0.6215, + "step": 212000 + }, + { + "epoch": 4.88624850326978, + "grad_norm": 3.3734543323516846, + "learning_rate": 2.386669042044182e-05, + "loss": 0.6068, + "step": 212200 + }, + { + "epoch": 4.89085382702404, + "grad_norm": 3.567715644836426, + "learning_rate": 2.3859418856619307e-05, + "loss": 0.6183, + "step": 212400 + }, + { + "epoch": 4.8954591507783, + "grad_norm": 3.8765225410461426, + "learning_rate": 2.385214729279679e-05, + "loss": 0.6122, + "step": 212600 + }, + { + "epoch": 4.9000644745325594, + "grad_norm": 3.328627109527588, + "learning_rate": 2.3844875728974272e-05, + "loss": 0.6196, + "step": 212800 + }, + { + "epoch": 4.90466979828682, + "grad_norm": 3.3088150024414062, + "learning_rate": 2.3837604165151758e-05, + "loss": 0.6223, + "step": 213000 + }, + { + "epoch": 4.9092751220410795, + "grad_norm": 3.936145305633545, + "learning_rate": 2.3830368959148353e-05, + "loss": 0.6145, + "step": 213200 + }, + { + "epoch": 4.913880445795339, + "grad_norm": 4.120792388916016, + "learning_rate": 2.382309739532584e-05, + "loss": 0.6168, + "step": 213400 + }, + { + "epoch": 4.9184857695496, + "grad_norm": 2.7531352043151855, + "learning_rate": 2.3815825831503325e-05, + "loss": 0.608, + "step": 213600 + }, + { + "epoch": 4.923091093303859, + "grad_norm": 3.97782301902771, + "learning_rate": 2.3808554267680807e-05, + "loss": 0.6283, + "step": 213800 + }, + { + "epoch": 4.927696417058119, + "grad_norm": 2.840651035308838, + "learning_rate": 2.3801282703858293e-05, + "loss": 0.608, + "step": 214000 + }, + { + "epoch": 4.932301740812379, + "grad_norm": 2.7774887084960938, + "learning_rate": 2.3794011140035776e-05, + "loss": 0.6015, + "step": 214200 + }, + { + "epoch": 4.936907064566639, + "grad_norm": 2.750030040740967, + "learning_rate": 2.378677593403237e-05, + "loss": 0.5992, + "step": 214400 + }, + { + "epoch": 4.941512388320899, + "grad_norm": 3.340794563293457, + "learning_rate": 2.3779504370209857e-05, + "loss": 0.6196, + "step": 214600 + }, + { + "epoch": 4.946117712075159, + "grad_norm": 2.626340866088867, + "learning_rate": 2.3772232806387343e-05, + "loss": 0.6199, + "step": 214800 + }, + { + "epoch": 4.9507230358294185, + "grad_norm": 2.984501361846924, + "learning_rate": 2.3764961242564825e-05, + "loss": 0.6095, + "step": 215000 + }, + { + "epoch": 4.955328359583679, + "grad_norm": 3.6400811672210693, + "learning_rate": 2.375768967874231e-05, + "loss": 0.6172, + "step": 215200 + }, + { + "epoch": 4.959933683337939, + "grad_norm": 3.16477632522583, + "learning_rate": 2.3750418114919797e-05, + "loss": 0.6134, + "step": 215400 + }, + { + "epoch": 4.964539007092198, + "grad_norm": 3.852839708328247, + "learning_rate": 2.374314655109728e-05, + "loss": 0.6247, + "step": 215600 + }, + { + "epoch": 4.969144330846459, + "grad_norm": 3.3444173336029053, + "learning_rate": 2.3735874987274762e-05, + "loss": 0.6085, + "step": 215800 + }, + { + "epoch": 4.973749654600718, + "grad_norm": 4.497110366821289, + "learning_rate": 2.3728603423452248e-05, + "loss": 0.6162, + "step": 216000 + }, + { + "epoch": 4.978354978354979, + "grad_norm": 2.6496007442474365, + "learning_rate": 2.372133185962973e-05, + "loss": 0.6196, + "step": 216200 + }, + { + "epoch": 4.982960302109238, + "grad_norm": 3.1037495136260986, + "learning_rate": 2.371409665362633e-05, + "loss": 0.6156, + "step": 216400 + }, + { + "epoch": 4.987565625863498, + "grad_norm": 2.7602591514587402, + "learning_rate": 2.3706825089803815e-05, + "loss": 0.628, + "step": 216600 + }, + { + "epoch": 4.992170949617758, + "grad_norm": 4.069486141204834, + "learning_rate": 2.3699553525981298e-05, + "loss": 0.6232, + "step": 216800 + }, + { + "epoch": 4.996776273372018, + "grad_norm": 2.9536936283111572, + "learning_rate": 2.3692281962158784e-05, + "loss": 0.6129, + "step": 217000 + }, + { + "epoch": 5.0, + "eval_loss": 0.5846441388130188, + "eval_runtime": 145.6578, + "eval_samples_per_second": 194.71, + "eval_steps_per_second": 12.172, + "step": 217140 + }, + { + "epoch": 5.001381597126278, + "grad_norm": 3.7548468112945557, + "learning_rate": 2.3685010398336266e-05, + "loss": 0.6154, + "step": 217200 + }, + { + "epoch": 5.005986920880538, + "grad_norm": 3.002516269683838, + "learning_rate": 2.367773883451375e-05, + "loss": 0.6113, + "step": 217400 + }, + { + "epoch": 5.010592244634798, + "grad_norm": 2.816727638244629, + "learning_rate": 2.3670467270691235e-05, + "loss": 0.6102, + "step": 217600 + }, + { + "epoch": 5.015197568389058, + "grad_norm": 2.5023694038391113, + "learning_rate": 2.366319570686872e-05, + "loss": 0.6157, + "step": 217800 + }, + { + "epoch": 5.019802892143318, + "grad_norm": 3.309704542160034, + "learning_rate": 2.3655924143046203e-05, + "loss": 0.6056, + "step": 218000 + }, + { + "epoch": 5.024408215897577, + "grad_norm": 2.3339779376983643, + "learning_rate": 2.364865257922369e-05, + "loss": 0.6094, + "step": 218200 + }, + { + "epoch": 5.029013539651838, + "grad_norm": 3.4080960750579834, + "learning_rate": 2.364138101540117e-05, + "loss": 0.6022, + "step": 218400 + }, + { + "epoch": 5.033618863406097, + "grad_norm": 3.769949197769165, + "learning_rate": 2.3634109451578657e-05, + "loss": 0.6158, + "step": 218600 + }, + { + "epoch": 5.038224187160357, + "grad_norm": 3.2221176624298096, + "learning_rate": 2.362683788775614e-05, + "loss": 0.6077, + "step": 218800 + }, + { + "epoch": 5.0428295109146175, + "grad_norm": 2.9638614654541016, + "learning_rate": 2.3619566323933626e-05, + "loss": 0.6134, + "step": 219000 + }, + { + "epoch": 5.047434834668877, + "grad_norm": 3.3254809379577637, + "learning_rate": 2.3612294760111112e-05, + "loss": 0.6121, + "step": 219200 + }, + { + "epoch": 5.052040158423138, + "grad_norm": 3.3986082077026367, + "learning_rate": 2.3605023196288594e-05, + "loss": 0.6231, + "step": 219400 + }, + { + "epoch": 5.056645482177397, + "grad_norm": 3.187117099761963, + "learning_rate": 2.359775163246608e-05, + "loss": 0.6075, + "step": 219600 + }, + { + "epoch": 5.061250805931657, + "grad_norm": 3.4964001178741455, + "learning_rate": 2.3590480068643563e-05, + "loss": 0.6022, + "step": 219800 + }, + { + "epoch": 5.065856129685917, + "grad_norm": 2.966295003890991, + "learning_rate": 2.3583208504821045e-05, + "loss": 0.6103, + "step": 220000 + }, + { + "epoch": 5.070461453440177, + "grad_norm": 2.458843946456909, + "learning_rate": 2.357593694099853e-05, + "loss": 0.6164, + "step": 220200 + }, + { + "epoch": 5.075066777194436, + "grad_norm": 2.4280691146850586, + "learning_rate": 2.3568665377176017e-05, + "loss": 0.6103, + "step": 220400 + }, + { + "epoch": 5.079672100948697, + "grad_norm": 2.904318332672119, + "learning_rate": 2.35613938133535e-05, + "loss": 0.6018, + "step": 220600 + }, + { + "epoch": 5.0842774247029565, + "grad_norm": 3.2932612895965576, + "learning_rate": 2.3554122249530986e-05, + "loss": 0.6065, + "step": 220800 + }, + { + "epoch": 5.088882748457216, + "grad_norm": 3.404580593109131, + "learning_rate": 2.3546850685708468e-05, + "loss": 0.6202, + "step": 221000 + }, + { + "epoch": 5.093488072211477, + "grad_norm": 3.106785535812378, + "learning_rate": 2.3539579121885954e-05, + "loss": 0.6062, + "step": 221200 + }, + { + "epoch": 5.098093395965736, + "grad_norm": 3.391956329345703, + "learning_rate": 2.3532307558063437e-05, + "loss": 0.6091, + "step": 221400 + }, + { + "epoch": 5.102698719719997, + "grad_norm": 2.8422746658325195, + "learning_rate": 2.3525035994240922e-05, + "loss": 0.6021, + "step": 221600 + }, + { + "epoch": 5.107304043474256, + "grad_norm": 2.9199986457824707, + "learning_rate": 2.351776443041841e-05, + "loss": 0.6146, + "step": 221800 + }, + { + "epoch": 5.111909367228516, + "grad_norm": 3.0510222911834717, + "learning_rate": 2.351049286659589e-05, + "loss": 0.6042, + "step": 222000 + }, + { + "epoch": 5.116514690982776, + "grad_norm": 4.642153263092041, + "learning_rate": 2.350325766059249e-05, + "loss": 0.6086, + "step": 222200 + }, + { + "epoch": 5.121120014737036, + "grad_norm": 4.9221086502075195, + "learning_rate": 2.3495986096769972e-05, + "loss": 0.6069, + "step": 222400 + }, + { + "epoch": 5.1257253384912955, + "grad_norm": 3.0352025032043457, + "learning_rate": 2.3488714532947455e-05, + "loss": 0.6003, + "step": 222600 + }, + { + "epoch": 5.130330662245556, + "grad_norm": 3.675919771194458, + "learning_rate": 2.348144296912494e-05, + "loss": 0.6157, + "step": 222800 + }, + { + "epoch": 5.134935985999816, + "grad_norm": 3.621706247329712, + "learning_rate": 2.3474171405302426e-05, + "loss": 0.6081, + "step": 223000 + }, + { + "epoch": 5.139541309754076, + "grad_norm": 3.0481488704681396, + "learning_rate": 2.346689984147991e-05, + "loss": 0.6155, + "step": 223200 + }, + { + "epoch": 5.144146633508336, + "grad_norm": 3.5414528846740723, + "learning_rate": 2.3459628277657395e-05, + "loss": 0.6054, + "step": 223400 + }, + { + "epoch": 5.148751957262595, + "grad_norm": 3.2819435596466064, + "learning_rate": 2.345235671383488e-05, + "loss": 0.6131, + "step": 223600 + }, + { + "epoch": 5.153357281016856, + "grad_norm": 2.693819761276245, + "learning_rate": 2.344508515001236e-05, + "loss": 0.6082, + "step": 223800 + }, + { + "epoch": 5.157962604771115, + "grad_norm": 2.757230520248413, + "learning_rate": 2.3437813586189846e-05, + "loss": 0.6084, + "step": 224000 + }, + { + "epoch": 5.162567928525375, + "grad_norm": 3.3219823837280273, + "learning_rate": 2.343054202236733e-05, + "loss": 0.6068, + "step": 224200 + }, + { + "epoch": 5.167173252279635, + "grad_norm": 2.8286936283111572, + "learning_rate": 2.3423270458544814e-05, + "loss": 0.6043, + "step": 224400 + }, + { + "epoch": 5.171778576033895, + "grad_norm": 3.0710113048553467, + "learning_rate": 2.34159988947223e-05, + "loss": 0.6132, + "step": 224600 + }, + { + "epoch": 5.1763838997881555, + "grad_norm": 3.501431465148926, + "learning_rate": 2.3408727330899786e-05, + "loss": 0.6119, + "step": 224800 + }, + { + "epoch": 5.180989223542415, + "grad_norm": 2.9645442962646484, + "learning_rate": 2.3401455767077265e-05, + "loss": 0.6027, + "step": 225000 + }, + { + "epoch": 5.185594547296675, + "grad_norm": 3.057513475418091, + "learning_rate": 2.3394220561073864e-05, + "loss": 0.6105, + "step": 225200 + }, + { + "epoch": 5.190199871050935, + "grad_norm": 3.0226759910583496, + "learning_rate": 2.338694899725135e-05, + "loss": 0.6052, + "step": 225400 + }, + { + "epoch": 5.194805194805195, + "grad_norm": 2.725600481033325, + "learning_rate": 2.3379677433428832e-05, + "loss": 0.6066, + "step": 225600 + }, + { + "epoch": 5.199410518559454, + "grad_norm": 3.8302340507507324, + "learning_rate": 2.3372405869606318e-05, + "loss": 0.6098, + "step": 225800 + }, + { + "epoch": 5.204015842313715, + "grad_norm": 2.8420770168304443, + "learning_rate": 2.3365134305783804e-05, + "loss": 0.6164, + "step": 226000 + }, + { + "epoch": 5.208621166067974, + "grad_norm": 3.4650795459747314, + "learning_rate": 2.3357862741961287e-05, + "loss": 0.6102, + "step": 226200 + }, + { + "epoch": 5.213226489822235, + "grad_norm": 3.013132095336914, + "learning_rate": 2.3350591178138772e-05, + "loss": 0.6133, + "step": 226400 + }, + { + "epoch": 5.2178318135764945, + "grad_norm": 3.517982244491577, + "learning_rate": 2.3343319614316255e-05, + "loss": 0.6042, + "step": 226600 + }, + { + "epoch": 5.222437137330754, + "grad_norm": 3.570617914199829, + "learning_rate": 2.3336048050493737e-05, + "loss": 0.6194, + "step": 226800 + }, + { + "epoch": 5.227042461085015, + "grad_norm": 3.553957223892212, + "learning_rate": 2.3328776486671223e-05, + "loss": 0.6028, + "step": 227000 + }, + { + "epoch": 5.231647784839274, + "grad_norm": 3.234729051589966, + "learning_rate": 2.332150492284871e-05, + "loss": 0.6069, + "step": 227200 + }, + { + "epoch": 5.236253108593534, + "grad_norm": 2.738168478012085, + "learning_rate": 2.3314233359026195e-05, + "loss": 0.6122, + "step": 227400 + }, + { + "epoch": 5.240858432347794, + "grad_norm": 3.4680685997009277, + "learning_rate": 2.3306961795203678e-05, + "loss": 0.6166, + "step": 227600 + }, + { + "epoch": 5.245463756102054, + "grad_norm": 3.349167823791504, + "learning_rate": 2.329969023138116e-05, + "loss": 0.6182, + "step": 227800 + }, + { + "epoch": 5.250069079856313, + "grad_norm": 2.7940080165863037, + "learning_rate": 2.3292418667558646e-05, + "loss": 0.6073, + "step": 228000 + }, + { + "epoch": 5.254674403610574, + "grad_norm": 3.263004779815674, + "learning_rate": 2.328514710373613e-05, + "loss": 0.6102, + "step": 228200 + }, + { + "epoch": 5.2592797273648335, + "grad_norm": 3.2109506130218506, + "learning_rate": 2.3277875539913615e-05, + "loss": 0.6147, + "step": 228400 + }, + { + "epoch": 5.263885051119094, + "grad_norm": 3.7838523387908936, + "learning_rate": 2.32706039760911e-05, + "loss": 0.61, + "step": 228600 + }, + { + "epoch": 5.268490374873354, + "grad_norm": 2.4661216735839844, + "learning_rate": 2.3263332412268583e-05, + "loss": 0.6123, + "step": 228800 + }, + { + "epoch": 5.273095698627613, + "grad_norm": 3.483590841293335, + "learning_rate": 2.325606084844607e-05, + "loss": 0.6215, + "step": 229000 + }, + { + "epoch": 5.277701022381874, + "grad_norm": 2.8300187587738037, + "learning_rate": 2.324878928462355e-05, + "loss": 0.6036, + "step": 229200 + }, + { + "epoch": 5.282306346136133, + "grad_norm": 3.444559097290039, + "learning_rate": 2.3241517720801034e-05, + "loss": 0.6136, + "step": 229400 + }, + { + "epoch": 5.286911669890394, + "grad_norm": 3.6756949424743652, + "learning_rate": 2.323424615697852e-05, + "loss": 0.621, + "step": 229600 + }, + { + "epoch": 5.291516993644653, + "grad_norm": 3.188176393508911, + "learning_rate": 2.322701095097512e-05, + "loss": 0.6068, + "step": 229800 + }, + { + "epoch": 5.296122317398913, + "grad_norm": 3.3259594440460205, + "learning_rate": 2.32197393871526e-05, + "loss": 0.6159, + "step": 230000 + }, + { + "epoch": 5.300727641153173, + "grad_norm": 2.9103612899780273, + "learning_rate": 2.3212467823330087e-05, + "loss": 0.6131, + "step": 230200 + }, + { + "epoch": 5.305332964907433, + "grad_norm": 2.856694459915161, + "learning_rate": 2.3205196259507573e-05, + "loss": 0.608, + "step": 230400 + }, + { + "epoch": 5.309938288661693, + "grad_norm": 3.170351982116699, + "learning_rate": 2.3197924695685052e-05, + "loss": 0.6211, + "step": 230600 + }, + { + "epoch": 5.314543612415953, + "grad_norm": 2.481973648071289, + "learning_rate": 2.3190653131862538e-05, + "loss": 0.6183, + "step": 230800 + }, + { + "epoch": 5.319148936170213, + "grad_norm": 2.819699287414551, + "learning_rate": 2.3183381568040024e-05, + "loss": 0.6043, + "step": 231000 + }, + { + "epoch": 5.323754259924472, + "grad_norm": 2.6565167903900146, + "learning_rate": 2.3176110004217506e-05, + "loss": 0.6098, + "step": 231200 + }, + { + "epoch": 5.328359583678733, + "grad_norm": 3.6145877838134766, + "learning_rate": 2.3168838440394992e-05, + "loss": 0.6108, + "step": 231400 + }, + { + "epoch": 5.332964907432992, + "grad_norm": 3.620488166809082, + "learning_rate": 2.3161566876572478e-05, + "loss": 0.6066, + "step": 231600 + }, + { + "epoch": 5.337570231187253, + "grad_norm": 3.417673110961914, + "learning_rate": 2.3154331670569073e-05, + "loss": 0.6115, + "step": 231800 + }, + { + "epoch": 5.342175554941512, + "grad_norm": 3.6562862396240234, + "learning_rate": 2.3147060106746556e-05, + "loss": 0.6219, + "step": 232000 + }, + { + "epoch": 5.346780878695772, + "grad_norm": 3.1641769409179688, + "learning_rate": 2.3139788542924042e-05, + "loss": 0.5932, + "step": 232200 + }, + { + "epoch": 5.3513862024500325, + "grad_norm": 3.7529780864715576, + "learning_rate": 2.3132516979101524e-05, + "loss": 0.6114, + "step": 232400 + }, + { + "epoch": 5.355991526204292, + "grad_norm": 4.237635612487793, + "learning_rate": 2.3125281773098123e-05, + "loss": 0.6147, + "step": 232600 + }, + { + "epoch": 5.360596849958552, + "grad_norm": 3.604637861251831, + "learning_rate": 2.311801020927561e-05, + "loss": 0.6077, + "step": 232800 + }, + { + "epoch": 5.365202173712812, + "grad_norm": 3.4843764305114746, + "learning_rate": 2.311073864545309e-05, + "loss": 0.6114, + "step": 233000 + }, + { + "epoch": 5.369807497467072, + "grad_norm": 2.583153486251831, + "learning_rate": 2.3103467081630577e-05, + "loss": 0.5862, + "step": 233200 + }, + { + "epoch": 5.374412821221332, + "grad_norm": 2.859898328781128, + "learning_rate": 2.3096195517808063e-05, + "loss": 0.6113, + "step": 233400 + }, + { + "epoch": 5.379018144975592, + "grad_norm": 3.472050666809082, + "learning_rate": 2.3088923953985542e-05, + "loss": 0.6087, + "step": 233600 + }, + { + "epoch": 5.383623468729851, + "grad_norm": 3.197916030883789, + "learning_rate": 2.308168874798214e-05, + "loss": 0.602, + "step": 233800 + }, + { + "epoch": 5.388228792484112, + "grad_norm": 3.002330780029297, + "learning_rate": 2.3074417184159624e-05, + "loss": 0.6079, + "step": 234000 + }, + { + "epoch": 5.3928341162383715, + "grad_norm": 3.3833117485046387, + "learning_rate": 2.306714562033711e-05, + "loss": 0.6113, + "step": 234200 + }, + { + "epoch": 5.397439439992631, + "grad_norm": 2.8529210090637207, + "learning_rate": 2.3059874056514595e-05, + "loss": 0.6106, + "step": 234400 + }, + { + "epoch": 5.402044763746892, + "grad_norm": 3.600402593612671, + "learning_rate": 2.305260249269208e-05, + "loss": 0.6042, + "step": 234600 + }, + { + "epoch": 5.406650087501151, + "grad_norm": 2.895305633544922, + "learning_rate": 2.3045330928869564e-05, + "loss": 0.6136, + "step": 234800 + }, + { + "epoch": 5.411255411255412, + "grad_norm": 2.833522319793701, + "learning_rate": 2.3038059365047046e-05, + "loss": 0.6148, + "step": 235000 + }, + { + "epoch": 5.415860735009671, + "grad_norm": 3.2785439491271973, + "learning_rate": 2.3030787801224532e-05, + "loss": 0.5955, + "step": 235200 + }, + { + "epoch": 5.420466058763931, + "grad_norm": 2.5901570320129395, + "learning_rate": 2.3023516237402015e-05, + "loss": 0.6243, + "step": 235400 + }, + { + "epoch": 5.425071382518191, + "grad_norm": 2.593280076980591, + "learning_rate": 2.30162446735795e-05, + "loss": 0.6095, + "step": 235600 + }, + { + "epoch": 5.429676706272451, + "grad_norm": 2.4974257946014404, + "learning_rate": 2.3008973109756987e-05, + "loss": 0.6196, + "step": 235800 + }, + { + "epoch": 5.4342820300267105, + "grad_norm": 2.7232484817504883, + "learning_rate": 2.3001737903753582e-05, + "loss": 0.6088, + "step": 236000 + }, + { + "epoch": 5.438887353780971, + "grad_norm": 3.5613925457000732, + "learning_rate": 2.2994466339931068e-05, + "loss": 0.6062, + "step": 236200 + }, + { + "epoch": 5.443492677535231, + "grad_norm": 3.134225368499756, + "learning_rate": 2.2987194776108554e-05, + "loss": 0.6045, + "step": 236400 + }, + { + "epoch": 5.448098001289491, + "grad_norm": 3.1845412254333496, + "learning_rate": 2.2979923212286033e-05, + "loss": 0.6135, + "step": 236600 + }, + { + "epoch": 5.452703325043751, + "grad_norm": 3.119331121444702, + "learning_rate": 2.297265164846352e-05, + "loss": 0.6089, + "step": 236800 + }, + { + "epoch": 5.45730864879801, + "grad_norm": 3.175262928009033, + "learning_rate": 2.2965380084641005e-05, + "loss": 0.6185, + "step": 237000 + }, + { + "epoch": 5.461913972552271, + "grad_norm": 2.737478733062744, + "learning_rate": 2.2958108520818487e-05, + "loss": 0.6136, + "step": 237200 + }, + { + "epoch": 5.46651929630653, + "grad_norm": 3.8264358043670654, + "learning_rate": 2.2950836956995973e-05, + "loss": 0.5989, + "step": 237400 + }, + { + "epoch": 5.47112462006079, + "grad_norm": 2.678365468978882, + "learning_rate": 2.294356539317346e-05, + "loss": 0.6062, + "step": 237600 + }, + { + "epoch": 5.47572994381505, + "grad_norm": 3.2151529788970947, + "learning_rate": 2.2936293829350938e-05, + "loss": 0.6091, + "step": 237800 + }, + { + "epoch": 5.48033526756931, + "grad_norm": 2.8425002098083496, + "learning_rate": 2.2929022265528424e-05, + "loss": 0.6134, + "step": 238000 + }, + { + "epoch": 5.48494059132357, + "grad_norm": 2.8937878608703613, + "learning_rate": 2.292175070170591e-05, + "loss": 0.6041, + "step": 238200 + }, + { + "epoch": 5.48954591507783, + "grad_norm": 3.9438226222991943, + "learning_rate": 2.2914479137883392e-05, + "loss": 0.6061, + "step": 238400 + }, + { + "epoch": 5.49415123883209, + "grad_norm": 2.902695655822754, + "learning_rate": 2.290720757406088e-05, + "loss": 0.6239, + "step": 238600 + }, + { + "epoch": 5.49875656258635, + "grad_norm": 2.9250173568725586, + "learning_rate": 2.2899936010238364e-05, + "loss": 0.6094, + "step": 238800 + }, + { + "epoch": 5.50336188634061, + "grad_norm": 3.668975591659546, + "learning_rate": 2.2892664446415847e-05, + "loss": 0.61, + "step": 239000 + }, + { + "epoch": 5.507967210094869, + "grad_norm": 3.020686626434326, + "learning_rate": 2.288539288259333e-05, + "loss": 0.621, + "step": 239200 + }, + { + "epoch": 5.51257253384913, + "grad_norm": 3.6001780033111572, + "learning_rate": 2.2878121318770815e-05, + "loss": 0.6037, + "step": 239400 + }, + { + "epoch": 5.517177857603389, + "grad_norm": 2.8009440898895264, + "learning_rate": 2.28708497549483e-05, + "loss": 0.6158, + "step": 239600 + }, + { + "epoch": 5.52178318135765, + "grad_norm": 2.9138927459716797, + "learning_rate": 2.2863578191125784e-05, + "loss": 0.6091, + "step": 239800 + }, + { + "epoch": 5.5263885051119095, + "grad_norm": 3.2366840839385986, + "learning_rate": 2.285630662730327e-05, + "loss": 0.61, + "step": 240000 + }, + { + "epoch": 5.530993828866169, + "grad_norm": 3.211121082305908, + "learning_rate": 2.2849071421299865e-05, + "loss": 0.6203, + "step": 240200 + }, + { + "epoch": 5.53559915262043, + "grad_norm": 2.9791855812072754, + "learning_rate": 2.284179985747735e-05, + "loss": 0.611, + "step": 240400 + }, + { + "epoch": 5.540204476374689, + "grad_norm": 2.9468283653259277, + "learning_rate": 2.2834528293654833e-05, + "loss": 0.5963, + "step": 240600 + }, + { + "epoch": 5.544809800128949, + "grad_norm": 2.6335391998291016, + "learning_rate": 2.282725672983232e-05, + "loss": 0.5982, + "step": 240800 + }, + { + "epoch": 5.549415123883209, + "grad_norm": 3.262544870376587, + "learning_rate": 2.28199851660098e-05, + "loss": 0.6005, + "step": 241000 + }, + { + "epoch": 5.554020447637469, + "grad_norm": 2.9877734184265137, + "learning_rate": 2.28127499600064e-05, + "loss": 0.6062, + "step": 241200 + }, + { + "epoch": 5.558625771391728, + "grad_norm": 3.008165121078491, + "learning_rate": 2.2805478396183883e-05, + "loss": 0.6056, + "step": 241400 + }, + { + "epoch": 5.563231095145989, + "grad_norm": 2.6136744022369385, + "learning_rate": 2.279820683236137e-05, + "loss": 0.604, + "step": 241600 + }, + { + "epoch": 5.5678364189002485, + "grad_norm": 3.0362038612365723, + "learning_rate": 2.2790935268538855e-05, + "loss": 0.6016, + "step": 241800 + }, + { + "epoch": 5.572441742654509, + "grad_norm": 3.641286849975586, + "learning_rate": 2.2783663704716334e-05, + "loss": 0.6089, + "step": 242000 + }, + { + "epoch": 5.577047066408769, + "grad_norm": 3.9004762172698975, + "learning_rate": 2.277639214089382e-05, + "loss": 0.6049, + "step": 242200 + }, + { + "epoch": 5.581652390163028, + "grad_norm": 3.441751003265381, + "learning_rate": 2.2769120577071306e-05, + "loss": 0.6185, + "step": 242400 + }, + { + "epoch": 5.586257713917289, + "grad_norm": 2.6894123554229736, + "learning_rate": 2.276184901324879e-05, + "loss": 0.614, + "step": 242600 + }, + { + "epoch": 5.590863037671548, + "grad_norm": 3.961691379547119, + "learning_rate": 2.2754577449426274e-05, + "loss": 0.6081, + "step": 242800 + }, + { + "epoch": 5.595468361425809, + "grad_norm": 4.229228496551514, + "learning_rate": 2.274730588560376e-05, + "loss": 0.6125, + "step": 243000 + }, + { + "epoch": 5.600073685180068, + "grad_norm": 2.8456718921661377, + "learning_rate": 2.2740034321781242e-05, + "loss": 0.6013, + "step": 243200 + }, + { + "epoch": 5.604679008934328, + "grad_norm": 2.563215494155884, + "learning_rate": 2.2732762757958725e-05, + "loss": 0.6055, + "step": 243400 + }, + { + "epoch": 5.609284332688588, + "grad_norm": 3.1517744064331055, + "learning_rate": 2.272549119413621e-05, + "loss": 0.598, + "step": 243600 + }, + { + "epoch": 5.613889656442848, + "grad_norm": 3.084362030029297, + "learning_rate": 2.2718219630313697e-05, + "loss": 0.6114, + "step": 243800 + }, + { + "epoch": 5.618494980197108, + "grad_norm": 2.9705121517181396, + "learning_rate": 2.271094806649118e-05, + "loss": 0.6105, + "step": 244000 + }, + { + "epoch": 5.623100303951368, + "grad_norm": 2.46872615814209, + "learning_rate": 2.2703676502668665e-05, + "loss": 0.6021, + "step": 244200 + }, + { + "epoch": 5.627705627705628, + "grad_norm": 3.1614882946014404, + "learning_rate": 2.269640493884615e-05, + "loss": 0.6155, + "step": 244400 + }, + { + "epoch": 5.632310951459887, + "grad_norm": 3.2688674926757812, + "learning_rate": 2.268913337502363e-05, + "loss": 0.617, + "step": 244600 + }, + { + "epoch": 5.636916275214148, + "grad_norm": 2.7544074058532715, + "learning_rate": 2.2681861811201116e-05, + "loss": 0.6218, + "step": 244800 + }, + { + "epoch": 5.641521598968407, + "grad_norm": 3.150360345840454, + "learning_rate": 2.2674590247378602e-05, + "loss": 0.6118, + "step": 245000 + }, + { + "epoch": 5.646126922722667, + "grad_norm": 3.299985647201538, + "learning_rate": 2.2667318683556088e-05, + "loss": 0.6018, + "step": 245200 + }, + { + "epoch": 5.650732246476927, + "grad_norm": 3.0777273178100586, + "learning_rate": 2.266004711973357e-05, + "loss": 0.6136, + "step": 245400 + }, + { + "epoch": 5.655337570231187, + "grad_norm": 3.37021803855896, + "learning_rate": 2.2652775555911056e-05, + "loss": 0.6124, + "step": 245600 + }, + { + "epoch": 5.6599428939854475, + "grad_norm": 3.1448745727539062, + "learning_rate": 2.264550399208854e-05, + "loss": 0.6185, + "step": 245800 + }, + { + "epoch": 5.664548217739707, + "grad_norm": 3.1662933826446533, + "learning_rate": 2.263823242826602e-05, + "loss": 0.6087, + "step": 246000 + }, + { + "epoch": 5.669153541493967, + "grad_norm": 2.839693546295166, + "learning_rate": 2.2630960864443507e-05, + "loss": 0.6074, + "step": 246200 + }, + { + "epoch": 5.673758865248227, + "grad_norm": 3.3215548992156982, + "learning_rate": 2.2623689300620993e-05, + "loss": 0.6169, + "step": 246400 + }, + { + "epoch": 5.678364189002487, + "grad_norm": 3.0180182456970215, + "learning_rate": 2.2616417736798476e-05, + "loss": 0.6126, + "step": 246600 + }, + { + "epoch": 5.682969512756747, + "grad_norm": 2.791381597518921, + "learning_rate": 2.2609146172975962e-05, + "loss": 0.6059, + "step": 246800 + }, + { + "epoch": 5.687574836511007, + "grad_norm": 3.137742280960083, + "learning_rate": 2.2601874609153448e-05, + "loss": 0.6082, + "step": 247000 + }, + { + "epoch": 5.692180160265266, + "grad_norm": 3.0345163345336914, + "learning_rate": 2.2594603045330927e-05, + "loss": 0.5985, + "step": 247200 + }, + { + "epoch": 5.696785484019527, + "grad_norm": 3.281728744506836, + "learning_rate": 2.2587331481508413e-05, + "loss": 0.5944, + "step": 247400 + }, + { + "epoch": 5.7013908077737865, + "grad_norm": 2.6144189834594727, + "learning_rate": 2.258009627550501e-05, + "loss": 0.6033, + "step": 247600 + }, + { + "epoch": 5.705996131528046, + "grad_norm": 3.488960027694702, + "learning_rate": 2.2572824711682494e-05, + "loss": 0.6041, + "step": 247800 + }, + { + "epoch": 5.710601455282307, + "grad_norm": 2.470059871673584, + "learning_rate": 2.256555314785998e-05, + "loss": 0.5939, + "step": 248000 + }, + { + "epoch": 5.715206779036566, + "grad_norm": 2.9853994846343994, + "learning_rate": 2.2558281584037466e-05, + "loss": 0.6016, + "step": 248200 + }, + { + "epoch": 5.719812102790826, + "grad_norm": 2.8882532119750977, + "learning_rate": 2.2551010020214948e-05, + "loss": 0.614, + "step": 248400 + }, + { + "epoch": 5.724417426545086, + "grad_norm": 2.4324746131896973, + "learning_rate": 2.254373845639243e-05, + "loss": 0.6169, + "step": 248600 + }, + { + "epoch": 5.729022750299346, + "grad_norm": 3.518960952758789, + "learning_rate": 2.253650325038903e-05, + "loss": 0.6053, + "step": 248800 + }, + { + "epoch": 5.733628074053606, + "grad_norm": 3.073248863220215, + "learning_rate": 2.2529231686566512e-05, + "loss": 0.6129, + "step": 249000 + }, + { + "epoch": 5.738233397807866, + "grad_norm": 3.165026903152466, + "learning_rate": 2.2521960122743998e-05, + "loss": 0.6069, + "step": 249200 + }, + { + "epoch": 5.7428387215621255, + "grad_norm": 3.6242356300354004, + "learning_rate": 2.2514688558921484e-05, + "loss": 0.617, + "step": 249400 + }, + { + "epoch": 5.747444045316386, + "grad_norm": 2.7420730590820312, + "learning_rate": 2.2507416995098966e-05, + "loss": 0.6056, + "step": 249600 + }, + { + "epoch": 5.752049369070646, + "grad_norm": 3.044351577758789, + "learning_rate": 2.2500145431276452e-05, + "loss": 0.6047, + "step": 249800 + }, + { + "epoch": 5.756654692824906, + "grad_norm": 3.59561824798584, + "learning_rate": 2.2492873867453935e-05, + "loss": 0.5947, + "step": 250000 + }, + { + "epoch": 5.761260016579166, + "grad_norm": 2.7053959369659424, + "learning_rate": 2.2485602303631417e-05, + "loss": 0.6167, + "step": 250200 + }, + { + "epoch": 5.765865340333425, + "grad_norm": 3.2808427810668945, + "learning_rate": 2.2478367097628016e-05, + "loss": 0.6096, + "step": 250400 + }, + { + "epoch": 5.770470664087686, + "grad_norm": 3.0669496059417725, + "learning_rate": 2.2471095533805502e-05, + "loss": 0.6172, + "step": 250600 + }, + { + "epoch": 5.775075987841945, + "grad_norm": 3.3072104454040527, + "learning_rate": 2.2463823969982984e-05, + "loss": 0.6153, + "step": 250800 + }, + { + "epoch": 5.779681311596205, + "grad_norm": 3.2189414501190186, + "learning_rate": 2.245655240616047e-05, + "loss": 0.6051, + "step": 251000 + }, + { + "epoch": 5.784286635350465, + "grad_norm": 3.9628310203552246, + "learning_rate": 2.2449280842337956e-05, + "loss": 0.6128, + "step": 251200 + }, + { + "epoch": 5.788891959104725, + "grad_norm": 2.860816717147827, + "learning_rate": 2.244200927851544e-05, + "loss": 0.6007, + "step": 251400 + }, + { + "epoch": 5.793497282858985, + "grad_norm": 2.6828348636627197, + "learning_rate": 2.243473771469292e-05, + "loss": 0.5999, + "step": 251600 + }, + { + "epoch": 5.798102606613245, + "grad_norm": 3.44728946685791, + "learning_rate": 2.2427466150870407e-05, + "loss": 0.6162, + "step": 251800 + }, + { + "epoch": 5.802707930367505, + "grad_norm": 2.5375330448150635, + "learning_rate": 2.242019458704789e-05, + "loss": 0.6101, + "step": 252000 + }, + { + "epoch": 5.807313254121765, + "grad_norm": 3.269181489944458, + "learning_rate": 2.2412923023225375e-05, + "loss": 0.6227, + "step": 252200 + }, + { + "epoch": 5.811918577876025, + "grad_norm": 3.957916498184204, + "learning_rate": 2.240565145940286e-05, + "loss": 0.6036, + "step": 252400 + }, + { + "epoch": 5.816523901630284, + "grad_norm": 3.4399096965789795, + "learning_rate": 2.2398379895580344e-05, + "loss": 0.6127, + "step": 252600 + }, + { + "epoch": 5.821129225384545, + "grad_norm": 3.4735524654388428, + "learning_rate": 2.2391108331757826e-05, + "loss": 0.6012, + "step": 252800 + }, + { + "epoch": 5.825734549138804, + "grad_norm": 2.868939161300659, + "learning_rate": 2.2383836767935312e-05, + "loss": 0.6138, + "step": 253000 + }, + { + "epoch": 5.830339872893065, + "grad_norm": 3.8807289600372314, + "learning_rate": 2.2376565204112798e-05, + "loss": 0.6036, + "step": 253200 + }, + { + "epoch": 5.8349451966473245, + "grad_norm": 3.3366916179656982, + "learning_rate": 2.236929364029028e-05, + "loss": 0.6151, + "step": 253400 + }, + { + "epoch": 5.839550520401584, + "grad_norm": 3.0559029579162598, + "learning_rate": 2.2362022076467767e-05, + "loss": 0.6081, + "step": 253600 + }, + { + "epoch": 5.8441558441558445, + "grad_norm": 2.922211170196533, + "learning_rate": 2.2354750512645253e-05, + "loss": 0.611, + "step": 253800 + }, + { + "epoch": 5.848761167910104, + "grad_norm": 3.6084461212158203, + "learning_rate": 2.2347478948822735e-05, + "loss": 0.6, + "step": 254000 + }, + { + "epoch": 5.853366491664364, + "grad_norm": 3.366588592529297, + "learning_rate": 2.2340207385000218e-05, + "loss": 0.6079, + "step": 254200 + }, + { + "epoch": 5.857971815418624, + "grad_norm": 4.779687404632568, + "learning_rate": 2.2332935821177704e-05, + "loss": 0.5983, + "step": 254400 + }, + { + "epoch": 5.862577139172884, + "grad_norm": 2.4821720123291016, + "learning_rate": 2.2325664257355186e-05, + "loss": 0.5977, + "step": 254600 + }, + { + "epoch": 5.867182462927143, + "grad_norm": 3.079432487487793, + "learning_rate": 2.2318392693532672e-05, + "loss": 0.6135, + "step": 254800 + }, + { + "epoch": 5.871787786681404, + "grad_norm": 2.8621842861175537, + "learning_rate": 2.2311121129710158e-05, + "loss": 0.615, + "step": 255000 + }, + { + "epoch": 5.8763931104356635, + "grad_norm": 3.1682121753692627, + "learning_rate": 2.230384956588764e-05, + "loss": 0.6029, + "step": 255200 + }, + { + "epoch": 5.880998434189923, + "grad_norm": 3.040579080581665, + "learning_rate": 2.2296578002065123e-05, + "loss": 0.6091, + "step": 255400 + }, + { + "epoch": 5.885603757944184, + "grad_norm": 3.665220260620117, + "learning_rate": 2.228930643824261e-05, + "loss": 0.6065, + "step": 255600 + }, + { + "epoch": 5.890209081698443, + "grad_norm": 2.9831740856170654, + "learning_rate": 2.228203487442009e-05, + "loss": 0.6027, + "step": 255800 + }, + { + "epoch": 5.894814405452704, + "grad_norm": 2.8328495025634766, + "learning_rate": 2.2274763310597577e-05, + "loss": 0.6086, + "step": 256000 + }, + { + "epoch": 5.899419729206963, + "grad_norm": 3.341989755630493, + "learning_rate": 2.2267528104594176e-05, + "loss": 0.6206, + "step": 256200 + }, + { + "epoch": 5.904025052961223, + "grad_norm": 2.885848045349121, + "learning_rate": 2.226025654077166e-05, + "loss": 0.6136, + "step": 256400 + }, + { + "epoch": 5.908630376715483, + "grad_norm": 2.7301769256591797, + "learning_rate": 2.2252984976949144e-05, + "loss": 0.6032, + "step": 256600 + }, + { + "epoch": 5.913235700469743, + "grad_norm": 3.254574775695801, + "learning_rate": 2.2245713413126627e-05, + "loss": 0.6161, + "step": 256800 + }, + { + "epoch": 5.917841024224003, + "grad_norm": 3.9567196369171143, + "learning_rate": 2.223844184930411e-05, + "loss": 0.6057, + "step": 257000 + }, + { + "epoch": 5.922446347978263, + "grad_norm": 2.612321615219116, + "learning_rate": 2.2231170285481595e-05, + "loss": 0.6112, + "step": 257200 + }, + { + "epoch": 5.927051671732523, + "grad_norm": 2.945931911468506, + "learning_rate": 2.222389872165908e-05, + "loss": 0.6134, + "step": 257400 + }, + { + "epoch": 5.931656995486783, + "grad_norm": 3.260687828063965, + "learning_rate": 2.2216627157836564e-05, + "loss": 0.5979, + "step": 257600 + }, + { + "epoch": 5.936262319241043, + "grad_norm": 2.9708778858184814, + "learning_rate": 2.220935559401405e-05, + "loss": 0.6135, + "step": 257800 + }, + { + "epoch": 5.940867642995302, + "grad_norm": 3.1571850776672363, + "learning_rate": 2.2202084030191536e-05, + "loss": 0.615, + "step": 258000 + }, + { + "epoch": 5.945472966749563, + "grad_norm": 3.849186658859253, + "learning_rate": 2.2194812466369018e-05, + "loss": 0.62, + "step": 258200 + }, + { + "epoch": 5.950078290503822, + "grad_norm": 3.6725234985351562, + "learning_rate": 2.21875409025465e-05, + "loss": 0.6047, + "step": 258400 + }, + { + "epoch": 5.954683614258082, + "grad_norm": 2.586672306060791, + "learning_rate": 2.2180269338723987e-05, + "loss": 0.6092, + "step": 258600 + }, + { + "epoch": 5.959288938012342, + "grad_norm": 3.1655867099761963, + "learning_rate": 2.2172997774901473e-05, + "loss": 0.609, + "step": 258800 + }, + { + "epoch": 5.963894261766602, + "grad_norm": 2.586534023284912, + "learning_rate": 2.2165726211078955e-05, + "loss": 0.6168, + "step": 259000 + }, + { + "epoch": 5.9684995855208625, + "grad_norm": 2.9593117237091064, + "learning_rate": 2.215845464725644e-05, + "loss": 0.6058, + "step": 259200 + }, + { + "epoch": 5.973104909275122, + "grad_norm": 3.1021628379821777, + "learning_rate": 2.2151183083433923e-05, + "loss": 0.6136, + "step": 259400 + }, + { + "epoch": 5.977710233029382, + "grad_norm": 3.3624014854431152, + "learning_rate": 2.2143911519611406e-05, + "loss": 0.6066, + "step": 259600 + }, + { + "epoch": 5.982315556783642, + "grad_norm": 3.2505760192871094, + "learning_rate": 2.2136639955788892e-05, + "loss": 0.6154, + "step": 259800 + }, + { + "epoch": 5.986920880537902, + "grad_norm": 3.521606683731079, + "learning_rate": 2.2129368391966378e-05, + "loss": 0.6109, + "step": 260000 + }, + { + "epoch": 5.991526204292162, + "grad_norm": 3.341179370880127, + "learning_rate": 2.212209682814386e-05, + "loss": 0.6179, + "step": 260200 + }, + { + "epoch": 5.996131528046422, + "grad_norm": 3.8948066234588623, + "learning_rate": 2.211486162214046e-05, + "loss": 0.6183, + "step": 260400 + }, + { + "epoch": 6.0, + "eval_loss": 0.5793450474739075, + "eval_runtime": 146.4894, + "eval_samples_per_second": 193.604, + "eval_steps_per_second": 12.103, + "step": 260568 + }, + { + "epoch": 6.000736851800681, + "grad_norm": 3.0538671016693115, + "learning_rate": 2.2107626416137054e-05, + "loss": 0.6025, + "step": 260600 + }, + { + "epoch": 6.005342175554942, + "grad_norm": 2.459566354751587, + "learning_rate": 2.210035485231454e-05, + "loss": 0.6142, + "step": 260800 + }, + { + "epoch": 6.0099474993092015, + "grad_norm": 3.1511268615722656, + "learning_rate": 2.2093083288492026e-05, + "loss": 0.6067, + "step": 261000 + }, + { + "epoch": 6.014552823063461, + "grad_norm": 2.9049158096313477, + "learning_rate": 2.2085811724669505e-05, + "loss": 0.6013, + "step": 261200 + }, + { + "epoch": 6.0191581468177215, + "grad_norm": 2.9899065494537354, + "learning_rate": 2.207854016084699e-05, + "loss": 0.5913, + "step": 261400 + }, + { + "epoch": 6.023763470571981, + "grad_norm": 3.2502787113189697, + "learning_rate": 2.2071268597024477e-05, + "loss": 0.5923, + "step": 261600 + }, + { + "epoch": 6.028368794326241, + "grad_norm": 3.313624382019043, + "learning_rate": 2.2063997033201963e-05, + "loss": 0.6092, + "step": 261800 + }, + { + "epoch": 6.032974118080501, + "grad_norm": 3.1583757400512695, + "learning_rate": 2.2056725469379445e-05, + "loss": 0.6048, + "step": 262000 + }, + { + "epoch": 6.037579441834761, + "grad_norm": 2.4791722297668457, + "learning_rate": 2.204945390555693e-05, + "loss": 0.6054, + "step": 262200 + }, + { + "epoch": 6.042184765589021, + "grad_norm": 3.2906877994537354, + "learning_rate": 2.2042182341734414e-05, + "loss": 0.6054, + "step": 262400 + }, + { + "epoch": 6.046790089343281, + "grad_norm": 3.4483656883239746, + "learning_rate": 2.2034910777911896e-05, + "loss": 0.5909, + "step": 262600 + }, + { + "epoch": 6.0513954130975405, + "grad_norm": 3.3106260299682617, + "learning_rate": 2.2027639214089382e-05, + "loss": 0.6066, + "step": 262800 + }, + { + "epoch": 6.056000736851801, + "grad_norm": 3.107494354248047, + "learning_rate": 2.2020367650266868e-05, + "loss": 0.5997, + "step": 263000 + }, + { + "epoch": 6.0606060606060606, + "grad_norm": 3.537893295288086, + "learning_rate": 2.2013132444263463e-05, + "loss": 0.5907, + "step": 263200 + }, + { + "epoch": 6.06521138436032, + "grad_norm": 4.185168266296387, + "learning_rate": 2.200586088044095e-05, + "loss": 0.5958, + "step": 263400 + }, + { + "epoch": 6.069816708114581, + "grad_norm": 2.7345850467681885, + "learning_rate": 2.1998589316618435e-05, + "loss": 0.6022, + "step": 263600 + }, + { + "epoch": 6.07442203186884, + "grad_norm": 2.91339111328125, + "learning_rate": 2.1991317752795914e-05, + "loss": 0.6085, + "step": 263800 + }, + { + "epoch": 6.079027355623101, + "grad_norm": 3.6052587032318115, + "learning_rate": 2.19840461889734e-05, + "loss": 0.5946, + "step": 264000 + }, + { + "epoch": 6.08363267937736, + "grad_norm": 3.44474196434021, + "learning_rate": 2.1976774625150886e-05, + "loss": 0.6043, + "step": 264200 + }, + { + "epoch": 6.08823800313162, + "grad_norm": 2.796043634414673, + "learning_rate": 2.196950306132837e-05, + "loss": 0.5965, + "step": 264400 + }, + { + "epoch": 6.09284332688588, + "grad_norm": 2.4689533710479736, + "learning_rate": 2.1962231497505855e-05, + "loss": 0.6029, + "step": 264600 + }, + { + "epoch": 6.09744865064014, + "grad_norm": 4.061243057250977, + "learning_rate": 2.195495993368334e-05, + "loss": 0.6063, + "step": 264800 + }, + { + "epoch": 6.1020539743944, + "grad_norm": 3.009557008743286, + "learning_rate": 2.1947688369860823e-05, + "loss": 0.6208, + "step": 265000 + }, + { + "epoch": 6.10665929814866, + "grad_norm": 3.3373091220855713, + "learning_rate": 2.1940416806038306e-05, + "loss": 0.609, + "step": 265200 + }, + { + "epoch": 6.11126462190292, + "grad_norm": 2.8920810222625732, + "learning_rate": 2.193314524221579e-05, + "loss": 0.5935, + "step": 265400 + }, + { + "epoch": 6.11586994565718, + "grad_norm": 4.441521644592285, + "learning_rate": 2.1925873678393274e-05, + "loss": 0.6031, + "step": 265600 + }, + { + "epoch": 6.12047526941144, + "grad_norm": 3.1611151695251465, + "learning_rate": 2.191860211457076e-05, + "loss": 0.6097, + "step": 265800 + }, + { + "epoch": 6.125080593165699, + "grad_norm": 2.591564178466797, + "learning_rate": 2.1911330550748246e-05, + "loss": 0.5988, + "step": 266000 + }, + { + "epoch": 6.12968591691996, + "grad_norm": 2.5633347034454346, + "learning_rate": 2.1904058986925732e-05, + "loss": 0.6033, + "step": 266200 + }, + { + "epoch": 6.134291240674219, + "grad_norm": 2.846238851547241, + "learning_rate": 2.189678742310321e-05, + "loss": 0.6011, + "step": 266400 + }, + { + "epoch": 6.138896564428479, + "grad_norm": 2.9226114749908447, + "learning_rate": 2.1889515859280697e-05, + "loss": 0.6115, + "step": 266600 + }, + { + "epoch": 6.1435018881827395, + "grad_norm": 3.3836491107940674, + "learning_rate": 2.1882244295458183e-05, + "loss": 0.6041, + "step": 266800 + }, + { + "epoch": 6.148107211936999, + "grad_norm": 3.141162157058716, + "learning_rate": 2.1874972731635665e-05, + "loss": 0.5929, + "step": 267000 + }, + { + "epoch": 6.1527125356912595, + "grad_norm": 3.0072133541107178, + "learning_rate": 2.186770116781315e-05, + "loss": 0.6038, + "step": 267200 + }, + { + "epoch": 6.157317859445519, + "grad_norm": 3.065700054168701, + "learning_rate": 2.1860465961809746e-05, + "loss": 0.5946, + "step": 267400 + }, + { + "epoch": 6.161923183199779, + "grad_norm": 2.743119239807129, + "learning_rate": 2.1853194397987232e-05, + "loss": 0.615, + "step": 267600 + }, + { + "epoch": 6.166528506954039, + "grad_norm": 3.0159752368927, + "learning_rate": 2.184595919198383e-05, + "loss": 0.5905, + "step": 267800 + }, + { + "epoch": 6.171133830708299, + "grad_norm": 3.077939033508301, + "learning_rate": 2.1838687628161313e-05, + "loss": 0.6043, + "step": 268000 + }, + { + "epoch": 6.175739154462558, + "grad_norm": 3.3442816734313965, + "learning_rate": 2.1831416064338796e-05, + "loss": 0.606, + "step": 268200 + }, + { + "epoch": 6.180344478216819, + "grad_norm": 2.753549337387085, + "learning_rate": 2.1824144500516282e-05, + "loss": 0.6119, + "step": 268400 + }, + { + "epoch": 6.1849498019710785, + "grad_norm": 2.6323747634887695, + "learning_rate": 2.1816872936693764e-05, + "loss": 0.5995, + "step": 268600 + }, + { + "epoch": 6.189555125725338, + "grad_norm": 3.0418403148651123, + "learning_rate": 2.180960137287125e-05, + "loss": 0.6196, + "step": 268800 + }, + { + "epoch": 6.1941604494795985, + "grad_norm": 2.880768060684204, + "learning_rate": 2.1802329809048736e-05, + "loss": 0.5948, + "step": 269000 + }, + { + "epoch": 6.198765773233858, + "grad_norm": 3.229389190673828, + "learning_rate": 2.179505824522622e-05, + "loss": 0.6088, + "step": 269200 + }, + { + "epoch": 6.203371096988119, + "grad_norm": 2.6277809143066406, + "learning_rate": 2.17877866814037e-05, + "loss": 0.6097, + "step": 269400 + }, + { + "epoch": 6.207976420742378, + "grad_norm": 2.8957467079162598, + "learning_rate": 2.1780515117581187e-05, + "loss": 0.6027, + "step": 269600 + }, + { + "epoch": 6.212581744496638, + "grad_norm": 3.4677155017852783, + "learning_rate": 2.1773243553758673e-05, + "loss": 0.6016, + "step": 269800 + }, + { + "epoch": 6.217187068250898, + "grad_norm": 2.457707405090332, + "learning_rate": 2.1765971989936156e-05, + "loss": 0.6114, + "step": 270000 + }, + { + "epoch": 6.221792392005158, + "grad_norm": 3.322401762008667, + "learning_rate": 2.1758773141751867e-05, + "loss": 0.6114, + "step": 270200 + }, + { + "epoch": 6.2263977157594175, + "grad_norm": 2.784977912902832, + "learning_rate": 2.175150157792935e-05, + "loss": 0.5959, + "step": 270400 + }, + { + "epoch": 6.231003039513678, + "grad_norm": 2.8627588748931885, + "learning_rate": 2.1744230014106835e-05, + "loss": 0.6067, + "step": 270600 + }, + { + "epoch": 6.2356083632679375, + "grad_norm": 3.0779459476470947, + "learning_rate": 2.173695845028432e-05, + "loss": 0.6007, + "step": 270800 + }, + { + "epoch": 6.240213687022198, + "grad_norm": 3.0918633937835693, + "learning_rate": 2.17296868864618e-05, + "loss": 0.6034, + "step": 271000 + }, + { + "epoch": 6.244819010776458, + "grad_norm": 3.245352029800415, + "learning_rate": 2.1722415322639286e-05, + "loss": 0.6013, + "step": 271200 + }, + { + "epoch": 6.249424334530717, + "grad_norm": 3.179417610168457, + "learning_rate": 2.1715143758816772e-05, + "loss": 0.6043, + "step": 271400 + }, + { + "epoch": 6.254029658284978, + "grad_norm": 3.051330804824829, + "learning_rate": 2.1707872194994255e-05, + "loss": 0.5994, + "step": 271600 + }, + { + "epoch": 6.258634982039237, + "grad_norm": 3.0867514610290527, + "learning_rate": 2.170060063117174e-05, + "loss": 0.607, + "step": 271800 + }, + { + "epoch": 6.263240305793497, + "grad_norm": 3.2159993648529053, + "learning_rate": 2.1693329067349227e-05, + "loss": 0.592, + "step": 272000 + }, + { + "epoch": 6.267845629547757, + "grad_norm": 2.641268253326416, + "learning_rate": 2.168605750352671e-05, + "loss": 0.6019, + "step": 272200 + }, + { + "epoch": 6.272450953302017, + "grad_norm": 3.2037994861602783, + "learning_rate": 2.167878593970419e-05, + "loss": 0.5966, + "step": 272400 + }, + { + "epoch": 6.2770562770562774, + "grad_norm": 2.531846523284912, + "learning_rate": 2.1671514375881678e-05, + "loss": 0.598, + "step": 272600 + }, + { + "epoch": 6.281661600810537, + "grad_norm": 3.13655161857605, + "learning_rate": 2.1664242812059163e-05, + "loss": 0.6102, + "step": 272800 + }, + { + "epoch": 6.286266924564797, + "grad_norm": 3.5509562492370605, + "learning_rate": 2.1656971248236646e-05, + "loss": 0.6071, + "step": 273000 + }, + { + "epoch": 6.290872248319057, + "grad_norm": 4.013696193695068, + "learning_rate": 2.1649699684414132e-05, + "loss": 0.6048, + "step": 273200 + }, + { + "epoch": 6.295477572073317, + "grad_norm": 3.690094232559204, + "learning_rate": 2.1642428120591618e-05, + "loss": 0.6037, + "step": 273400 + }, + { + "epoch": 6.300082895827576, + "grad_norm": 2.8810877799987793, + "learning_rate": 2.1635156556769097e-05, + "loss": 0.5939, + "step": 273600 + }, + { + "epoch": 6.304688219581837, + "grad_norm": 2.392094612121582, + "learning_rate": 2.1627884992946583e-05, + "loss": 0.5881, + "step": 273800 + }, + { + "epoch": 6.309293543336096, + "grad_norm": 2.9791033267974854, + "learning_rate": 2.162061342912407e-05, + "loss": 0.6063, + "step": 274000 + }, + { + "epoch": 6.313898867090357, + "grad_norm": 3.6105103492736816, + "learning_rate": 2.161334186530155e-05, + "loss": 0.5895, + "step": 274200 + }, + { + "epoch": 6.3185041908446165, + "grad_norm": 3.0321569442749023, + "learning_rate": 2.1606070301479037e-05, + "loss": 0.5868, + "step": 274400 + }, + { + "epoch": 6.323109514598876, + "grad_norm": 2.3522133827209473, + "learning_rate": 2.1598798737656523e-05, + "loss": 0.5985, + "step": 274600 + }, + { + "epoch": 6.3277148383531365, + "grad_norm": 3.172877550125122, + "learning_rate": 2.1591527173834002e-05, + "loss": 0.6006, + "step": 274800 + }, + { + "epoch": 6.332320162107396, + "grad_norm": 3.2130961418151855, + "learning_rate": 2.1584255610011488e-05, + "loss": 0.6014, + "step": 275000 + }, + { + "epoch": 6.336925485861656, + "grad_norm": 3.1269421577453613, + "learning_rate": 2.1576984046188974e-05, + "loss": 0.5893, + "step": 275200 + }, + { + "epoch": 6.341530809615916, + "grad_norm": 2.8266472816467285, + "learning_rate": 2.1569712482366457e-05, + "loss": 0.6025, + "step": 275400 + }, + { + "epoch": 6.346136133370176, + "grad_norm": 3.0055863857269287, + "learning_rate": 2.1562440918543943e-05, + "loss": 0.5972, + "step": 275600 + }, + { + "epoch": 6.350741457124435, + "grad_norm": 3.1242921352386475, + "learning_rate": 2.155516935472143e-05, + "loss": 0.603, + "step": 275800 + }, + { + "epoch": 6.355346780878696, + "grad_norm": 3.971442222595215, + "learning_rate": 2.1547897790898914e-05, + "loss": 0.6083, + "step": 276000 + }, + { + "epoch": 6.3599521046329555, + "grad_norm": 3.2091002464294434, + "learning_rate": 2.154066258489551e-05, + "loss": 0.6112, + "step": 276200 + }, + { + "epoch": 6.364557428387216, + "grad_norm": 3.378394842147827, + "learning_rate": 2.1533391021072992e-05, + "loss": 0.5943, + "step": 276400 + }, + { + "epoch": 6.3691627521414755, + "grad_norm": 3.663804292678833, + "learning_rate": 2.1526119457250475e-05, + "loss": 0.5942, + "step": 276600 + }, + { + "epoch": 6.373768075895735, + "grad_norm": 2.897817611694336, + "learning_rate": 2.151884789342796e-05, + "loss": 0.5977, + "step": 276800 + }, + { + "epoch": 6.378373399649996, + "grad_norm": 3.405097484588623, + "learning_rate": 2.1511576329605446e-05, + "loss": 0.5935, + "step": 277000 + }, + { + "epoch": 6.382978723404255, + "grad_norm": 3.117182970046997, + "learning_rate": 2.150430476578293e-05, + "loss": 0.6087, + "step": 277200 + }, + { + "epoch": 6.387584047158516, + "grad_norm": 2.3870809078216553, + "learning_rate": 2.1497033201960415e-05, + "loss": 0.6, + "step": 277400 + }, + { + "epoch": 6.392189370912775, + "grad_norm": 3.2303497791290283, + "learning_rate": 2.1489761638137897e-05, + "loss": 0.6026, + "step": 277600 + }, + { + "epoch": 6.396794694667035, + "grad_norm": 3.261338710784912, + "learning_rate": 2.1482490074315383e-05, + "loss": 0.6004, + "step": 277800 + }, + { + "epoch": 6.401400018421295, + "grad_norm": 2.4772233963012695, + "learning_rate": 2.1475218510492866e-05, + "loss": 0.6047, + "step": 278000 + }, + { + "epoch": 6.406005342175555, + "grad_norm": 4.6650590896606445, + "learning_rate": 2.1467946946670352e-05, + "loss": 0.5949, + "step": 278200 + }, + { + "epoch": 6.4106106659298145, + "grad_norm": 2.938955783843994, + "learning_rate": 2.1460675382847838e-05, + "loss": 0.6107, + "step": 278400 + }, + { + "epoch": 6.415215989684075, + "grad_norm": 2.6229448318481445, + "learning_rate": 2.145340381902532e-05, + "loss": 0.6161, + "step": 278600 + }, + { + "epoch": 6.419821313438335, + "grad_norm": 2.3591392040252686, + "learning_rate": 2.1446132255202806e-05, + "loss": 0.5965, + "step": 278800 + }, + { + "epoch": 6.424426637192594, + "grad_norm": 3.7696409225463867, + "learning_rate": 2.143886069138029e-05, + "loss": 0.6014, + "step": 279000 + }, + { + "epoch": 6.429031960946855, + "grad_norm": 3.477569818496704, + "learning_rate": 2.143158912755777e-05, + "loss": 0.5994, + "step": 279200 + }, + { + "epoch": 6.433637284701114, + "grad_norm": 3.7880353927612305, + "learning_rate": 2.1424317563735257e-05, + "loss": 0.6106, + "step": 279400 + }, + { + "epoch": 6.438242608455375, + "grad_norm": 2.632434844970703, + "learning_rate": 2.1417045999912743e-05, + "loss": 0.6119, + "step": 279600 + }, + { + "epoch": 6.442847932209634, + "grad_norm": 2.5677168369293213, + "learning_rate": 2.1409774436090226e-05, + "loss": 0.5988, + "step": 279800 + }, + { + "epoch": 6.447453255963894, + "grad_norm": 3.0066206455230713, + "learning_rate": 2.140250287226771e-05, + "loss": 0.6077, + "step": 280000 + }, + { + "epoch": 6.452058579718154, + "grad_norm": 3.006856679916382, + "learning_rate": 2.1395231308445194e-05, + "loss": 0.5956, + "step": 280200 + }, + { + "epoch": 6.456663903472414, + "grad_norm": 3.6395933628082275, + "learning_rate": 2.138799610244179e-05, + "loss": 0.6028, + "step": 280400 + }, + { + "epoch": 6.461269227226674, + "grad_norm": 2.876479148864746, + "learning_rate": 2.1380724538619275e-05, + "loss": 0.6037, + "step": 280600 + }, + { + "epoch": 6.465874550980934, + "grad_norm": 2.5705347061157227, + "learning_rate": 2.137345297479676e-05, + "loss": 0.6006, + "step": 280800 + }, + { + "epoch": 6.470479874735194, + "grad_norm": 3.720731496810913, + "learning_rate": 2.1366181410974244e-05, + "loss": 0.599, + "step": 281000 + }, + { + "epoch": 6.475085198489454, + "grad_norm": 4.109841823577881, + "learning_rate": 2.135890984715173e-05, + "loss": 0.6118, + "step": 281200 + }, + { + "epoch": 6.479690522243714, + "grad_norm": 3.3773958683013916, + "learning_rate": 2.1351638283329215e-05, + "loss": 0.6039, + "step": 281400 + }, + { + "epoch": 6.484295845997973, + "grad_norm": 2.8852438926696777, + "learning_rate": 2.1344366719506694e-05, + "loss": 0.6062, + "step": 281600 + }, + { + "epoch": 6.488901169752234, + "grad_norm": 2.8208401203155518, + "learning_rate": 2.1337131513503293e-05, + "loss": 0.6038, + "step": 281800 + }, + { + "epoch": 6.4935064935064934, + "grad_norm": 2.8988304138183594, + "learning_rate": 2.132985994968078e-05, + "loss": 0.6174, + "step": 282000 + }, + { + "epoch": 6.498111817260753, + "grad_norm": 3.9682488441467285, + "learning_rate": 2.132258838585826e-05, + "loss": 0.5891, + "step": 282200 + }, + { + "epoch": 6.5027171410150135, + "grad_norm": 2.687206745147705, + "learning_rate": 2.1315316822035747e-05, + "loss": 0.5942, + "step": 282400 + }, + { + "epoch": 6.507322464769273, + "grad_norm": 2.6542367935180664, + "learning_rate": 2.1308045258213233e-05, + "loss": 0.6074, + "step": 282600 + }, + { + "epoch": 6.511927788523533, + "grad_norm": 4.3083906173706055, + "learning_rate": 2.1300773694390716e-05, + "loss": 0.6039, + "step": 282800 + }, + { + "epoch": 6.516533112277793, + "grad_norm": 3.527921199798584, + "learning_rate": 2.1293502130568202e-05, + "loss": 0.6026, + "step": 283000 + }, + { + "epoch": 6.521138436032053, + "grad_norm": 2.9603312015533447, + "learning_rate": 2.1286230566745684e-05, + "loss": 0.6139, + "step": 283200 + }, + { + "epoch": 6.525743759786313, + "grad_norm": 3.2545838356018066, + "learning_rate": 2.1278959002923167e-05, + "loss": 0.6099, + "step": 283400 + }, + { + "epoch": 6.530349083540573, + "grad_norm": 3.675568103790283, + "learning_rate": 2.1271687439100653e-05, + "loss": 0.6022, + "step": 283600 + }, + { + "epoch": 6.5349544072948325, + "grad_norm": 2.5842385292053223, + "learning_rate": 2.126441587527814e-05, + "loss": 0.608, + "step": 283800 + }, + { + "epoch": 6.539559731049093, + "grad_norm": 3.1550087928771973, + "learning_rate": 2.1257144311455625e-05, + "loss": 0.5932, + "step": 284000 + }, + { + "epoch": 6.5441650548033525, + "grad_norm": 2.6331937313079834, + "learning_rate": 2.1249872747633107e-05, + "loss": 0.5988, + "step": 284200 + }, + { + "epoch": 6.548770378557613, + "grad_norm": 3.004626750946045, + "learning_rate": 2.124260118381059e-05, + "loss": 0.6025, + "step": 284400 + }, + { + "epoch": 6.553375702311873, + "grad_norm": 2.480337142944336, + "learning_rate": 2.1235329619988076e-05, + "loss": 0.5956, + "step": 284600 + }, + { + "epoch": 6.557981026066132, + "grad_norm": 3.277716875076294, + "learning_rate": 2.1228058056165558e-05, + "loss": 0.5961, + "step": 284800 + }, + { + "epoch": 6.562586349820393, + "grad_norm": 2.759143590927124, + "learning_rate": 2.1220786492343044e-05, + "loss": 0.5875, + "step": 285000 + }, + { + "epoch": 6.567191673574652, + "grad_norm": 2.480003833770752, + "learning_rate": 2.121351492852053e-05, + "loss": 0.5972, + "step": 285200 + }, + { + "epoch": 6.571796997328912, + "grad_norm": 3.330686569213867, + "learning_rate": 2.1206279722517125e-05, + "loss": 0.5915, + "step": 285400 + }, + { + "epoch": 6.576402321083172, + "grad_norm": 2.3041577339172363, + "learning_rate": 2.119900815869461e-05, + "loss": 0.6091, + "step": 285600 + }, + { + "epoch": 6.581007644837432, + "grad_norm": 3.41249680519104, + "learning_rate": 2.1191736594872097e-05, + "loss": 0.5871, + "step": 285800 + }, + { + "epoch": 6.5856129685916915, + "grad_norm": 2.979804277420044, + "learning_rate": 2.1184465031049576e-05, + "loss": 0.593, + "step": 286000 + }, + { + "epoch": 6.590218292345952, + "grad_norm": 3.0623714923858643, + "learning_rate": 2.1177193467227062e-05, + "loss": 0.6024, + "step": 286200 + }, + { + "epoch": 6.594823616100212, + "grad_norm": 3.924044132232666, + "learning_rate": 2.1169921903404548e-05, + "loss": 0.6108, + "step": 286400 + }, + { + "epoch": 6.599428939854472, + "grad_norm": 2.970489501953125, + "learning_rate": 2.116265033958203e-05, + "loss": 0.5991, + "step": 286600 + }, + { + "epoch": 6.604034263608732, + "grad_norm": 2.641425371170044, + "learning_rate": 2.1155378775759516e-05, + "loss": 0.6112, + "step": 286800 + }, + { + "epoch": 6.608639587362991, + "grad_norm": 2.8824870586395264, + "learning_rate": 2.1148107211937002e-05, + "loss": 0.5837, + "step": 287000 + }, + { + "epoch": 6.613244911117252, + "grad_norm": 2.681509256362915, + "learning_rate": 2.114083564811448e-05, + "loss": 0.6031, + "step": 287200 + }, + { + "epoch": 6.617850234871511, + "grad_norm": 3.070089101791382, + "learning_rate": 2.1133564084291967e-05, + "loss": 0.5896, + "step": 287400 + }, + { + "epoch": 6.622455558625772, + "grad_norm": 3.1326851844787598, + "learning_rate": 2.1126328878288566e-05, + "loss": 0.5909, + "step": 287600 + }, + { + "epoch": 6.627060882380031, + "grad_norm": 3.5522382259368896, + "learning_rate": 2.111905731446605e-05, + "loss": 0.601, + "step": 287800 + }, + { + "epoch": 6.631666206134291, + "grad_norm": 3.211503028869629, + "learning_rate": 2.1111785750643534e-05, + "loss": 0.6018, + "step": 288000 + }, + { + "epoch": 6.6362715298885515, + "grad_norm": 2.8858470916748047, + "learning_rate": 2.110451418682102e-05, + "loss": 0.6202, + "step": 288200 + }, + { + "epoch": 6.640876853642811, + "grad_norm": 3.317497730255127, + "learning_rate": 2.1097242622998503e-05, + "loss": 0.5961, + "step": 288400 + }, + { + "epoch": 6.645482177397071, + "grad_norm": 2.4950830936431885, + "learning_rate": 2.1089971059175985e-05, + "loss": 0.6002, + "step": 288600 + }, + { + "epoch": 6.650087501151331, + "grad_norm": 3.110621929168701, + "learning_rate": 2.108269949535347e-05, + "loss": 0.6046, + "step": 288800 + }, + { + "epoch": 6.654692824905591, + "grad_norm": 3.673153877258301, + "learning_rate": 2.1075427931530954e-05, + "loss": 0.5994, + "step": 289000 + }, + { + "epoch": 6.65929814865985, + "grad_norm": 3.486978530883789, + "learning_rate": 2.106815636770844e-05, + "loss": 0.5981, + "step": 289200 + }, + { + "epoch": 6.663903472414111, + "grad_norm": 3.4665493965148926, + "learning_rate": 2.1060884803885926e-05, + "loss": 0.6127, + "step": 289400 + }, + { + "epoch": 6.6685087961683704, + "grad_norm": 3.6639597415924072, + "learning_rate": 2.1053613240063408e-05, + "loss": 0.6067, + "step": 289600 + }, + { + "epoch": 6.673114119922631, + "grad_norm": 3.012031316757202, + "learning_rate": 2.1046341676240894e-05, + "loss": 0.596, + "step": 289800 + }, + { + "epoch": 6.6777194436768905, + "grad_norm": 2.717411518096924, + "learning_rate": 2.1039070112418377e-05, + "loss": 0.6002, + "step": 290000 + }, + { + "epoch": 6.68232476743115, + "grad_norm": 2.571988105773926, + "learning_rate": 2.1031798548595862e-05, + "loss": 0.5986, + "step": 290200 + }, + { + "epoch": 6.686930091185411, + "grad_norm": 2.4127187728881836, + "learning_rate": 2.1024526984773345e-05, + "loss": 0.594, + "step": 290400 + }, + { + "epoch": 6.69153541493967, + "grad_norm": 3.4181158542633057, + "learning_rate": 2.1017291778769944e-05, + "loss": 0.6023, + "step": 290600 + }, + { + "epoch": 6.696140738693931, + "grad_norm": 3.2288289070129395, + "learning_rate": 2.101005657276654e-05, + "loss": 0.6004, + "step": 290800 + }, + { + "epoch": 6.70074606244819, + "grad_norm": 3.2876336574554443, + "learning_rate": 2.1002785008944025e-05, + "loss": 0.6054, + "step": 291000 + }, + { + "epoch": 6.70535138620245, + "grad_norm": 2.789076328277588, + "learning_rate": 2.099551344512151e-05, + "loss": 0.6041, + "step": 291200 + }, + { + "epoch": 6.70995670995671, + "grad_norm": 2.86173939704895, + "learning_rate": 2.0988241881298993e-05, + "loss": 0.5944, + "step": 291400 + }, + { + "epoch": 6.71456203371097, + "grad_norm": 2.8522419929504395, + "learning_rate": 2.0980970317476476e-05, + "loss": 0.5996, + "step": 291600 + }, + { + "epoch": 6.7191673574652295, + "grad_norm": 3.7493839263916016, + "learning_rate": 2.097369875365396e-05, + "loss": 0.5959, + "step": 291800 + }, + { + "epoch": 6.72377268121949, + "grad_norm": 3.5623674392700195, + "learning_rate": 2.0966427189831444e-05, + "loss": 0.5982, + "step": 292000 + }, + { + "epoch": 6.72837800497375, + "grad_norm": 3.4356765747070312, + "learning_rate": 2.095915562600893e-05, + "loss": 0.5966, + "step": 292200 + }, + { + "epoch": 6.732983328728009, + "grad_norm": 3.1786386966705322, + "learning_rate": 2.0951884062186416e-05, + "loss": 0.6007, + "step": 292400 + }, + { + "epoch": 6.73758865248227, + "grad_norm": 2.424635410308838, + "learning_rate": 2.09446124983639e-05, + "loss": 0.6155, + "step": 292600 + }, + { + "epoch": 6.742193976236529, + "grad_norm": 3.4390957355499268, + "learning_rate": 2.0937340934541384e-05, + "loss": 0.5994, + "step": 292800 + }, + { + "epoch": 6.746799299990789, + "grad_norm": 4.376504421234131, + "learning_rate": 2.0930069370718867e-05, + "loss": 0.5985, + "step": 293000 + }, + { + "epoch": 6.751404623745049, + "grad_norm": 2.7441396713256836, + "learning_rate": 2.092279780689635e-05, + "loss": 0.6037, + "step": 293200 + }, + { + "epoch": 6.756009947499309, + "grad_norm": 2.771723985671997, + "learning_rate": 2.0915562600892948e-05, + "loss": 0.5969, + "step": 293400 + }, + { + "epoch": 6.760615271253569, + "grad_norm": 3.066767692565918, + "learning_rate": 2.0908291037070434e-05, + "loss": 0.5907, + "step": 293600 + }, + { + "epoch": 6.765220595007829, + "grad_norm": 3.7000675201416016, + "learning_rate": 2.0901019473247916e-05, + "loss": 0.5968, + "step": 293800 + }, + { + "epoch": 6.769825918762089, + "grad_norm": 2.6257290840148926, + "learning_rate": 2.0893747909425402e-05, + "loss": 0.6015, + "step": 294000 + }, + { + "epoch": 6.774431242516349, + "grad_norm": 2.972223997116089, + "learning_rate": 2.0886476345602888e-05, + "loss": 0.6104, + "step": 294200 + }, + { + "epoch": 6.779036566270609, + "grad_norm": 2.6368885040283203, + "learning_rate": 2.0879204781780367e-05, + "loss": 0.6009, + "step": 294400 + }, + { + "epoch": 6.783641890024869, + "grad_norm": 2.8268954753875732, + "learning_rate": 2.0871933217957853e-05, + "loss": 0.5894, + "step": 294600 + }, + { + "epoch": 6.788247213779129, + "grad_norm": 2.7802374362945557, + "learning_rate": 2.086466165413534e-05, + "loss": 0.6079, + "step": 294800 + }, + { + "epoch": 6.792852537533388, + "grad_norm": 3.244723320007324, + "learning_rate": 2.0857390090312822e-05, + "loss": 0.5933, + "step": 295000 + }, + { + "epoch": 6.797457861287649, + "grad_norm": 2.8554670810699463, + "learning_rate": 2.0850118526490308e-05, + "loss": 0.5893, + "step": 295200 + }, + { + "epoch": 6.802063185041908, + "grad_norm": 3.411585807800293, + "learning_rate": 2.0842846962667794e-05, + "loss": 0.5874, + "step": 295400 + }, + { + "epoch": 6.806668508796168, + "grad_norm": 3.076263904571533, + "learning_rate": 2.0835575398845276e-05, + "loss": 0.5866, + "step": 295600 + }, + { + "epoch": 6.8112738325504285, + "grad_norm": 3.0649490356445312, + "learning_rate": 2.082830383502276e-05, + "loss": 0.6006, + "step": 295800 + }, + { + "epoch": 6.815879156304688, + "grad_norm": 3.0832936763763428, + "learning_rate": 2.0821032271200245e-05, + "loss": 0.5896, + "step": 296000 + }, + { + "epoch": 6.820484480058948, + "grad_norm": 4.413976192474365, + "learning_rate": 2.081376070737773e-05, + "loss": 0.5995, + "step": 296200 + }, + { + "epoch": 6.825089803813208, + "grad_norm": 2.9547083377838135, + "learning_rate": 2.0806489143555213e-05, + "loss": 0.588, + "step": 296400 + }, + { + "epoch": 6.829695127567468, + "grad_norm": 3.1978492736816406, + "learning_rate": 2.07992175797327e-05, + "loss": 0.605, + "step": 296600 + }, + { + "epoch": 6.834300451321728, + "grad_norm": 2.5067381858825684, + "learning_rate": 2.0791946015910185e-05, + "loss": 0.5945, + "step": 296800 + }, + { + "epoch": 6.838905775075988, + "grad_norm": 2.9180538654327393, + "learning_rate": 2.0784674452087664e-05, + "loss": 0.6033, + "step": 297000 + }, + { + "epoch": 6.843511098830247, + "grad_norm": 2.9533698558807373, + "learning_rate": 2.077740288826515e-05, + "loss": 0.5826, + "step": 297200 + }, + { + "epoch": 6.848116422584508, + "grad_norm": 3.087790012359619, + "learning_rate": 2.0770131324442636e-05, + "loss": 0.6035, + "step": 297400 + }, + { + "epoch": 6.8527217463387675, + "grad_norm": 3.130922317504883, + "learning_rate": 2.076285976062012e-05, + "loss": 0.599, + "step": 297600 + }, + { + "epoch": 6.857327070093028, + "grad_norm": 3.0297648906707764, + "learning_rate": 2.0755588196797604e-05, + "loss": 0.6053, + "step": 297800 + }, + { + "epoch": 6.861932393847288, + "grad_norm": 2.8300116062164307, + "learning_rate": 2.0748352990794203e-05, + "loss": 0.6033, + "step": 298000 + }, + { + "epoch": 6.866537717601547, + "grad_norm": 3.1619298458099365, + "learning_rate": 2.0741081426971685e-05, + "loss": 0.5995, + "step": 298200 + }, + { + "epoch": 6.871143041355808, + "grad_norm": 3.3405187129974365, + "learning_rate": 2.0733809863149168e-05, + "loss": 0.5918, + "step": 298400 + }, + { + "epoch": 6.875748365110067, + "grad_norm": 2.8210740089416504, + "learning_rate": 2.0726538299326654e-05, + "loss": 0.599, + "step": 298600 + }, + { + "epoch": 6.880353688864327, + "grad_norm": 3.2792325019836426, + "learning_rate": 2.0719266735504136e-05, + "loss": 0.5899, + "step": 298800 + }, + { + "epoch": 6.884959012618587, + "grad_norm": 2.485522508621216, + "learning_rate": 2.0711995171681622e-05, + "loss": 0.5864, + "step": 299000 + }, + { + "epoch": 6.889564336372847, + "grad_norm": 2.585327625274658, + "learning_rate": 2.0704723607859108e-05, + "loss": 0.5904, + "step": 299200 + }, + { + "epoch": 6.8941696601271065, + "grad_norm": 2.3402791023254395, + "learning_rate": 2.069745204403659e-05, + "loss": 0.5845, + "step": 299400 + }, + { + "epoch": 6.898774983881367, + "grad_norm": 2.9729385375976562, + "learning_rate": 2.0690180480214073e-05, + "loss": 0.5963, + "step": 299600 + }, + { + "epoch": 6.903380307635627, + "grad_norm": 3.0972092151641846, + "learning_rate": 2.068290891639156e-05, + "loss": 0.6085, + "step": 299800 + }, + { + "epoch": 6.907985631389887, + "grad_norm": 3.688617467880249, + "learning_rate": 2.0675637352569045e-05, + "loss": 0.6013, + "step": 300000 + }, + { + "epoch": 6.912590955144147, + "grad_norm": 2.644268751144409, + "learning_rate": 2.0668365788746528e-05, + "loss": 0.5893, + "step": 300200 + }, + { + "epoch": 6.917196278898406, + "grad_norm": 3.49151873588562, + "learning_rate": 2.0661094224924013e-05, + "loss": 0.5896, + "step": 300400 + }, + { + "epoch": 6.921801602652667, + "grad_norm": 3.661813974380493, + "learning_rate": 2.06538226611015e-05, + "loss": 0.5972, + "step": 300600 + }, + { + "epoch": 6.926406926406926, + "grad_norm": 3.322665214538574, + "learning_rate": 2.0646551097278982e-05, + "loss": 0.5989, + "step": 300800 + }, + { + "epoch": 6.931012250161187, + "grad_norm": 3.0434811115264893, + "learning_rate": 2.0639279533456464e-05, + "loss": 0.5886, + "step": 301000 + }, + { + "epoch": 6.935617573915446, + "grad_norm": 3.6743383407592773, + "learning_rate": 2.063204432745306e-05, + "loss": 0.6096, + "step": 301200 + }, + { + "epoch": 6.940222897669706, + "grad_norm": 3.656269073486328, + "learning_rate": 2.0624772763630546e-05, + "loss": 0.5816, + "step": 301400 + }, + { + "epoch": 6.9448282214239665, + "grad_norm": 2.7301547527313232, + "learning_rate": 2.061750119980803e-05, + "loss": 0.5909, + "step": 301600 + }, + { + "epoch": 6.949433545178226, + "grad_norm": 2.843813896179199, + "learning_rate": 2.0610229635985517e-05, + "loss": 0.5987, + "step": 301800 + }, + { + "epoch": 6.954038868932486, + "grad_norm": 3.3553197383880615, + "learning_rate": 2.0602958072163e-05, + "loss": 0.5974, + "step": 302000 + }, + { + "epoch": 6.958644192686746, + "grad_norm": 2.880885601043701, + "learning_rate": 2.0595686508340486e-05, + "loss": 0.6037, + "step": 302200 + }, + { + "epoch": 6.963249516441006, + "grad_norm": 4.332859039306641, + "learning_rate": 2.058841494451797e-05, + "loss": 0.5863, + "step": 302400 + }, + { + "epoch": 6.967854840195265, + "grad_norm": 2.701078176498413, + "learning_rate": 2.058114338069545e-05, + "loss": 0.6154, + "step": 302600 + }, + { + "epoch": 6.972460163949526, + "grad_norm": 3.1130638122558594, + "learning_rate": 2.0573871816872937e-05, + "loss": 0.5895, + "step": 302800 + }, + { + "epoch": 6.977065487703785, + "grad_norm": 3.547954559326172, + "learning_rate": 2.0566600253050423e-05, + "loss": 0.6219, + "step": 303000 + }, + { + "epoch": 6.981670811458045, + "grad_norm": 2.777357816696167, + "learning_rate": 2.0559328689227905e-05, + "loss": 0.5969, + "step": 303200 + }, + { + "epoch": 6.9862761352123055, + "grad_norm": 3.05338978767395, + "learning_rate": 2.055205712540539e-05, + "loss": 0.5945, + "step": 303400 + }, + { + "epoch": 6.990881458966565, + "grad_norm": 2.9107885360717773, + "learning_rate": 2.0544785561582877e-05, + "loss": 0.5978, + "step": 303600 + }, + { + "epoch": 6.995486782720826, + "grad_norm": 3.5367624759674072, + "learning_rate": 2.0537550355579472e-05, + "loss": 0.6023, + "step": 303800 + }, + { + "epoch": 7.0, + "eval_loss": 0.567062258720398, + "eval_runtime": 145.2794, + "eval_samples_per_second": 195.217, + "eval_steps_per_second": 12.204, + "step": 303996 + }, + { + "epoch": 7.000092106475085, + "grad_norm": 4.047206878662109, + "learning_rate": 2.0530278791756955e-05, + "loss": 0.5904, + "step": 304000 + }, + { + "epoch": 7.004697430229345, + "grad_norm": 2.5840604305267334, + "learning_rate": 2.052300722793444e-05, + "loss": 0.5862, + "step": 304200 + }, + { + "epoch": 7.009302753983605, + "grad_norm": 3.08518385887146, + "learning_rate": 2.0515735664111923e-05, + "loss": 0.5918, + "step": 304400 + }, + { + "epoch": 7.013908077737865, + "grad_norm": 2.9152956008911133, + "learning_rate": 2.050846410028941e-05, + "loss": 0.5858, + "step": 304600 + }, + { + "epoch": 7.018513401492125, + "grad_norm": 3.358140468597412, + "learning_rate": 2.0501192536466895e-05, + "loss": 0.5958, + "step": 304800 + }, + { + "epoch": 7.023118725246385, + "grad_norm": 2.961735248565674, + "learning_rate": 2.0493920972644378e-05, + "loss": 0.5929, + "step": 305000 + }, + { + "epoch": 7.0277240490006445, + "grad_norm": 2.8176980018615723, + "learning_rate": 2.048664940882186e-05, + "loss": 0.5862, + "step": 305200 + }, + { + "epoch": 7.032329372754905, + "grad_norm": 3.275162696838379, + "learning_rate": 2.0479377844999346e-05, + "loss": 0.5934, + "step": 305400 + }, + { + "epoch": 7.036934696509165, + "grad_norm": 3.5453879833221436, + "learning_rate": 2.047210628117683e-05, + "loss": 0.588, + "step": 305600 + }, + { + "epoch": 7.041540020263424, + "grad_norm": 2.8470346927642822, + "learning_rate": 2.0464834717354314e-05, + "loss": 0.597, + "step": 305800 + }, + { + "epoch": 7.046145344017685, + "grad_norm": 2.6055123805999756, + "learning_rate": 2.0457599511350913e-05, + "loss": 0.591, + "step": 306000 + }, + { + "epoch": 7.050750667771944, + "grad_norm": 2.6110992431640625, + "learning_rate": 2.0450327947528396e-05, + "loss": 0.5906, + "step": 306200 + }, + { + "epoch": 7.055355991526204, + "grad_norm": 3.3166911602020264, + "learning_rate": 2.044305638370588e-05, + "loss": 0.582, + "step": 306400 + }, + { + "epoch": 7.059961315280464, + "grad_norm": 2.7513551712036133, + "learning_rate": 2.0435784819883364e-05, + "loss": 0.5985, + "step": 306600 + }, + { + "epoch": 7.064566639034724, + "grad_norm": 3.547184944152832, + "learning_rate": 2.0428513256060847e-05, + "loss": 0.6041, + "step": 306800 + }, + { + "epoch": 7.069171962788984, + "grad_norm": 3.074101686477661, + "learning_rate": 2.0421241692238332e-05, + "loss": 0.5915, + "step": 307000 + }, + { + "epoch": 7.073777286543244, + "grad_norm": 3.347473621368408, + "learning_rate": 2.041397012841582e-05, + "loss": 0.5927, + "step": 307200 + }, + { + "epoch": 7.078382610297504, + "grad_norm": 3.3471734523773193, + "learning_rate": 2.04066985645933e-05, + "loss": 0.5954, + "step": 307400 + }, + { + "epoch": 7.082987934051764, + "grad_norm": 2.5890464782714844, + "learning_rate": 2.0399427000770787e-05, + "loss": 0.6003, + "step": 307600 + }, + { + "epoch": 7.087593257806024, + "grad_norm": 2.5832886695861816, + "learning_rate": 2.0392155436948273e-05, + "loss": 0.5964, + "step": 307800 + }, + { + "epoch": 7.092198581560283, + "grad_norm": 3.1692888736724854, + "learning_rate": 2.0384883873125755e-05, + "loss": 0.5985, + "step": 308000 + }, + { + "epoch": 7.096803905314544, + "grad_norm": 2.8189423084259033, + "learning_rate": 2.0377612309303238e-05, + "loss": 0.5802, + "step": 308200 + }, + { + "epoch": 7.101409229068803, + "grad_norm": 2.9584109783172607, + "learning_rate": 2.0370340745480724e-05, + "loss": 0.5949, + "step": 308400 + }, + { + "epoch": 7.106014552823064, + "grad_norm": 2.7427165508270264, + "learning_rate": 2.036306918165821e-05, + "loss": 0.59, + "step": 308600 + }, + { + "epoch": 7.110619876577323, + "grad_norm": 3.3998868465423584, + "learning_rate": 2.0355797617835692e-05, + "loss": 0.5907, + "step": 308800 + }, + { + "epoch": 7.115225200331583, + "grad_norm": 3.8071186542510986, + "learning_rate": 2.0348526054013178e-05, + "loss": 0.5865, + "step": 309000 + }, + { + "epoch": 7.1198305240858435, + "grad_norm": 2.8909687995910645, + "learning_rate": 2.0341290848009773e-05, + "loss": 0.5883, + "step": 309200 + }, + { + "epoch": 7.124435847840103, + "grad_norm": 2.901247262954712, + "learning_rate": 2.0334019284187256e-05, + "loss": 0.6004, + "step": 309400 + }, + { + "epoch": 7.129041171594363, + "grad_norm": 3.5564959049224854, + "learning_rate": 2.0326747720364742e-05, + "loss": 0.5866, + "step": 309600 + }, + { + "epoch": 7.133646495348623, + "grad_norm": 3.2295401096343994, + "learning_rate": 2.0319476156542228e-05, + "loss": 0.6012, + "step": 309800 + }, + { + "epoch": 7.138251819102883, + "grad_norm": 2.9399068355560303, + "learning_rate": 2.031220459271971e-05, + "loss": 0.6112, + "step": 310000 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 3.2517316341400146, + "learning_rate": 2.0304933028897196e-05, + "loss": 0.5686, + "step": 310200 + }, + { + "epoch": 7.147462466611403, + "grad_norm": 2.8699967861175537, + "learning_rate": 2.0297661465074682e-05, + "loss": 0.5826, + "step": 310400 + }, + { + "epoch": 7.152067790365662, + "grad_norm": 3.1824159622192383, + "learning_rate": 2.0290389901252165e-05, + "loss": 0.5878, + "step": 310600 + }, + { + "epoch": 7.156673114119923, + "grad_norm": 2.710801839828491, + "learning_rate": 2.0283118337429647e-05, + "loss": 0.5989, + "step": 310800 + }, + { + "epoch": 7.1612784378741825, + "grad_norm": 3.2563531398773193, + "learning_rate": 2.0275846773607133e-05, + "loss": 0.5956, + "step": 311000 + }, + { + "epoch": 7.165883761628442, + "grad_norm": 3.0747923851013184, + "learning_rate": 2.0268575209784615e-05, + "loss": 0.5924, + "step": 311200 + }, + { + "epoch": 7.170489085382703, + "grad_norm": 3.374802589416504, + "learning_rate": 2.02613036459621e-05, + "loss": 0.594, + "step": 311400 + }, + { + "epoch": 7.175094409136962, + "grad_norm": 3.170912504196167, + "learning_rate": 2.0254032082139587e-05, + "loss": 0.5821, + "step": 311600 + }, + { + "epoch": 7.179699732891223, + "grad_norm": 2.800078868865967, + "learning_rate": 2.024676051831707e-05, + "loss": 0.5837, + "step": 311800 + }, + { + "epoch": 7.184305056645482, + "grad_norm": 4.168891906738281, + "learning_rate": 2.0239488954494552e-05, + "loss": 0.5977, + "step": 312000 + }, + { + "epoch": 7.188910380399742, + "grad_norm": 4.314828872680664, + "learning_rate": 2.023225374849115e-05, + "loss": 0.5871, + "step": 312200 + }, + { + "epoch": 7.193515704154002, + "grad_norm": 2.661975383758545, + "learning_rate": 2.0224982184668633e-05, + "loss": 0.6009, + "step": 312400 + }, + { + "epoch": 7.198121027908262, + "grad_norm": 2.217339277267456, + "learning_rate": 2.021771062084612e-05, + "loss": 0.5916, + "step": 312600 + }, + { + "epoch": 7.2027263516625215, + "grad_norm": 3.5877535343170166, + "learning_rate": 2.0210439057023605e-05, + "loss": 0.5889, + "step": 312800 + }, + { + "epoch": 7.207331675416782, + "grad_norm": 3.6511406898498535, + "learning_rate": 2.0203167493201088e-05, + "loss": 0.5906, + "step": 313000 + }, + { + "epoch": 7.211936999171042, + "grad_norm": 3.414182186126709, + "learning_rate": 2.0195895929378574e-05, + "loss": 0.5968, + "step": 313200 + }, + { + "epoch": 7.216542322925302, + "grad_norm": 3.0940256118774414, + "learning_rate": 2.0188624365556056e-05, + "loss": 0.5972, + "step": 313400 + }, + { + "epoch": 7.221147646679562, + "grad_norm": 2.9856083393096924, + "learning_rate": 2.018135280173354e-05, + "loss": 0.5918, + "step": 313600 + }, + { + "epoch": 7.225752970433821, + "grad_norm": 3.411344528198242, + "learning_rate": 2.0174081237911025e-05, + "loss": 0.5816, + "step": 313800 + }, + { + "epoch": 7.230358294188082, + "grad_norm": 3.6355364322662354, + "learning_rate": 2.016680967408851e-05, + "loss": 0.6008, + "step": 314000 + }, + { + "epoch": 7.234963617942341, + "grad_norm": 3.124440908432007, + "learning_rate": 2.0159538110265993e-05, + "loss": 0.5892, + "step": 314200 + }, + { + "epoch": 7.239568941696601, + "grad_norm": 3.7005481719970703, + "learning_rate": 2.015226654644348e-05, + "loss": 0.5834, + "step": 314400 + }, + { + "epoch": 7.244174265450861, + "grad_norm": 2.2314252853393555, + "learning_rate": 2.0144994982620965e-05, + "loss": 0.5888, + "step": 314600 + }, + { + "epoch": 7.248779589205121, + "grad_norm": 3.4673757553100586, + "learning_rate": 2.0137723418798448e-05, + "loss": 0.6033, + "step": 314800 + }, + { + "epoch": 7.2533849129593815, + "grad_norm": 3.7593634128570557, + "learning_rate": 2.013045185497593e-05, + "loss": 0.6022, + "step": 315000 + }, + { + "epoch": 7.257990236713641, + "grad_norm": 3.1982200145721436, + "learning_rate": 2.0123180291153416e-05, + "loss": 0.5823, + "step": 315200 + }, + { + "epoch": 7.262595560467901, + "grad_norm": 2.9602959156036377, + "learning_rate": 2.0115908727330902e-05, + "loss": 0.5856, + "step": 315400 + }, + { + "epoch": 7.267200884222161, + "grad_norm": 3.3312976360321045, + "learning_rate": 2.0108637163508384e-05, + "loss": 0.5826, + "step": 315600 + }, + { + "epoch": 7.271806207976421, + "grad_norm": 3.437633514404297, + "learning_rate": 2.010136559968587e-05, + "loss": 0.5889, + "step": 315800 + }, + { + "epoch": 7.27641153173068, + "grad_norm": 4.305769443511963, + "learning_rate": 2.0094094035863353e-05, + "loss": 0.5868, + "step": 316000 + }, + { + "epoch": 7.281016855484941, + "grad_norm": 3.3740241527557373, + "learning_rate": 2.0086822472040835e-05, + "loss": 0.5826, + "step": 316200 + }, + { + "epoch": 7.2856221792392, + "grad_norm": 2.570251226425171, + "learning_rate": 2.0079623623856547e-05, + "loss": 0.5914, + "step": 316400 + }, + { + "epoch": 7.29022750299346, + "grad_norm": 4.058407783508301, + "learning_rate": 2.007235206003403e-05, + "loss": 0.6015, + "step": 316600 + }, + { + "epoch": 7.2948328267477205, + "grad_norm": 3.7470202445983887, + "learning_rate": 2.0065080496211515e-05, + "loss": 0.5866, + "step": 316800 + }, + { + "epoch": 7.29943815050198, + "grad_norm": 3.243685007095337, + "learning_rate": 2.0057808932389e-05, + "loss": 0.5829, + "step": 317000 + }, + { + "epoch": 7.304043474256241, + "grad_norm": 3.5165421962738037, + "learning_rate": 2.0050537368566484e-05, + "loss": 0.5883, + "step": 317200 + }, + { + "epoch": 7.3086487980105, + "grad_norm": 2.5565083026885986, + "learning_rate": 2.004326580474397e-05, + "loss": 0.5944, + "step": 317400 + }, + { + "epoch": 7.31325412176476, + "grad_norm": 2.588308095932007, + "learning_rate": 2.0035994240921455e-05, + "loss": 0.5903, + "step": 317600 + }, + { + "epoch": 7.31785944551902, + "grad_norm": 3.13081431388855, + "learning_rate": 2.0028722677098938e-05, + "loss": 0.5962, + "step": 317800 + }, + { + "epoch": 7.32246476927328, + "grad_norm": 3.2789556980133057, + "learning_rate": 2.002145111327642e-05, + "loss": 0.5912, + "step": 318000 + }, + { + "epoch": 7.327070093027539, + "grad_norm": 2.885380744934082, + "learning_rate": 2.0014179549453906e-05, + "loss": 0.5992, + "step": 318200 + }, + { + "epoch": 7.3316754167818, + "grad_norm": 2.445709705352783, + "learning_rate": 2.0006907985631392e-05, + "loss": 0.5915, + "step": 318400 + }, + { + "epoch": 7.3362807405360595, + "grad_norm": 4.696218967437744, + "learning_rate": 1.9999636421808875e-05, + "loss": 0.602, + "step": 318600 + }, + { + "epoch": 7.34088606429032, + "grad_norm": 2.6832456588745117, + "learning_rate": 1.999236485798636e-05, + "loss": 0.5936, + "step": 318800 + }, + { + "epoch": 7.34549138804458, + "grad_norm": 2.7151763439178467, + "learning_rate": 1.9985093294163843e-05, + "loss": 0.5932, + "step": 319000 + }, + { + "epoch": 7.350096711798839, + "grad_norm": 2.803915500640869, + "learning_rate": 1.9977821730341326e-05, + "loss": 0.6016, + "step": 319200 + }, + { + "epoch": 7.3547020355531, + "grad_norm": 2.600085973739624, + "learning_rate": 1.997055016651881e-05, + "loss": 0.5952, + "step": 319400 + }, + { + "epoch": 7.359307359307359, + "grad_norm": 3.053616523742676, + "learning_rate": 1.9963278602696298e-05, + "loss": 0.5924, + "step": 319600 + }, + { + "epoch": 7.363912683061619, + "grad_norm": 2.9440255165100098, + "learning_rate": 1.995600703887378e-05, + "loss": 0.5995, + "step": 319800 + }, + { + "epoch": 7.368518006815879, + "grad_norm": 3.2415082454681396, + "learning_rate": 1.9948735475051266e-05, + "loss": 0.5919, + "step": 320000 + }, + { + "epoch": 7.373123330570139, + "grad_norm": 2.619912624359131, + "learning_rate": 1.994146391122875e-05, + "loss": 0.5929, + "step": 320200 + }, + { + "epoch": 7.377728654324399, + "grad_norm": 3.2725462913513184, + "learning_rate": 1.993419234740623e-05, + "loss": 0.5849, + "step": 320400 + }, + { + "epoch": 7.382333978078659, + "grad_norm": 4.5242743492126465, + "learning_rate": 1.9926920783583717e-05, + "loss": 0.5891, + "step": 320600 + }, + { + "epoch": 7.386939301832919, + "grad_norm": 3.2568304538726807, + "learning_rate": 1.9919649219761203e-05, + "loss": 0.5892, + "step": 320800 + }, + { + "epoch": 7.391544625587179, + "grad_norm": 3.1149260997772217, + "learning_rate": 1.9912414013757798e-05, + "loss": 0.5955, + "step": 321000 + }, + { + "epoch": 7.396149949341439, + "grad_norm": 3.272343158721924, + "learning_rate": 1.9905142449935284e-05, + "loss": 0.5938, + "step": 321200 + }, + { + "epoch": 7.400755273095698, + "grad_norm": 2.8861818313598633, + "learning_rate": 1.989787088611277e-05, + "loss": 0.5973, + "step": 321400 + }, + { + "epoch": 7.405360596849959, + "grad_norm": 4.322711944580078, + "learning_rate": 1.9890599322290252e-05, + "loss": 0.5979, + "step": 321600 + }, + { + "epoch": 7.409965920604218, + "grad_norm": 3.2034659385681152, + "learning_rate": 1.9883327758467735e-05, + "loss": 0.5897, + "step": 321800 + }, + { + "epoch": 7.414571244358479, + "grad_norm": 2.6252517700195312, + "learning_rate": 1.987605619464522e-05, + "loss": 0.593, + "step": 322000 + }, + { + "epoch": 7.419176568112738, + "grad_norm": 2.79892897605896, + "learning_rate": 1.9868784630822703e-05, + "loss": 0.5988, + "step": 322200 + }, + { + "epoch": 7.423781891866998, + "grad_norm": 4.176153659820557, + "learning_rate": 1.986151306700019e-05, + "loss": 0.5828, + "step": 322400 + }, + { + "epoch": 7.4283872156212585, + "grad_norm": 4.084776878356934, + "learning_rate": 1.9854241503177675e-05, + "loss": 0.5949, + "step": 322600 + }, + { + "epoch": 7.432992539375518, + "grad_norm": 3.2946360111236572, + "learning_rate": 1.984696993935516e-05, + "loss": 0.5911, + "step": 322800 + }, + { + "epoch": 7.437597863129778, + "grad_norm": 2.784147262573242, + "learning_rate": 1.983969837553264e-05, + "loss": 0.5864, + "step": 323000 + }, + { + "epoch": 7.442203186884038, + "grad_norm": 3.3848652839660645, + "learning_rate": 1.9832426811710126e-05, + "loss": 0.579, + "step": 323200 + }, + { + "epoch": 7.446808510638298, + "grad_norm": 3.4433703422546387, + "learning_rate": 1.9825155247887612e-05, + "loss": 0.5887, + "step": 323400 + }, + { + "epoch": 7.451413834392557, + "grad_norm": 3.3405020236968994, + "learning_rate": 1.9817920041884207e-05, + "loss": 0.5876, + "step": 323600 + }, + { + "epoch": 7.456019158146818, + "grad_norm": 3.7312204837799072, + "learning_rate": 1.9810648478061693e-05, + "loss": 0.5955, + "step": 323800 + }, + { + "epoch": 7.460624481901077, + "grad_norm": 4.358093738555908, + "learning_rate": 1.9803376914239176e-05, + "loss": 0.5866, + "step": 324000 + }, + { + "epoch": 7.465229805655338, + "grad_norm": 2.68795108795166, + "learning_rate": 1.979610535041666e-05, + "loss": 0.6004, + "step": 324200 + }, + { + "epoch": 7.4698351294095975, + "grad_norm": 2.8480021953582764, + "learning_rate": 1.978887014441326e-05, + "loss": 0.5983, + "step": 324400 + }, + { + "epoch": 7.474440453163857, + "grad_norm": 4.532684803009033, + "learning_rate": 1.9781598580590743e-05, + "loss": 0.5917, + "step": 324600 + }, + { + "epoch": 7.479045776918118, + "grad_norm": 3.242600202560425, + "learning_rate": 1.9774327016768225e-05, + "loss": 0.5981, + "step": 324800 + }, + { + "epoch": 7.483651100672377, + "grad_norm": 3.071572780609131, + "learning_rate": 1.976705545294571e-05, + "loss": 0.5892, + "step": 325000 + }, + { + "epoch": 7.488256424426638, + "grad_norm": 3.3507227897644043, + "learning_rate": 1.9759783889123194e-05, + "loss": 0.593, + "step": 325200 + }, + { + "epoch": 7.492861748180897, + "grad_norm": 3.0197222232818604, + "learning_rate": 1.975251232530068e-05, + "loss": 0.598, + "step": 325400 + }, + { + "epoch": 7.497467071935157, + "grad_norm": 2.7512412071228027, + "learning_rate": 1.9745240761478166e-05, + "loss": 0.5942, + "step": 325600 + }, + { + "epoch": 7.502072395689417, + "grad_norm": 3.136456251144409, + "learning_rate": 1.9737969197655648e-05, + "loss": 0.5828, + "step": 325800 + }, + { + "epoch": 7.506677719443677, + "grad_norm": 4.431463241577148, + "learning_rate": 1.973069763383313e-05, + "loss": 0.6002, + "step": 326000 + }, + { + "epoch": 7.5112830431979365, + "grad_norm": 2.8027350902557373, + "learning_rate": 1.9723426070010617e-05, + "loss": 0.5935, + "step": 326200 + }, + { + "epoch": 7.515888366952197, + "grad_norm": 4.174149990081787, + "learning_rate": 1.9716154506188102e-05, + "loss": 0.5837, + "step": 326400 + }, + { + "epoch": 7.520493690706457, + "grad_norm": 4.200664043426514, + "learning_rate": 1.9708919300184698e-05, + "loss": 0.6054, + "step": 326600 + }, + { + "epoch": 7.525099014460716, + "grad_norm": 2.414337158203125, + "learning_rate": 1.9701647736362184e-05, + "loss": 0.5915, + "step": 326800 + }, + { + "epoch": 7.529704338214977, + "grad_norm": 2.9279587268829346, + "learning_rate": 1.9694376172539666e-05, + "loss": 0.5921, + "step": 327000 + }, + { + "epoch": 7.534309661969236, + "grad_norm": 3.7588605880737305, + "learning_rate": 1.9687104608717152e-05, + "loss": 0.5839, + "step": 327200 + }, + { + "epoch": 7.538914985723497, + "grad_norm": 3.5354812145233154, + "learning_rate": 1.9679833044894635e-05, + "loss": 0.5954, + "step": 327400 + }, + { + "epoch": 7.543520309477756, + "grad_norm": 3.489088773727417, + "learning_rate": 1.967256148107212e-05, + "loss": 0.6005, + "step": 327600 + }, + { + "epoch": 7.548125633232016, + "grad_norm": 3.0802366733551025, + "learning_rate": 1.9665289917249603e-05, + "loss": 0.5928, + "step": 327800 + }, + { + "epoch": 7.552730956986276, + "grad_norm": 3.6488418579101562, + "learning_rate": 1.965801835342709e-05, + "loss": 0.5977, + "step": 328000 + }, + { + "epoch": 7.557336280740536, + "grad_norm": 3.7551910877227783, + "learning_rate": 1.9650746789604575e-05, + "loss": 0.5832, + "step": 328200 + }, + { + "epoch": 7.5619416044947965, + "grad_norm": 2.494746685028076, + "learning_rate": 1.9643475225782057e-05, + "loss": 0.5867, + "step": 328400 + }, + { + "epoch": 7.566546928249056, + "grad_norm": 3.4806318283081055, + "learning_rate": 1.9636240019778656e-05, + "loss": 0.5818, + "step": 328600 + }, + { + "epoch": 7.571152252003316, + "grad_norm": 3.278502941131592, + "learning_rate": 1.962896845595614e-05, + "loss": 0.5864, + "step": 328800 + }, + { + "epoch": 7.575757575757576, + "grad_norm": 3.046190023422241, + "learning_rate": 1.962169689213362e-05, + "loss": 0.5904, + "step": 329000 + }, + { + "epoch": 7.580362899511836, + "grad_norm": 2.7647550106048584, + "learning_rate": 1.9614425328311107e-05, + "loss": 0.6045, + "step": 329200 + }, + { + "epoch": 7.584968223266095, + "grad_norm": 3.201040506362915, + "learning_rate": 1.9607153764488593e-05, + "loss": 0.5932, + "step": 329400 + }, + { + "epoch": 7.589573547020356, + "grad_norm": 4.180473327636719, + "learning_rate": 1.9599882200666075e-05, + "loss": 0.5883, + "step": 329600 + }, + { + "epoch": 7.594178870774615, + "grad_norm": 2.905902862548828, + "learning_rate": 1.959261063684356e-05, + "loss": 0.593, + "step": 329800 + }, + { + "epoch": 7.598784194528875, + "grad_norm": 3.984675884246826, + "learning_rate": 1.9585339073021047e-05, + "loss": 0.6001, + "step": 330000 + }, + { + "epoch": 7.6033895182831355, + "grad_norm": 3.31915545463562, + "learning_rate": 1.9578067509198526e-05, + "loss": 0.5868, + "step": 330200 + }, + { + "epoch": 7.607994842037395, + "grad_norm": 2.907419204711914, + "learning_rate": 1.9570795945376012e-05, + "loss": 0.5885, + "step": 330400 + }, + { + "epoch": 7.612600165791655, + "grad_norm": 3.3257858753204346, + "learning_rate": 1.9563524381553498e-05, + "loss": 0.5821, + "step": 330600 + }, + { + "epoch": 7.617205489545915, + "grad_norm": 3.4570491313934326, + "learning_rate": 1.955625281773098e-05, + "loss": 0.5999, + "step": 330800 + }, + { + "epoch": 7.621810813300175, + "grad_norm": 3.399859666824341, + "learning_rate": 1.9548981253908467e-05, + "loss": 0.5977, + "step": 331000 + }, + { + "epoch": 7.626416137054435, + "grad_norm": 3.104308605194092, + "learning_rate": 1.9541709690085952e-05, + "loss": 0.5842, + "step": 331200 + }, + { + "epoch": 7.631021460808695, + "grad_norm": 3.126985788345337, + "learning_rate": 1.953443812626343e-05, + "loss": 0.6003, + "step": 331400 + }, + { + "epoch": 7.635626784562954, + "grad_norm": 2.7562856674194336, + "learning_rate": 1.9527166562440918e-05, + "loss": 0.5831, + "step": 331600 + }, + { + "epoch": 7.640232108317215, + "grad_norm": 2.559391975402832, + "learning_rate": 1.9519894998618403e-05, + "loss": 0.5963, + "step": 331800 + }, + { + "epoch": 7.6448374320714745, + "grad_norm": 3.1173386573791504, + "learning_rate": 1.9512659792615e-05, + "loss": 0.6023, + "step": 332000 + }, + { + "epoch": 7.649442755825735, + "grad_norm": 2.7219719886779785, + "learning_rate": 1.9505388228792485e-05, + "loss": 0.5869, + "step": 332200 + }, + { + "epoch": 7.654048079579995, + "grad_norm": 2.787108898162842, + "learning_rate": 1.949811666496997e-05, + "loss": 0.6002, + "step": 332400 + }, + { + "epoch": 7.658653403334254, + "grad_norm": 2.7488784790039062, + "learning_rate": 1.9490845101147453e-05, + "loss": 0.5946, + "step": 332600 + }, + { + "epoch": 7.663258727088515, + "grad_norm": 3.056337833404541, + "learning_rate": 1.948357353732494e-05, + "loss": 0.5933, + "step": 332800 + }, + { + "epoch": 7.667864050842774, + "grad_norm": 3.6845805644989014, + "learning_rate": 1.947630197350242e-05, + "loss": 0.5947, + "step": 333000 + }, + { + "epoch": 7.672469374597034, + "grad_norm": 2.5756258964538574, + "learning_rate": 1.9469030409679904e-05, + "loss": 0.5984, + "step": 333200 + }, + { + "epoch": 7.677074698351294, + "grad_norm": 3.2758781909942627, + "learning_rate": 1.946175884585739e-05, + "loss": 0.5955, + "step": 333400 + }, + { + "epoch": 7.681680022105554, + "grad_norm": 3.465355634689331, + "learning_rate": 1.9454487282034876e-05, + "loss": 0.5903, + "step": 333600 + }, + { + "epoch": 7.6862853458598135, + "grad_norm": 3.2135512828826904, + "learning_rate": 1.944721571821236e-05, + "loss": 0.5864, + "step": 333800 + }, + { + "epoch": 7.690890669614074, + "grad_norm": 2.3275580406188965, + "learning_rate": 1.9439944154389844e-05, + "loss": 0.5797, + "step": 334000 + }, + { + "epoch": 7.695495993368334, + "grad_norm": 2.834540843963623, + "learning_rate": 1.9432672590567327e-05, + "loss": 0.5891, + "step": 334200 + }, + { + "epoch": 7.700101317122594, + "grad_norm": 2.680785894393921, + "learning_rate": 1.9425401026744813e-05, + "loss": 0.5916, + "step": 334400 + }, + { + "epoch": 7.704706640876854, + "grad_norm": 4.272080421447754, + "learning_rate": 1.9418129462922295e-05, + "loss": 0.5999, + "step": 334600 + }, + { + "epoch": 7.709311964631113, + "grad_norm": 2.508699417114258, + "learning_rate": 1.941085789909978e-05, + "loss": 0.5903, + "step": 334800 + }, + { + "epoch": 7.713917288385374, + "grad_norm": 4.05361270904541, + "learning_rate": 1.9403586335277267e-05, + "loss": 0.5947, + "step": 335000 + }, + { + "epoch": 7.718522612139633, + "grad_norm": 2.704547882080078, + "learning_rate": 1.939631477145475e-05, + "loss": 0.5867, + "step": 335200 + }, + { + "epoch": 7.723127935893894, + "grad_norm": 2.987114667892456, + "learning_rate": 1.9389043207632235e-05, + "loss": 0.5883, + "step": 335400 + }, + { + "epoch": 7.727733259648153, + "grad_norm": 2.5852272510528564, + "learning_rate": 1.9381771643809718e-05, + "loss": 0.6022, + "step": 335600 + }, + { + "epoch": 7.732338583402413, + "grad_norm": 2.52404522895813, + "learning_rate": 1.93745000799872e-05, + "loss": 0.5791, + "step": 335800 + }, + { + "epoch": 7.7369439071566735, + "grad_norm": 2.9356820583343506, + "learning_rate": 1.9367228516164686e-05, + "loss": 0.5922, + "step": 336000 + }, + { + "epoch": 7.741549230910933, + "grad_norm": 4.60886287689209, + "learning_rate": 1.9359956952342172e-05, + "loss": 0.583, + "step": 336200 + }, + { + "epoch": 7.746154554665193, + "grad_norm": 4.0157856941223145, + "learning_rate": 1.9352685388519655e-05, + "loss": 0.5979, + "step": 336400 + }, + { + "epoch": 7.750759878419453, + "grad_norm": 2.8278868198394775, + "learning_rate": 1.934541382469714e-05, + "loss": 0.603, + "step": 336600 + }, + { + "epoch": 7.755365202173713, + "grad_norm": 2.852208375930786, + "learning_rate": 1.9338142260874623e-05, + "loss": 0.5775, + "step": 336800 + }, + { + "epoch": 7.759970525927972, + "grad_norm": 3.623089075088501, + "learning_rate": 1.933087069705211e-05, + "loss": 0.589, + "step": 337000 + }, + { + "epoch": 7.764575849682233, + "grad_norm": 2.7371058464050293, + "learning_rate": 1.9323635491048704e-05, + "loss": 0.5884, + "step": 337200 + }, + { + "epoch": 7.769181173436492, + "grad_norm": 4.192208290100098, + "learning_rate": 1.931636392722619e-05, + "loss": 0.5906, + "step": 337400 + }, + { + "epoch": 7.773786497190753, + "grad_norm": 4.169963836669922, + "learning_rate": 1.9309128721222786e-05, + "loss": 0.6003, + "step": 337600 + }, + { + "epoch": 7.7783918209450125, + "grad_norm": 3.134167432785034, + "learning_rate": 1.930185715740027e-05, + "loss": 0.6032, + "step": 337800 + }, + { + "epoch": 7.782997144699272, + "grad_norm": 3.1043667793273926, + "learning_rate": 1.9294585593577757e-05, + "loss": 0.5872, + "step": 338000 + }, + { + "epoch": 7.7876024684535325, + "grad_norm": 2.8810665607452393, + "learning_rate": 1.928731402975524e-05, + "loss": 0.5848, + "step": 338200 + }, + { + "epoch": 7.792207792207792, + "grad_norm": 2.3954150676727295, + "learning_rate": 1.9280042465932722e-05, + "loss": 0.587, + "step": 338400 + }, + { + "epoch": 7.796813115962053, + "grad_norm": 3.5883309841156006, + "learning_rate": 1.927277090211021e-05, + "loss": 0.5954, + "step": 338600 + }, + { + "epoch": 7.801418439716312, + "grad_norm": 2.6461853981018066, + "learning_rate": 1.926549933828769e-05, + "loss": 0.5901, + "step": 338800 + }, + { + "epoch": 7.806023763470572, + "grad_norm": 3.625852108001709, + "learning_rate": 1.9258227774465177e-05, + "loss": 0.5784, + "step": 339000 + }, + { + "epoch": 7.810629087224832, + "grad_norm": 3.1065762042999268, + "learning_rate": 1.9250956210642663e-05, + "loss": 0.5917, + "step": 339200 + }, + { + "epoch": 7.815234410979092, + "grad_norm": 2.77862286567688, + "learning_rate": 1.9243684646820145e-05, + "loss": 0.5922, + "step": 339400 + }, + { + "epoch": 7.8198397347333515, + "grad_norm": 4.303182601928711, + "learning_rate": 1.923641308299763e-05, + "loss": 0.5842, + "step": 339600 + }, + { + "epoch": 7.824445058487612, + "grad_norm": 3.4562771320343018, + "learning_rate": 1.9229141519175114e-05, + "loss": 0.6, + "step": 339800 + }, + { + "epoch": 7.8290503822418716, + "grad_norm": 2.983295440673828, + "learning_rate": 1.9221869955352596e-05, + "loss": 0.6038, + "step": 340000 + }, + { + "epoch": 7.833655705996131, + "grad_norm": 3.2224607467651367, + "learning_rate": 1.9214598391530082e-05, + "loss": 0.5834, + "step": 340200 + }, + { + "epoch": 7.838261029750392, + "grad_norm": 3.168325185775757, + "learning_rate": 1.9207326827707568e-05, + "loss": 0.5772, + "step": 340400 + }, + { + "epoch": 7.842866353504651, + "grad_norm": 2.240220546722412, + "learning_rate": 1.9200055263885054e-05, + "loss": 0.592, + "step": 340600 + }, + { + "epoch": 7.847471677258911, + "grad_norm": 3.125988245010376, + "learning_rate": 1.9192783700062536e-05, + "loss": 0.579, + "step": 340800 + }, + { + "epoch": 7.852077001013171, + "grad_norm": 3.029654026031494, + "learning_rate": 1.918551213624002e-05, + "loss": 0.6008, + "step": 341000 + }, + { + "epoch": 7.856682324767431, + "grad_norm": 3.496350049972534, + "learning_rate": 1.9178240572417505e-05, + "loss": 0.584, + "step": 341200 + }, + { + "epoch": 7.861287648521691, + "grad_norm": 3.7270963191986084, + "learning_rate": 1.9170969008594987e-05, + "loss": 0.5942, + "step": 341400 + }, + { + "epoch": 7.865892972275951, + "grad_norm": 3.109341621398926, + "learning_rate": 1.9163697444772473e-05, + "loss": 0.5729, + "step": 341600 + }, + { + "epoch": 7.870498296030211, + "grad_norm": 2.5632474422454834, + "learning_rate": 1.915642588094996e-05, + "loss": 0.5945, + "step": 341800 + }, + { + "epoch": 7.875103619784471, + "grad_norm": 2.6529979705810547, + "learning_rate": 1.9149154317127442e-05, + "loss": 0.5764, + "step": 342000 + }, + { + "epoch": 7.879708943538731, + "grad_norm": 2.466392993927002, + "learning_rate": 1.9141882753304928e-05, + "loss": 0.5857, + "step": 342200 + }, + { + "epoch": 7.884314267292991, + "grad_norm": 4.523742198944092, + "learning_rate": 1.913461118948241e-05, + "loss": 0.5771, + "step": 342400 + }, + { + "epoch": 7.888919591047251, + "grad_norm": 3.0389695167541504, + "learning_rate": 1.9127339625659893e-05, + "loss": 0.587, + "step": 342600 + }, + { + "epoch": 7.89352491480151, + "grad_norm": 3.1406664848327637, + "learning_rate": 1.912006806183738e-05, + "loss": 0.5812, + "step": 342800 + }, + { + "epoch": 7.898130238555771, + "grad_norm": 2.3682878017425537, + "learning_rate": 1.9112796498014865e-05, + "loss": 0.5873, + "step": 343000 + }, + { + "epoch": 7.90273556231003, + "grad_norm": 2.9673991203308105, + "learning_rate": 1.910552493419235e-05, + "loss": 0.5781, + "step": 343200 + }, + { + "epoch": 7.90734088606429, + "grad_norm": 3.378129243850708, + "learning_rate": 1.9098253370369833e-05, + "loss": 0.5848, + "step": 343400 + }, + { + "epoch": 7.9119462098185505, + "grad_norm": 3.566913366317749, + "learning_rate": 1.9090981806547316e-05, + "loss": 0.5836, + "step": 343600 + }, + { + "epoch": 7.91655153357281, + "grad_norm": 3.525033712387085, + "learning_rate": 1.90837102427248e-05, + "loss": 0.594, + "step": 343800 + }, + { + "epoch": 7.92115685732707, + "grad_norm": 2.8252060413360596, + "learning_rate": 1.9076438678902284e-05, + "loss": 0.5913, + "step": 344000 + }, + { + "epoch": 7.92576218108133, + "grad_norm": 4.159696102142334, + "learning_rate": 1.9069203472898883e-05, + "loss": 0.5845, + "step": 344200 + }, + { + "epoch": 7.93036750483559, + "grad_norm": 3.0841197967529297, + "learning_rate": 1.9061968266895478e-05, + "loss": 0.5936, + "step": 344400 + }, + { + "epoch": 7.93497282858985, + "grad_norm": 2.3615498542785645, + "learning_rate": 1.9054696703072964e-05, + "loss": 0.5897, + "step": 344600 + }, + { + "epoch": 7.93957815234411, + "grad_norm": 2.4957964420318604, + "learning_rate": 1.904742513925045e-05, + "loss": 0.5839, + "step": 344800 + }, + { + "epoch": 7.944183476098369, + "grad_norm": 3.6795175075531006, + "learning_rate": 1.9040153575427932e-05, + "loss": 0.5876, + "step": 345000 + }, + { + "epoch": 7.94878879985263, + "grad_norm": 2.6692540645599365, + "learning_rate": 1.9032882011605415e-05, + "loss": 0.5847, + "step": 345200 + }, + { + "epoch": 7.9533941236068895, + "grad_norm": 3.4338459968566895, + "learning_rate": 1.90256104477829e-05, + "loss": 0.5909, + "step": 345400 + }, + { + "epoch": 7.95799944736115, + "grad_norm": 2.6297929286956787, + "learning_rate": 1.9018375241779496e-05, + "loss": 0.5925, + "step": 345600 + }, + { + "epoch": 7.9626047711154095, + "grad_norm": 3.2749218940734863, + "learning_rate": 1.9011103677956982e-05, + "loss": 0.586, + "step": 345800 + }, + { + "epoch": 7.967210094869669, + "grad_norm": 2.956953763961792, + "learning_rate": 1.9003832114134468e-05, + "loss": 0.5992, + "step": 346000 + }, + { + "epoch": 7.97181541862393, + "grad_norm": 3.2748615741729736, + "learning_rate": 1.899656055031195e-05, + "loss": 0.5948, + "step": 346200 + }, + { + "epoch": 7.976420742378189, + "grad_norm": 3.0381953716278076, + "learning_rate": 1.8989288986489436e-05, + "loss": 0.5946, + "step": 346400 + }, + { + "epoch": 7.981026066132449, + "grad_norm": 2.761564254760742, + "learning_rate": 1.8982017422666922e-05, + "loss": 0.5925, + "step": 346600 + }, + { + "epoch": 7.985631389886709, + "grad_norm": 2.955965042114258, + "learning_rate": 1.89747458588444e-05, + "loss": 0.5809, + "step": 346800 + }, + { + "epoch": 7.990236713640969, + "grad_norm": 3.134218454360962, + "learning_rate": 1.8967474295021887e-05, + "loss": 0.5939, + "step": 347000 + }, + { + "epoch": 7.9948420373952285, + "grad_norm": 2.840864896774292, + "learning_rate": 1.8960202731199373e-05, + "loss": 0.5862, + "step": 347200 + }, + { + "epoch": 7.999447361149489, + "grad_norm": 2.9396276473999023, + "learning_rate": 1.8952931167376855e-05, + "loss": 0.588, + "step": 347400 + }, + { + "epoch": 8.0, + "eval_loss": 0.5577627420425415, + "eval_runtime": 145.2006, + "eval_samples_per_second": 195.323, + "eval_steps_per_second": 12.211, + "step": 347424 + }, + { + "epoch": 8.00405268490375, + "grad_norm": 2.531428098678589, + "learning_rate": 1.894565960355434e-05, + "loss": 0.5814, + "step": 347600 + }, + { + "epoch": 8.008658008658008, + "grad_norm": 3.6581764221191406, + "learning_rate": 1.8938388039731827e-05, + "loss": 0.5853, + "step": 347800 + }, + { + "epoch": 8.013263332412269, + "grad_norm": 2.494723081588745, + "learning_rate": 1.8931116475909306e-05, + "loss": 0.5866, + "step": 348000 + }, + { + "epoch": 8.017868656166529, + "grad_norm": 2.6664531230926514, + "learning_rate": 1.8923844912086792e-05, + "loss": 0.5971, + "step": 348200 + }, + { + "epoch": 8.022473979920788, + "grad_norm": 2.837907552719116, + "learning_rate": 1.8916573348264278e-05, + "loss": 0.583, + "step": 348400 + }, + { + "epoch": 8.027079303675048, + "grad_norm": 3.521794557571411, + "learning_rate": 1.8909301784441764e-05, + "loss": 0.5916, + "step": 348600 + }, + { + "epoch": 8.031684627429309, + "grad_norm": 2.823016881942749, + "learning_rate": 1.8902030220619247e-05, + "loss": 0.5879, + "step": 348800 + }, + { + "epoch": 8.036289951183567, + "grad_norm": 2.7679924964904785, + "learning_rate": 1.8894758656796733e-05, + "loss": 0.5776, + "step": 349000 + }, + { + "epoch": 8.040895274937828, + "grad_norm": 4.573503017425537, + "learning_rate": 1.888748709297422e-05, + "loss": 0.5819, + "step": 349200 + }, + { + "epoch": 8.045500598692088, + "grad_norm": 2.8802812099456787, + "learning_rate": 1.8880215529151698e-05, + "loss": 0.5816, + "step": 349400 + }, + { + "epoch": 8.050105922446347, + "grad_norm": 2.96124005317688, + "learning_rate": 1.8872943965329184e-05, + "loss": 0.6029, + "step": 349600 + }, + { + "epoch": 8.054711246200608, + "grad_norm": 2.664660930633545, + "learning_rate": 1.886567240150667e-05, + "loss": 0.5707, + "step": 349800 + }, + { + "epoch": 8.059316569954868, + "grad_norm": 3.073604106903076, + "learning_rate": 1.8858400837684152e-05, + "loss": 0.5928, + "step": 350000 + }, + { + "epoch": 8.063921893709129, + "grad_norm": 3.5625150203704834, + "learning_rate": 1.8851129273861638e-05, + "loss": 0.5908, + "step": 350200 + }, + { + "epoch": 8.068527217463387, + "grad_norm": 2.9049417972564697, + "learning_rate": 1.8843857710039124e-05, + "loss": 0.5835, + "step": 350400 + }, + { + "epoch": 8.073132541217648, + "grad_norm": 3.092867851257324, + "learning_rate": 1.8836586146216603e-05, + "loss": 0.5788, + "step": 350600 + }, + { + "epoch": 8.077737864971908, + "grad_norm": 2.719918727874756, + "learning_rate": 1.88293509402132e-05, + "loss": 0.5823, + "step": 350800 + }, + { + "epoch": 8.082343188726167, + "grad_norm": 4.278101444244385, + "learning_rate": 1.8822115734209797e-05, + "loss": 0.5818, + "step": 351000 + }, + { + "epoch": 8.086948512480427, + "grad_norm": 2.67177677154541, + "learning_rate": 1.8814844170387283e-05, + "loss": 0.5893, + "step": 351200 + }, + { + "epoch": 8.091553836234688, + "grad_norm": 3.964667558670044, + "learning_rate": 1.880757260656477e-05, + "loss": 0.5855, + "step": 351400 + }, + { + "epoch": 8.096159159988947, + "grad_norm": 3.183964252471924, + "learning_rate": 1.880030104274225e-05, + "loss": 0.5794, + "step": 351600 + }, + { + "epoch": 8.100764483743207, + "grad_norm": 2.9570157527923584, + "learning_rate": 1.8793029478919737e-05, + "loss": 0.5979, + "step": 351800 + }, + { + "epoch": 8.105369807497468, + "grad_norm": 3.8420159816741943, + "learning_rate": 1.8785757915097223e-05, + "loss": 0.5792, + "step": 352000 + }, + { + "epoch": 8.109975131251726, + "grad_norm": 4.49518346786499, + "learning_rate": 1.8778486351274705e-05, + "loss": 0.5845, + "step": 352200 + }, + { + "epoch": 8.114580455005987, + "grad_norm": 2.9264657497406006, + "learning_rate": 1.8771214787452188e-05, + "loss": 0.5866, + "step": 352400 + }, + { + "epoch": 8.119185778760247, + "grad_norm": 2.69858455657959, + "learning_rate": 1.8763943223629674e-05, + "loss": 0.5859, + "step": 352600 + }, + { + "epoch": 8.123791102514506, + "grad_norm": 3.450650453567505, + "learning_rate": 1.875667165980716e-05, + "loss": 0.5838, + "step": 352800 + }, + { + "epoch": 8.128396426268766, + "grad_norm": 2.7137413024902344, + "learning_rate": 1.8749400095984642e-05, + "loss": 0.5824, + "step": 353000 + }, + { + "epoch": 8.133001750023027, + "grad_norm": 3.0916788578033447, + "learning_rate": 1.8742128532162128e-05, + "loss": 0.5959, + "step": 353200 + }, + { + "epoch": 8.137607073777286, + "grad_norm": 3.159905433654785, + "learning_rate": 1.8734856968339614e-05, + "loss": 0.5792, + "step": 353400 + }, + { + "epoch": 8.142212397531546, + "grad_norm": 3.4873342514038086, + "learning_rate": 1.8727585404517093e-05, + "loss": 0.5834, + "step": 353600 + }, + { + "epoch": 8.146817721285807, + "grad_norm": 3.2853617668151855, + "learning_rate": 1.872031384069458e-05, + "loss": 0.5818, + "step": 353800 + }, + { + "epoch": 8.151423045040067, + "grad_norm": 4.945338726043701, + "learning_rate": 1.8713042276872065e-05, + "loss": 0.579, + "step": 354000 + }, + { + "epoch": 8.156028368794326, + "grad_norm": 2.7136645317077637, + "learning_rate": 1.8705770713049548e-05, + "loss": 0.5743, + "step": 354200 + }, + { + "epoch": 8.160633692548586, + "grad_norm": 3.545706272125244, + "learning_rate": 1.8698535507046146e-05, + "loss": 0.5861, + "step": 354400 + }, + { + "epoch": 8.165239016302847, + "grad_norm": 2.9433093070983887, + "learning_rate": 1.8691263943223632e-05, + "loss": 0.5775, + "step": 354600 + }, + { + "epoch": 8.169844340057105, + "grad_norm": 3.4373202323913574, + "learning_rate": 1.8683992379401115e-05, + "loss": 0.589, + "step": 354800 + }, + { + "epoch": 8.174449663811366, + "grad_norm": 3.6623363494873047, + "learning_rate": 1.8676720815578597e-05, + "loss": 0.5737, + "step": 355000 + }, + { + "epoch": 8.179054987565626, + "grad_norm": 3.4433813095092773, + "learning_rate": 1.8669449251756083e-05, + "loss": 0.5852, + "step": 355200 + }, + { + "epoch": 8.183660311319885, + "grad_norm": 3.2509543895721436, + "learning_rate": 1.8662177687933566e-05, + "loss": 0.5787, + "step": 355400 + }, + { + "epoch": 8.188265635074146, + "grad_norm": 2.7837679386138916, + "learning_rate": 1.865490612411105e-05, + "loss": 0.5808, + "step": 355600 + }, + { + "epoch": 8.192870958828406, + "grad_norm": 3.2710013389587402, + "learning_rate": 1.8647634560288538e-05, + "loss": 0.5764, + "step": 355800 + }, + { + "epoch": 8.197476282582665, + "grad_norm": 3.4434151649475098, + "learning_rate": 1.864036299646602e-05, + "loss": 0.5755, + "step": 356000 + }, + { + "epoch": 8.202081606336925, + "grad_norm": 3.4400975704193115, + "learning_rate": 1.8633091432643503e-05, + "loss": 0.5874, + "step": 356200 + }, + { + "epoch": 8.206686930091186, + "grad_norm": 3.25714111328125, + "learning_rate": 1.862581986882099e-05, + "loss": 0.5868, + "step": 356400 + }, + { + "epoch": 8.211292253845444, + "grad_norm": 5.997415542602539, + "learning_rate": 1.8618548304998474e-05, + "loss": 0.5689, + "step": 356600 + }, + { + "epoch": 8.215897577599705, + "grad_norm": 3.3374671936035156, + "learning_rate": 1.861131309899507e-05, + "loss": 0.5931, + "step": 356800 + }, + { + "epoch": 8.220502901353965, + "grad_norm": 2.680866241455078, + "learning_rate": 1.8604041535172556e-05, + "loss": 0.5855, + "step": 357000 + }, + { + "epoch": 8.225108225108226, + "grad_norm": 3.1401114463806152, + "learning_rate": 1.8596769971350038e-05, + "loss": 0.5864, + "step": 357200 + }, + { + "epoch": 8.229713548862485, + "grad_norm": 2.7683680057525635, + "learning_rate": 1.8589498407527524e-05, + "loss": 0.578, + "step": 357400 + }, + { + "epoch": 8.234318872616745, + "grad_norm": 3.4666707515716553, + "learning_rate": 1.858222684370501e-05, + "loss": 0.5788, + "step": 357600 + }, + { + "epoch": 8.238924196371006, + "grad_norm": 2.9666600227355957, + "learning_rate": 1.857495527988249e-05, + "loss": 0.5948, + "step": 357800 + }, + { + "epoch": 8.243529520125264, + "grad_norm": 3.1708505153656006, + "learning_rate": 1.8567683716059975e-05, + "loss": 0.5724, + "step": 358000 + }, + { + "epoch": 8.248134843879525, + "grad_norm": 3.3768389225006104, + "learning_rate": 1.856041215223746e-05, + "loss": 0.5721, + "step": 358200 + }, + { + "epoch": 8.252740167633785, + "grad_norm": 2.644984722137451, + "learning_rate": 1.8553140588414947e-05, + "loss": 0.5743, + "step": 358400 + }, + { + "epoch": 8.257345491388044, + "grad_norm": 3.155850648880005, + "learning_rate": 1.854586902459243e-05, + "loss": 0.5792, + "step": 358600 + }, + { + "epoch": 8.261950815142304, + "grad_norm": 2.7669639587402344, + "learning_rate": 1.8538597460769915e-05, + "loss": 0.5915, + "step": 358800 + }, + { + "epoch": 8.266556138896565, + "grad_norm": 3.183368682861328, + "learning_rate": 1.8531325896947398e-05, + "loss": 0.5899, + "step": 359000 + }, + { + "epoch": 8.271161462650824, + "grad_norm": 2.635927677154541, + "learning_rate": 1.852405433312488e-05, + "loss": 0.588, + "step": 359200 + }, + { + "epoch": 8.275766786405084, + "grad_norm": 3.761402130126953, + "learning_rate": 1.8516782769302366e-05, + "loss": 0.5888, + "step": 359400 + }, + { + "epoch": 8.280372110159345, + "grad_norm": 2.46097469329834, + "learning_rate": 1.850954756329896e-05, + "loss": 0.5802, + "step": 359600 + }, + { + "epoch": 8.284977433913603, + "grad_norm": 2.9108171463012695, + "learning_rate": 1.8502275999476447e-05, + "loss": 0.5836, + "step": 359800 + }, + { + "epoch": 8.289582757667864, + "grad_norm": 3.4605486392974854, + "learning_rate": 1.8495004435653933e-05, + "loss": 0.5948, + "step": 360000 + }, + { + "epoch": 8.294188081422124, + "grad_norm": 3.1106951236724854, + "learning_rate": 1.848773287183142e-05, + "loss": 0.5734, + "step": 360200 + }, + { + "epoch": 8.298793405176385, + "grad_norm": 2.4587388038635254, + "learning_rate": 1.84804613080089e-05, + "loss": 0.5776, + "step": 360400 + }, + { + "epoch": 8.303398728930643, + "grad_norm": 2.9624781608581543, + "learning_rate": 1.8473189744186384e-05, + "loss": 0.5843, + "step": 360600 + }, + { + "epoch": 8.308004052684904, + "grad_norm": 2.9920835494995117, + "learning_rate": 1.846591818036387e-05, + "loss": 0.5915, + "step": 360800 + }, + { + "epoch": 8.312609376439164, + "grad_norm": 2.7066526412963867, + "learning_rate": 1.8458646616541353e-05, + "loss": 0.5826, + "step": 361000 + }, + { + "epoch": 8.317214700193423, + "grad_norm": 3.3276214599609375, + "learning_rate": 1.845141141053795e-05, + "loss": 0.5697, + "step": 361200 + }, + { + "epoch": 8.321820023947684, + "grad_norm": 3.875354290008545, + "learning_rate": 1.8444139846715434e-05, + "loss": 0.5921, + "step": 361400 + }, + { + "epoch": 8.326425347701944, + "grad_norm": 2.708665132522583, + "learning_rate": 1.8436904640712032e-05, + "loss": 0.5803, + "step": 361600 + }, + { + "epoch": 8.331030671456203, + "grad_norm": 3.221430540084839, + "learning_rate": 1.8429633076889518e-05, + "loss": 0.5827, + "step": 361800 + }, + { + "epoch": 8.335635995210463, + "grad_norm": 2.921086549758911, + "learning_rate": 1.8422361513067e-05, + "loss": 0.5774, + "step": 362000 + }, + { + "epoch": 8.340241318964724, + "grad_norm": 3.2136096954345703, + "learning_rate": 1.8415089949244483e-05, + "loss": 0.5872, + "step": 362200 + }, + { + "epoch": 8.344846642718982, + "grad_norm": 2.5168769359588623, + "learning_rate": 1.840781838542197e-05, + "loss": 0.5836, + "step": 362400 + }, + { + "epoch": 8.349451966473243, + "grad_norm": 2.8332345485687256, + "learning_rate": 1.8400546821599452e-05, + "loss": 0.5748, + "step": 362600 + }, + { + "epoch": 8.354057290227503, + "grad_norm": 2.76617431640625, + "learning_rate": 1.8393275257776938e-05, + "loss": 0.5802, + "step": 362800 + }, + { + "epoch": 8.358662613981762, + "grad_norm": 3.170368194580078, + "learning_rate": 1.8386003693954424e-05, + "loss": 0.5819, + "step": 363000 + }, + { + "epoch": 8.363267937736023, + "grad_norm": 3.0185418128967285, + "learning_rate": 1.8378732130131906e-05, + "loss": 0.5835, + "step": 363200 + }, + { + "epoch": 8.367873261490283, + "grad_norm": 3.179088830947876, + "learning_rate": 1.837146056630939e-05, + "loss": 0.5827, + "step": 363400 + }, + { + "epoch": 8.372478585244544, + "grad_norm": 2.731213331222534, + "learning_rate": 1.8364189002486875e-05, + "loss": 0.5866, + "step": 363600 + }, + { + "epoch": 8.377083908998802, + "grad_norm": 2.9844472408294678, + "learning_rate": 1.835691743866436e-05, + "loss": 0.5795, + "step": 363800 + }, + { + "epoch": 8.381689232753063, + "grad_norm": 3.1267082691192627, + "learning_rate": 1.8349645874841843e-05, + "loss": 0.5794, + "step": 364000 + }, + { + "epoch": 8.386294556507323, + "grad_norm": 2.6979808807373047, + "learning_rate": 1.834237431101933e-05, + "loss": 0.5956, + "step": 364200 + }, + { + "epoch": 8.390899880261582, + "grad_norm": 2.9630825519561768, + "learning_rate": 1.8335102747196815e-05, + "loss": 0.5796, + "step": 364400 + }, + { + "epoch": 8.395505204015842, + "grad_norm": 3.268383741378784, + "learning_rate": 1.8327831183374297e-05, + "loss": 0.5943, + "step": 364600 + }, + { + "epoch": 8.400110527770103, + "grad_norm": 2.594475507736206, + "learning_rate": 1.832055961955178e-05, + "loss": 0.5741, + "step": 364800 + }, + { + "epoch": 8.404715851524362, + "grad_norm": 2.8668203353881836, + "learning_rate": 1.8313288055729266e-05, + "loss": 0.5843, + "step": 365000 + }, + { + "epoch": 8.409321175278622, + "grad_norm": 2.9619016647338867, + "learning_rate": 1.8306016491906748e-05, + "loss": 0.5845, + "step": 365200 + }, + { + "epoch": 8.413926499032883, + "grad_norm": 3.0649566650390625, + "learning_rate": 1.8298744928084234e-05, + "loss": 0.6053, + "step": 365400 + }, + { + "epoch": 8.418531822787141, + "grad_norm": 3.129210948944092, + "learning_rate": 1.829147336426172e-05, + "loss": 0.5795, + "step": 365600 + }, + { + "epoch": 8.423137146541402, + "grad_norm": 3.2703466415405273, + "learning_rate": 1.8284201800439203e-05, + "loss": 0.5817, + "step": 365800 + }, + { + "epoch": 8.427742470295662, + "grad_norm": 3.3453192710876465, + "learning_rate": 1.8276930236616685e-05, + "loss": 0.5837, + "step": 366000 + }, + { + "epoch": 8.432347794049921, + "grad_norm": 3.5036461353302, + "learning_rate": 1.826965867279417e-05, + "loss": 0.588, + "step": 366200 + }, + { + "epoch": 8.436953117804181, + "grad_norm": 3.5221807956695557, + "learning_rate": 1.8262387108971657e-05, + "loss": 0.5866, + "step": 366400 + }, + { + "epoch": 8.441558441558442, + "grad_norm": 3.2333667278289795, + "learning_rate": 1.825511554514914e-05, + "loss": 0.5848, + "step": 366600 + }, + { + "epoch": 8.4461637653127, + "grad_norm": 3.349844455718994, + "learning_rate": 1.8247843981326625e-05, + "loss": 0.5864, + "step": 366800 + }, + { + "epoch": 8.450769089066961, + "grad_norm": 3.0328314304351807, + "learning_rate": 1.824057241750411e-05, + "loss": 0.5713, + "step": 367000 + }, + { + "epoch": 8.455374412821222, + "grad_norm": 3.0258967876434326, + "learning_rate": 1.8233300853681594e-05, + "loss": 0.5747, + "step": 367200 + }, + { + "epoch": 8.459979736575482, + "grad_norm": 2.960977792739868, + "learning_rate": 1.8226029289859076e-05, + "loss": 0.5685, + "step": 367400 + }, + { + "epoch": 8.46458506032974, + "grad_norm": 3.144864797592163, + "learning_rate": 1.8218757726036562e-05, + "loss": 0.5863, + "step": 367600 + }, + { + "epoch": 8.469190384084001, + "grad_norm": 2.93837833404541, + "learning_rate": 1.8211486162214045e-05, + "loss": 0.5778, + "step": 367800 + }, + { + "epoch": 8.473795707838262, + "grad_norm": 3.0120725631713867, + "learning_rate": 1.8204250956210643e-05, + "loss": 0.5898, + "step": 368000 + }, + { + "epoch": 8.47840103159252, + "grad_norm": 2.907320976257324, + "learning_rate": 1.819697939238813e-05, + "loss": 0.58, + "step": 368200 + }, + { + "epoch": 8.483006355346781, + "grad_norm": 2.7009003162384033, + "learning_rate": 1.8189744186384725e-05, + "loss": 0.5916, + "step": 368400 + }, + { + "epoch": 8.487611679101041, + "grad_norm": 2.902514696121216, + "learning_rate": 1.818247262256221e-05, + "loss": 0.5683, + "step": 368600 + }, + { + "epoch": 8.4922170028553, + "grad_norm": 3.5447628498077393, + "learning_rate": 1.8175201058739693e-05, + "loss": 0.5765, + "step": 368800 + }, + { + "epoch": 8.49682232660956, + "grad_norm": 3.0981380939483643, + "learning_rate": 1.8167929494917176e-05, + "loss": 0.5716, + "step": 369000 + }, + { + "epoch": 8.501427650363821, + "grad_norm": 2.814976453781128, + "learning_rate": 1.816065793109466e-05, + "loss": 0.5864, + "step": 369200 + }, + { + "epoch": 8.50603297411808, + "grad_norm": 3.170222043991089, + "learning_rate": 1.8153386367272144e-05, + "loss": 0.5754, + "step": 369400 + }, + { + "epoch": 8.51063829787234, + "grad_norm": 2.6524722576141357, + "learning_rate": 1.814611480344963e-05, + "loss": 0.5859, + "step": 369600 + }, + { + "epoch": 8.5152436216266, + "grad_norm": 3.715766191482544, + "learning_rate": 1.8138843239627116e-05, + "loss": 0.5889, + "step": 369800 + }, + { + "epoch": 8.51984894538086, + "grad_norm": 3.317150592803955, + "learning_rate": 1.8131571675804602e-05, + "loss": 0.5823, + "step": 370000 + }, + { + "epoch": 8.52445426913512, + "grad_norm": 3.7694554328918457, + "learning_rate": 1.812430011198208e-05, + "loss": 0.5776, + "step": 370200 + }, + { + "epoch": 8.52905959288938, + "grad_norm": 3.328153371810913, + "learning_rate": 1.8117028548159567e-05, + "loss": 0.5895, + "step": 370400 + }, + { + "epoch": 8.533664916643641, + "grad_norm": 3.546926498413086, + "learning_rate": 1.8109756984337053e-05, + "loss": 0.5798, + "step": 370600 + }, + { + "epoch": 8.5382702403979, + "grad_norm": 3.3275558948516846, + "learning_rate": 1.8102485420514535e-05, + "loss": 0.5787, + "step": 370800 + }, + { + "epoch": 8.54287556415216, + "grad_norm": 2.915025234222412, + "learning_rate": 1.809521385669202e-05, + "loss": 0.5742, + "step": 371000 + }, + { + "epoch": 8.54748088790642, + "grad_norm": 2.634218454360962, + "learning_rate": 1.8087942292869507e-05, + "loss": 0.5869, + "step": 371200 + }, + { + "epoch": 8.55208621166068, + "grad_norm": 3.156991481781006, + "learning_rate": 1.808067072904699e-05, + "loss": 0.582, + "step": 371400 + }, + { + "epoch": 8.55669153541494, + "grad_norm": 3.4311141967773438, + "learning_rate": 1.8073399165224472e-05, + "loss": 0.5955, + "step": 371600 + }, + { + "epoch": 8.5612968591692, + "grad_norm": 2.9205784797668457, + "learning_rate": 1.8066127601401958e-05, + "loss": 0.5826, + "step": 371800 + }, + { + "epoch": 8.565902182923459, + "grad_norm": 3.045046329498291, + "learning_rate": 1.805885603757944e-05, + "loss": 0.5948, + "step": 372000 + }, + { + "epoch": 8.57050750667772, + "grad_norm": 3.506364583969116, + "learning_rate": 1.8051584473756926e-05, + "loss": 0.5762, + "step": 372200 + }, + { + "epoch": 8.57511283043198, + "grad_norm": 3.3249881267547607, + "learning_rate": 1.8044312909934412e-05, + "loss": 0.5784, + "step": 372400 + }, + { + "epoch": 8.579718154186239, + "grad_norm": 3.3042402267456055, + "learning_rate": 1.8037041346111898e-05, + "loss": 0.5907, + "step": 372600 + }, + { + "epoch": 8.584323477940499, + "grad_norm": 2.80226469039917, + "learning_rate": 1.8029769782289377e-05, + "loss": 0.5917, + "step": 372800 + }, + { + "epoch": 8.58892880169476, + "grad_norm": 3.677151679992676, + "learning_rate": 1.8022498218466863e-05, + "loss": 0.5947, + "step": 373000 + }, + { + "epoch": 8.593534125449018, + "grad_norm": 2.8977012634277344, + "learning_rate": 1.801522665464435e-05, + "loss": 0.5859, + "step": 373200 + }, + { + "epoch": 8.598139449203279, + "grad_norm": 3.019442558288574, + "learning_rate": 1.8007955090821832e-05, + "loss": 0.5862, + "step": 373400 + }, + { + "epoch": 8.60274477295754, + "grad_norm": 3.2348787784576416, + "learning_rate": 1.8000683526999318e-05, + "loss": 0.5909, + "step": 373600 + }, + { + "epoch": 8.607350096711798, + "grad_norm": 3.2550971508026123, + "learning_rate": 1.7993411963176804e-05, + "loss": 0.5851, + "step": 373800 + }, + { + "epoch": 8.611955420466058, + "grad_norm": 2.6115951538085938, + "learning_rate": 1.7986140399354286e-05, + "loss": 0.5935, + "step": 374000 + }, + { + "epoch": 8.616560744220319, + "grad_norm": 3.2661094665527344, + "learning_rate": 1.797886883553177e-05, + "loss": 0.581, + "step": 374200 + }, + { + "epoch": 8.62116606797458, + "grad_norm": 3.3261373043060303, + "learning_rate": 1.7971597271709255e-05, + "loss": 0.5674, + "step": 374400 + }, + { + "epoch": 8.625771391728838, + "grad_norm": 3.81058669090271, + "learning_rate": 1.7964325707886737e-05, + "loss": 0.5717, + "step": 374600 + }, + { + "epoch": 8.630376715483099, + "grad_norm": 3.5395662784576416, + "learning_rate": 1.7957090501883336e-05, + "loss": 0.5872, + "step": 374800 + }, + { + "epoch": 8.634982039237359, + "grad_norm": 3.262457847595215, + "learning_rate": 1.794981893806082e-05, + "loss": 0.5776, + "step": 375000 + }, + { + "epoch": 8.639587362991618, + "grad_norm": 3.360276460647583, + "learning_rate": 1.7942547374238304e-05, + "loss": 0.5762, + "step": 375200 + }, + { + "epoch": 8.644192686745878, + "grad_norm": 3.0414352416992188, + "learning_rate": 1.793527581041579e-05, + "loss": 0.5866, + "step": 375400 + }, + { + "epoch": 8.648798010500139, + "grad_norm": 3.696171283721924, + "learning_rate": 1.7928004246593273e-05, + "loss": 0.5764, + "step": 375600 + }, + { + "epoch": 8.653403334254397, + "grad_norm": 3.294072389602661, + "learning_rate": 1.7920732682770755e-05, + "loss": 0.5772, + "step": 375800 + }, + { + "epoch": 8.658008658008658, + "grad_norm": 3.592628240585327, + "learning_rate": 1.791346111894824e-05, + "loss": 0.5712, + "step": 376000 + }, + { + "epoch": 8.662613981762918, + "grad_norm": 2.2415127754211426, + "learning_rate": 1.7906189555125727e-05, + "loss": 0.5692, + "step": 376200 + }, + { + "epoch": 8.667219305517177, + "grad_norm": 3.072105646133423, + "learning_rate": 1.7898954349122322e-05, + "loss": 0.5747, + "step": 376400 + }, + { + "epoch": 8.671824629271438, + "grad_norm": 3.045968770980835, + "learning_rate": 1.7891682785299808e-05, + "loss": 0.5738, + "step": 376600 + }, + { + "epoch": 8.676429953025698, + "grad_norm": 3.102426052093506, + "learning_rate": 1.7884411221477294e-05, + "loss": 0.5619, + "step": 376800 + }, + { + "epoch": 8.681035276779959, + "grad_norm": 2.7489142417907715, + "learning_rate": 1.7877139657654773e-05, + "loss": 0.5673, + "step": 377000 + }, + { + "epoch": 8.685640600534217, + "grad_norm": 2.9566493034362793, + "learning_rate": 1.786986809383226e-05, + "loss": 0.5868, + "step": 377200 + }, + { + "epoch": 8.690245924288478, + "grad_norm": 3.6835222244262695, + "learning_rate": 1.7862596530009745e-05, + "loss": 0.5878, + "step": 377400 + }, + { + "epoch": 8.694851248042738, + "grad_norm": 3.0927047729492188, + "learning_rate": 1.7855324966187227e-05, + "loss": 0.59, + "step": 377600 + }, + { + "epoch": 8.699456571796997, + "grad_norm": 3.5249454975128174, + "learning_rate": 1.7848053402364713e-05, + "loss": 0.5731, + "step": 377800 + }, + { + "epoch": 8.704061895551257, + "grad_norm": 2.711740732192993, + "learning_rate": 1.78407818385422e-05, + "loss": 0.587, + "step": 378000 + }, + { + "epoch": 8.708667219305518, + "grad_norm": 3.440530776977539, + "learning_rate": 1.7833510274719682e-05, + "loss": 0.582, + "step": 378200 + }, + { + "epoch": 8.713272543059777, + "grad_norm": 3.404754400253296, + "learning_rate": 1.7826238710897164e-05, + "loss": 0.585, + "step": 378400 + }, + { + "epoch": 8.717877866814037, + "grad_norm": 3.556629180908203, + "learning_rate": 1.781896714707465e-05, + "loss": 0.5731, + "step": 378600 + }, + { + "epoch": 8.722483190568298, + "grad_norm": 2.7652220726013184, + "learning_rate": 1.7811695583252133e-05, + "loss": 0.576, + "step": 378800 + }, + { + "epoch": 8.727088514322556, + "grad_norm": 3.707284450531006, + "learning_rate": 1.780442401942962e-05, + "loss": 0.5766, + "step": 379000 + }, + { + "epoch": 8.731693838076817, + "grad_norm": 2.5321168899536133, + "learning_rate": 1.7797152455607105e-05, + "loss": 0.5838, + "step": 379200 + }, + { + "epoch": 8.736299161831077, + "grad_norm": 2.458303213119507, + "learning_rate": 1.778988089178459e-05, + "loss": 0.5953, + "step": 379400 + }, + { + "epoch": 8.740904485585336, + "grad_norm": 3.712862253189087, + "learning_rate": 1.778260932796207e-05, + "loss": 0.5852, + "step": 379600 + }, + { + "epoch": 8.745509809339596, + "grad_norm": 3.2254207134246826, + "learning_rate": 1.7775337764139556e-05, + "loss": 0.5876, + "step": 379800 + }, + { + "epoch": 8.750115133093857, + "grad_norm": 2.612973690032959, + "learning_rate": 1.776806620031704e-05, + "loss": 0.5762, + "step": 380000 + }, + { + "epoch": 8.754720456848116, + "grad_norm": 3.134018898010254, + "learning_rate": 1.7760794636494524e-05, + "loss": 0.573, + "step": 380200 + }, + { + "epoch": 8.759325780602376, + "grad_norm": 3.392620801925659, + "learning_rate": 1.7753559430491123e-05, + "loss": 0.5713, + "step": 380400 + }, + { + "epoch": 8.763931104356637, + "grad_norm": 2.867246627807617, + "learning_rate": 1.774628786666861e-05, + "loss": 0.5774, + "step": 380600 + }, + { + "epoch": 8.768536428110895, + "grad_norm": 2.381554126739502, + "learning_rate": 1.7739052660665204e-05, + "loss": 0.5897, + "step": 380800 + }, + { + "epoch": 8.773141751865156, + "grad_norm": 2.7664806842803955, + "learning_rate": 1.773178109684269e-05, + "loss": 0.5864, + "step": 381000 + }, + { + "epoch": 8.777747075619416, + "grad_norm": 3.2918121814727783, + "learning_rate": 1.7724509533020172e-05, + "loss": 0.5768, + "step": 381200 + }, + { + "epoch": 8.782352399373677, + "grad_norm": 3.4607903957366943, + "learning_rate": 1.7717237969197655e-05, + "loss": 0.5853, + "step": 381400 + }, + { + "epoch": 8.786957723127935, + "grad_norm": 3.931750535964966, + "learning_rate": 1.770996640537514e-05, + "loss": 0.5778, + "step": 381600 + }, + { + "epoch": 8.791563046882196, + "grad_norm": 2.7144854068756104, + "learning_rate": 1.7702694841552623e-05, + "loss": 0.5793, + "step": 381800 + }, + { + "epoch": 8.796168370636456, + "grad_norm": 3.4606308937072754, + "learning_rate": 1.769542327773011e-05, + "loss": 0.5813, + "step": 382000 + }, + { + "epoch": 8.800773694390715, + "grad_norm": 2.6973564624786377, + "learning_rate": 1.7688151713907595e-05, + "loss": 0.587, + "step": 382200 + }, + { + "epoch": 8.805379018144976, + "grad_norm": 2.9456260204315186, + "learning_rate": 1.768088015008508e-05, + "loss": 0.5725, + "step": 382400 + }, + { + "epoch": 8.809984341899236, + "grad_norm": 3.2708475589752197, + "learning_rate": 1.767360858626256e-05, + "loss": 0.5781, + "step": 382600 + }, + { + "epoch": 8.814589665653495, + "grad_norm": 3.055715799331665, + "learning_rate": 1.7666337022440046e-05, + "loss": 0.5829, + "step": 382800 + }, + { + "epoch": 8.819194989407755, + "grad_norm": 2.2745134830474854, + "learning_rate": 1.7659065458617532e-05, + "loss": 0.5836, + "step": 383000 + }, + { + "epoch": 8.823800313162016, + "grad_norm": 2.850818157196045, + "learning_rate": 1.7651793894795014e-05, + "loss": 0.5843, + "step": 383200 + }, + { + "epoch": 8.828405636916274, + "grad_norm": 3.06870436668396, + "learning_rate": 1.76445223309725e-05, + "loss": 0.5718, + "step": 383400 + }, + { + "epoch": 8.833010960670535, + "grad_norm": 3.3040614128112793, + "learning_rate": 1.7637250767149986e-05, + "loss": 0.5822, + "step": 383600 + }, + { + "epoch": 8.837616284424795, + "grad_norm": 2.4403414726257324, + "learning_rate": 1.7629979203327465e-05, + "loss": 0.5899, + "step": 383800 + }, + { + "epoch": 8.842221608179056, + "grad_norm": 2.8603031635284424, + "learning_rate": 1.762270763950495e-05, + "loss": 0.5811, + "step": 384000 + }, + { + "epoch": 8.846826931933315, + "grad_norm": 2.7087676525115967, + "learning_rate": 1.7615436075682437e-05, + "loss": 0.5904, + "step": 384200 + }, + { + "epoch": 8.851432255687575, + "grad_norm": 2.9180452823638916, + "learning_rate": 1.760816451185992e-05, + "loss": 0.5789, + "step": 384400 + }, + { + "epoch": 8.856037579441836, + "grad_norm": 3.6480114459991455, + "learning_rate": 1.7600892948037406e-05, + "loss": 0.5775, + "step": 384600 + }, + { + "epoch": 8.860642903196094, + "grad_norm": 3.7235474586486816, + "learning_rate": 1.7593657742034004e-05, + "loss": 0.5756, + "step": 384800 + }, + { + "epoch": 8.865248226950355, + "grad_norm": 2.3067126274108887, + "learning_rate": 1.7586386178211487e-05, + "loss": 0.5848, + "step": 385000 + }, + { + "epoch": 8.869853550704615, + "grad_norm": 3.3306350708007812, + "learning_rate": 1.7579114614388973e-05, + "loss": 0.5807, + "step": 385200 + }, + { + "epoch": 8.874458874458874, + "grad_norm": 3.0547428131103516, + "learning_rate": 1.7571879408385568e-05, + "loss": 0.5596, + "step": 385400 + }, + { + "epoch": 8.879064198213134, + "grad_norm": 3.389777660369873, + "learning_rate": 1.756460784456305e-05, + "loss": 0.5688, + "step": 385600 + }, + { + "epoch": 8.883669521967395, + "grad_norm": 3.0648694038391113, + "learning_rate": 1.7557336280740536e-05, + "loss": 0.5934, + "step": 385800 + }, + { + "epoch": 8.888274845721654, + "grad_norm": 2.877584457397461, + "learning_rate": 1.7550064716918022e-05, + "loss": 0.5749, + "step": 386000 + }, + { + "epoch": 8.892880169475914, + "grad_norm": 3.3926568031311035, + "learning_rate": 1.7542793153095505e-05, + "loss": 0.5716, + "step": 386200 + }, + { + "epoch": 8.897485493230175, + "grad_norm": 2.8832592964172363, + "learning_rate": 1.753552158927299e-05, + "loss": 0.577, + "step": 386400 + }, + { + "epoch": 8.902090816984433, + "grad_norm": 3.812521457672119, + "learning_rate": 1.7528250025450477e-05, + "loss": 0.5931, + "step": 386600 + }, + { + "epoch": 8.906696140738694, + "grad_norm": 3.376232624053955, + "learning_rate": 1.7520978461627956e-05, + "loss": 0.5758, + "step": 386800 + }, + { + "epoch": 8.911301464492954, + "grad_norm": 3.098276138305664, + "learning_rate": 1.751370689780544e-05, + "loss": 0.579, + "step": 387000 + }, + { + "epoch": 8.915906788247213, + "grad_norm": 3.7928085327148438, + "learning_rate": 1.7506435333982927e-05, + "loss": 0.5814, + "step": 387200 + }, + { + "epoch": 8.920512112001473, + "grad_norm": 2.6792008876800537, + "learning_rate": 1.749916377016041e-05, + "loss": 0.5924, + "step": 387400 + }, + { + "epoch": 8.925117435755734, + "grad_norm": 3.9660255908966064, + "learning_rate": 1.7491892206337896e-05, + "loss": 0.5773, + "step": 387600 + }, + { + "epoch": 8.929722759509993, + "grad_norm": 3.670153856277466, + "learning_rate": 1.7484657000334495e-05, + "loss": 0.5863, + "step": 387800 + }, + { + "epoch": 8.934328083264253, + "grad_norm": 2.9507553577423096, + "learning_rate": 1.7477385436511977e-05, + "loss": 0.5849, + "step": 388000 + }, + { + "epoch": 8.938933407018514, + "grad_norm": 3.5913052558898926, + "learning_rate": 1.7470113872689463e-05, + "loss": 0.5853, + "step": 388200 + }, + { + "epoch": 8.943538730772774, + "grad_norm": 3.083280086517334, + "learning_rate": 1.7462842308866945e-05, + "loss": 0.5752, + "step": 388400 + }, + { + "epoch": 8.948144054527033, + "grad_norm": 3.377192497253418, + "learning_rate": 1.7455570745044428e-05, + "loss": 0.5705, + "step": 388600 + }, + { + "epoch": 8.952749378281293, + "grad_norm": 3.040677785873413, + "learning_rate": 1.7448299181221914e-05, + "loss": 0.5838, + "step": 388800 + }, + { + "epoch": 8.957354702035554, + "grad_norm": 2.774031400680542, + "learning_rate": 1.74410276173994e-05, + "loss": 0.5814, + "step": 389000 + }, + { + "epoch": 8.961960025789812, + "grad_norm": 3.28867244720459, + "learning_rate": 1.7433756053576882e-05, + "loss": 0.5915, + "step": 389200 + }, + { + "epoch": 8.966565349544073, + "grad_norm": 2.8730006217956543, + "learning_rate": 1.7426484489754368e-05, + "loss": 0.579, + "step": 389400 + }, + { + "epoch": 8.971170673298333, + "grad_norm": 3.185356855392456, + "learning_rate": 1.741921292593185e-05, + "loss": 0.5694, + "step": 389600 + }, + { + "epoch": 8.975775997052592, + "grad_norm": 3.238297700881958, + "learning_rate": 1.7411941362109333e-05, + "loss": 0.5869, + "step": 389800 + }, + { + "epoch": 8.980381320806853, + "grad_norm": 2.7840898036956787, + "learning_rate": 1.740466979828682e-05, + "loss": 0.5765, + "step": 390000 + }, + { + "epoch": 8.984986644561113, + "grad_norm": 3.177186965942383, + "learning_rate": 1.7397398234464305e-05, + "loss": 0.587, + "step": 390200 + }, + { + "epoch": 8.989591968315372, + "grad_norm": 2.8901917934417725, + "learning_rate": 1.739012667064179e-05, + "loss": 0.5862, + "step": 390400 + }, + { + "epoch": 8.994197292069632, + "grad_norm": 2.768673896789551, + "learning_rate": 1.7382855106819274e-05, + "loss": 0.584, + "step": 390600 + }, + { + "epoch": 8.998802615823893, + "grad_norm": 2.805500030517578, + "learning_rate": 1.7375583542996756e-05, + "loss": 0.5783, + "step": 390800 + }, + { + "epoch": 9.0, + "eval_loss": 0.5523322224617004, + "eval_runtime": 146.0681, + "eval_samples_per_second": 194.163, + "eval_steps_per_second": 12.138, + "step": 390852 + }, + { + "epoch": 9.003407939578153, + "grad_norm": 4.481179714202881, + "learning_rate": 1.736834833699335e-05, + "loss": 0.5692, + "step": 391000 + }, + { + "epoch": 9.008013263332412, + "grad_norm": 3.07963228225708, + "learning_rate": 1.7361076773170837e-05, + "loss": 0.5692, + "step": 391200 + }, + { + "epoch": 9.012618587086672, + "grad_norm": 3.5437726974487305, + "learning_rate": 1.7353805209348323e-05, + "loss": 0.5569, + "step": 391400 + }, + { + "epoch": 9.017223910840933, + "grad_norm": 4.162001609802246, + "learning_rate": 1.7346533645525806e-05, + "loss": 0.5681, + "step": 391600 + }, + { + "epoch": 9.021829234595192, + "grad_norm": 3.081472873687744, + "learning_rate": 1.733926208170329e-05, + "loss": 0.5798, + "step": 391800 + }, + { + "epoch": 9.026434558349452, + "grad_norm": 2.833381175994873, + "learning_rate": 1.7331990517880778e-05, + "loss": 0.5837, + "step": 392000 + }, + { + "epoch": 9.031039882103713, + "grad_norm": 3.1221764087677, + "learning_rate": 1.7324718954058263e-05, + "loss": 0.5657, + "step": 392200 + }, + { + "epoch": 9.035645205857971, + "grad_norm": 3.7719027996063232, + "learning_rate": 1.7317447390235743e-05, + "loss": 0.5752, + "step": 392400 + }, + { + "epoch": 9.040250529612232, + "grad_norm": 3.361680507659912, + "learning_rate": 1.731017582641323e-05, + "loss": 0.5688, + "step": 392600 + }, + { + "epoch": 9.044855853366492, + "grad_norm": 3.1694486141204834, + "learning_rate": 1.7302904262590714e-05, + "loss": 0.5802, + "step": 392800 + }, + { + "epoch": 9.049461177120751, + "grad_norm": 3.455324649810791, + "learning_rate": 1.7295632698768197e-05, + "loss": 0.5688, + "step": 393000 + }, + { + "epoch": 9.054066500875011, + "grad_norm": 3.718388795852661, + "learning_rate": 1.7288361134945683e-05, + "loss": 0.5823, + "step": 393200 + }, + { + "epoch": 9.058671824629272, + "grad_norm": 3.018453359603882, + "learning_rate": 1.728108957112317e-05, + "loss": 0.5785, + "step": 393400 + }, + { + "epoch": 9.06327714838353, + "grad_norm": 2.9057810306549072, + "learning_rate": 1.7273818007300648e-05, + "loss": 0.5837, + "step": 393600 + }, + { + "epoch": 9.067882472137791, + "grad_norm": 3.081549644470215, + "learning_rate": 1.7266546443478134e-05, + "loss": 0.5735, + "step": 393800 + }, + { + "epoch": 9.072487795892052, + "grad_norm": 3.056722640991211, + "learning_rate": 1.725927487965562e-05, + "loss": 0.5877, + "step": 394000 + }, + { + "epoch": 9.07709311964631, + "grad_norm": 2.4788143634796143, + "learning_rate": 1.7252003315833102e-05, + "loss": 0.562, + "step": 394200 + }, + { + "epoch": 9.08169844340057, + "grad_norm": 2.9332191944122314, + "learning_rate": 1.7244731752010588e-05, + "loss": 0.575, + "step": 394400 + }, + { + "epoch": 9.086303767154831, + "grad_norm": 2.4606101512908936, + "learning_rate": 1.7237460188188074e-05, + "loss": 0.579, + "step": 394600 + }, + { + "epoch": 9.090909090909092, + "grad_norm": 2.8705406188964844, + "learning_rate": 1.7230188624365553e-05, + "loss": 0.5667, + "step": 394800 + }, + { + "epoch": 9.09551441466335, + "grad_norm": 3.100910186767578, + "learning_rate": 1.722291706054304e-05, + "loss": 0.5774, + "step": 395000 + }, + { + "epoch": 9.100119738417611, + "grad_norm": 3.324007034301758, + "learning_rate": 1.7215645496720525e-05, + "loss": 0.579, + "step": 395200 + }, + { + "epoch": 9.104725062171871, + "grad_norm": 3.0006039142608643, + "learning_rate": 1.720837393289801e-05, + "loss": 0.5762, + "step": 395400 + }, + { + "epoch": 9.10933038592613, + "grad_norm": 3.3208751678466797, + "learning_rate": 1.7201102369075493e-05, + "loss": 0.5756, + "step": 395600 + }, + { + "epoch": 9.11393570968039, + "grad_norm": 2.7627346515655518, + "learning_rate": 1.7193867163072092e-05, + "loss": 0.5721, + "step": 395800 + }, + { + "epoch": 9.118541033434651, + "grad_norm": 2.2746336460113525, + "learning_rate": 1.7186595599249575e-05, + "loss": 0.5729, + "step": 396000 + }, + { + "epoch": 9.12314635718891, + "grad_norm": 3.013868808746338, + "learning_rate": 1.717932403542706e-05, + "loss": 0.5684, + "step": 396200 + }, + { + "epoch": 9.12775168094317, + "grad_norm": 3.658040761947632, + "learning_rate": 1.7172052471604543e-05, + "loss": 0.5754, + "step": 396400 + }, + { + "epoch": 9.13235700469743, + "grad_norm": 3.7302823066711426, + "learning_rate": 1.7164780907782026e-05, + "loss": 0.5768, + "step": 396600 + }, + { + "epoch": 9.13696232845169, + "grad_norm": 3.326700448989868, + "learning_rate": 1.715750934395951e-05, + "loss": 0.5704, + "step": 396800 + }, + { + "epoch": 9.14156765220595, + "grad_norm": 2.6242971420288086, + "learning_rate": 1.7150237780136997e-05, + "loss": 0.5749, + "step": 397000 + }, + { + "epoch": 9.14617297596021, + "grad_norm": 3.557640552520752, + "learning_rate": 1.7142966216314483e-05, + "loss": 0.5707, + "step": 397200 + }, + { + "epoch": 9.150778299714469, + "grad_norm": 4.010790824890137, + "learning_rate": 1.7135694652491966e-05, + "loss": 0.5678, + "step": 397400 + }, + { + "epoch": 9.15538362346873, + "grad_norm": 2.8452792167663574, + "learning_rate": 1.7128459446488564e-05, + "loss": 0.5773, + "step": 397600 + }, + { + "epoch": 9.15998894722299, + "grad_norm": 3.276362895965576, + "learning_rate": 1.7121187882666044e-05, + "loss": 0.5768, + "step": 397800 + }, + { + "epoch": 9.16459427097725, + "grad_norm": 2.479546546936035, + "learning_rate": 1.711391631884353e-05, + "loss": 0.5864, + "step": 398000 + }, + { + "epoch": 9.16919959473151, + "grad_norm": 2.4583699703216553, + "learning_rate": 1.7106644755021015e-05, + "loss": 0.5769, + "step": 398200 + }, + { + "epoch": 9.17380491848577, + "grad_norm": 3.225541830062866, + "learning_rate": 1.70993731911985e-05, + "loss": 0.5712, + "step": 398400 + }, + { + "epoch": 9.17841024224003, + "grad_norm": 2.6798887252807617, + "learning_rate": 1.7092101627375984e-05, + "loss": 0.5776, + "step": 398600 + }, + { + "epoch": 9.183015565994289, + "grad_norm": 4.995772838592529, + "learning_rate": 1.708483006355347e-05, + "loss": 0.5714, + "step": 398800 + }, + { + "epoch": 9.18762088974855, + "grad_norm": 3.2529380321502686, + "learning_rate": 1.7077558499730956e-05, + "loss": 0.5678, + "step": 399000 + }, + { + "epoch": 9.19222621350281, + "grad_norm": 2.858065366744995, + "learning_rate": 1.7070286935908435e-05, + "loss": 0.5693, + "step": 399200 + }, + { + "epoch": 9.196831537257069, + "grad_norm": 2.9561386108398438, + "learning_rate": 1.706301537208592e-05, + "loss": 0.5748, + "step": 399400 + }, + { + "epoch": 9.201436861011329, + "grad_norm": 3.378155469894409, + "learning_rate": 1.7055743808263407e-05, + "loss": 0.5608, + "step": 399600 + }, + { + "epoch": 9.20604218476559, + "grad_norm": 3.1857070922851562, + "learning_rate": 1.704847224444089e-05, + "loss": 0.5818, + "step": 399800 + }, + { + "epoch": 9.210647508519848, + "grad_norm": 4.287872791290283, + "learning_rate": 1.7041200680618375e-05, + "loss": 0.5751, + "step": 400000 + }, + { + "epoch": 9.215252832274109, + "grad_norm": 3.0986175537109375, + "learning_rate": 1.703392911679586e-05, + "loss": 0.5718, + "step": 400200 + }, + { + "epoch": 9.21985815602837, + "grad_norm": 3.245264768600464, + "learning_rate": 1.702665755297334e-05, + "loss": 0.5748, + "step": 400400 + }, + { + "epoch": 9.224463479782628, + "grad_norm": 3.5556859970092773, + "learning_rate": 1.7019385989150826e-05, + "loss": 0.5736, + "step": 400600 + }, + { + "epoch": 9.229068803536888, + "grad_norm": 3.7929844856262207, + "learning_rate": 1.7012114425328312e-05, + "loss": 0.5812, + "step": 400800 + }, + { + "epoch": 9.233674127291149, + "grad_norm": 3.4577155113220215, + "learning_rate": 1.7004879219324907e-05, + "loss": 0.5725, + "step": 401000 + }, + { + "epoch": 9.238279451045408, + "grad_norm": 3.2667465209960938, + "learning_rate": 1.6997607655502393e-05, + "loss": 0.5845, + "step": 401200 + }, + { + "epoch": 9.242884774799668, + "grad_norm": 3.0420265197753906, + "learning_rate": 1.699033609167988e-05, + "loss": 0.5881, + "step": 401400 + }, + { + "epoch": 9.247490098553929, + "grad_norm": 2.9382212162017822, + "learning_rate": 1.698306452785736e-05, + "loss": 0.5904, + "step": 401600 + }, + { + "epoch": 9.252095422308189, + "grad_norm": 3.358121395111084, + "learning_rate": 1.6975792964034844e-05, + "loss": 0.5678, + "step": 401800 + }, + { + "epoch": 9.256700746062448, + "grad_norm": 6.984818458557129, + "learning_rate": 1.696852140021233e-05, + "loss": 0.5678, + "step": 402000 + }, + { + "epoch": 9.261306069816708, + "grad_norm": 2.786919355392456, + "learning_rate": 1.6961249836389812e-05, + "loss": 0.5801, + "step": 402200 + }, + { + "epoch": 9.265911393570969, + "grad_norm": 2.3352646827697754, + "learning_rate": 1.69539782725673e-05, + "loss": 0.5722, + "step": 402400 + }, + { + "epoch": 9.270516717325227, + "grad_norm": 3.7076637744903564, + "learning_rate": 1.6946706708744784e-05, + "loss": 0.5773, + "step": 402600 + }, + { + "epoch": 9.275122041079488, + "grad_norm": 2.3307411670684814, + "learning_rate": 1.6939435144922267e-05, + "loss": 0.559, + "step": 402800 + }, + { + "epoch": 9.279727364833748, + "grad_norm": 3.8850185871124268, + "learning_rate": 1.6932163581099753e-05, + "loss": 0.5656, + "step": 403000 + }, + { + "epoch": 9.284332688588007, + "grad_norm": 3.246624231338501, + "learning_rate": 1.6924892017277235e-05, + "loss": 0.5893, + "step": 403200 + }, + { + "epoch": 9.288938012342268, + "grad_norm": 2.8703010082244873, + "learning_rate": 1.691762045345472e-05, + "loss": 0.5618, + "step": 403400 + }, + { + "epoch": 9.293543336096528, + "grad_norm": 3.0698583126068115, + "learning_rate": 1.6910385247451316e-05, + "loss": 0.5576, + "step": 403600 + }, + { + "epoch": 9.298148659850787, + "grad_norm": 2.5422704219818115, + "learning_rate": 1.6903113683628802e-05, + "loss": 0.5816, + "step": 403800 + }, + { + "epoch": 9.302753983605047, + "grad_norm": 3.1674082279205322, + "learning_rate": 1.6895842119806285e-05, + "loss": 0.586, + "step": 404000 + }, + { + "epoch": 9.307359307359308, + "grad_norm": 3.551118850708008, + "learning_rate": 1.688857055598377e-05, + "loss": 0.5837, + "step": 404200 + }, + { + "epoch": 9.311964631113566, + "grad_norm": 2.9174249172210693, + "learning_rate": 1.6881298992161257e-05, + "loss": 0.5796, + "step": 404400 + }, + { + "epoch": 9.316569954867827, + "grad_norm": 3.165064573287964, + "learning_rate": 1.6874027428338736e-05, + "loss": 0.5811, + "step": 404600 + }, + { + "epoch": 9.321175278622087, + "grad_norm": 3.249115228652954, + "learning_rate": 1.6866755864516222e-05, + "loss": 0.5701, + "step": 404800 + }, + { + "epoch": 9.325780602376348, + "grad_norm": 3.103433847427368, + "learning_rate": 1.6859484300693708e-05, + "loss": 0.5762, + "step": 405000 + }, + { + "epoch": 9.330385926130607, + "grad_norm": 3.612250328063965, + "learning_rate": 1.6852212736871194e-05, + "loss": 0.5697, + "step": 405200 + }, + { + "epoch": 9.334991249884867, + "grad_norm": 4.119315147399902, + "learning_rate": 1.6844941173048676e-05, + "loss": 0.5787, + "step": 405400 + }, + { + "epoch": 9.339596573639128, + "grad_norm": 4.298466205596924, + "learning_rate": 1.6837669609226162e-05, + "loss": 0.563, + "step": 405600 + }, + { + "epoch": 9.344201897393386, + "grad_norm": 2.9533872604370117, + "learning_rate": 1.6830398045403648e-05, + "loss": 0.5874, + "step": 405800 + }, + { + "epoch": 9.348807221147647, + "grad_norm": 3.106743812561035, + "learning_rate": 1.6823126481581127e-05, + "loss": 0.5742, + "step": 406000 + }, + { + "epoch": 9.353412544901907, + "grad_norm": 2.491485118865967, + "learning_rate": 1.6815854917758613e-05, + "loss": 0.5726, + "step": 406200 + }, + { + "epoch": 9.358017868656166, + "grad_norm": 3.787768602371216, + "learning_rate": 1.680861971175521e-05, + "loss": 0.5682, + "step": 406400 + }, + { + "epoch": 9.362623192410426, + "grad_norm": 3.0979113578796387, + "learning_rate": 1.6801348147932694e-05, + "loss": 0.5698, + "step": 406600 + }, + { + "epoch": 9.367228516164687, + "grad_norm": 2.7929654121398926, + "learning_rate": 1.679407658411018e-05, + "loss": 0.5644, + "step": 406800 + }, + { + "epoch": 9.371833839918946, + "grad_norm": 2.568842887878418, + "learning_rate": 1.6786805020287666e-05, + "loss": 0.5827, + "step": 407000 + }, + { + "epoch": 9.376439163673206, + "grad_norm": 3.327233076095581, + "learning_rate": 1.677953345646515e-05, + "loss": 0.5885, + "step": 407200 + }, + { + "epoch": 9.381044487427467, + "grad_norm": 2.4469523429870605, + "learning_rate": 1.677226189264263e-05, + "loss": 0.5692, + "step": 407400 + }, + { + "epoch": 9.385649811181725, + "grad_norm": 2.9710397720336914, + "learning_rate": 1.6764990328820117e-05, + "loss": 0.5651, + "step": 407600 + }, + { + "epoch": 9.390255134935986, + "grad_norm": 3.184701442718506, + "learning_rate": 1.67577187649976e-05, + "loss": 0.5642, + "step": 407800 + }, + { + "epoch": 9.394860458690246, + "grad_norm": 2.20974063873291, + "learning_rate": 1.6750447201175085e-05, + "loss": 0.5758, + "step": 408000 + }, + { + "epoch": 9.399465782444507, + "grad_norm": 2.655507802963257, + "learning_rate": 1.674317563735257e-05, + "loss": 0.5758, + "step": 408200 + }, + { + "epoch": 9.404071106198765, + "grad_norm": 2.7525951862335205, + "learning_rate": 1.6735904073530054e-05, + "loss": 0.5656, + "step": 408400 + }, + { + "epoch": 9.408676429953026, + "grad_norm": 3.151881694793701, + "learning_rate": 1.6728632509707536e-05, + "loss": 0.5701, + "step": 408600 + }, + { + "epoch": 9.413281753707286, + "grad_norm": 2.949288845062256, + "learning_rate": 1.6721360945885022e-05, + "loss": 0.5764, + "step": 408800 + }, + { + "epoch": 9.417887077461545, + "grad_norm": 3.4469354152679443, + "learning_rate": 1.6714089382062505e-05, + "loss": 0.577, + "step": 409000 + }, + { + "epoch": 9.422492401215806, + "grad_norm": 3.0155844688415527, + "learning_rate": 1.670681781823999e-05, + "loss": 0.5771, + "step": 409200 + }, + { + "epoch": 9.427097724970066, + "grad_norm": 3.070678472518921, + "learning_rate": 1.669958261223659e-05, + "loss": 0.5845, + "step": 409400 + }, + { + "epoch": 9.431703048724325, + "grad_norm": 2.2283005714416504, + "learning_rate": 1.6692311048414072e-05, + "loss": 0.5636, + "step": 409600 + }, + { + "epoch": 9.436308372478585, + "grad_norm": 2.9284727573394775, + "learning_rate": 1.6685039484591558e-05, + "loss": 0.581, + "step": 409800 + }, + { + "epoch": 9.440913696232846, + "grad_norm": 3.039658308029175, + "learning_rate": 1.6677767920769044e-05, + "loss": 0.568, + "step": 410000 + }, + { + "epoch": 9.445519019987104, + "grad_norm": 2.3291382789611816, + "learning_rate": 1.6670496356946523e-05, + "loss": 0.5628, + "step": 410200 + }, + { + "epoch": 9.450124343741365, + "grad_norm": 2.804224967956543, + "learning_rate": 1.666322479312401e-05, + "loss": 0.5728, + "step": 410400 + }, + { + "epoch": 9.454729667495625, + "grad_norm": 3.0607471466064453, + "learning_rate": 1.6655989587120607e-05, + "loss": 0.5655, + "step": 410600 + }, + { + "epoch": 9.459334991249884, + "grad_norm": 2.556657075881958, + "learning_rate": 1.664871802329809e-05, + "loss": 0.5723, + "step": 410800 + }, + { + "epoch": 9.463940315004145, + "grad_norm": 2.7533397674560547, + "learning_rate": 1.6641446459475576e-05, + "loss": 0.574, + "step": 411000 + }, + { + "epoch": 9.468545638758405, + "grad_norm": 2.0265374183654785, + "learning_rate": 1.663417489565306e-05, + "loss": 0.5751, + "step": 411200 + }, + { + "epoch": 9.473150962512666, + "grad_norm": 3.6705782413482666, + "learning_rate": 1.6626903331830544e-05, + "loss": 0.5787, + "step": 411400 + }, + { + "epoch": 9.477756286266924, + "grad_norm": 2.9727859497070312, + "learning_rate": 1.6619631768008027e-05, + "loss": 0.5658, + "step": 411600 + }, + { + "epoch": 9.482361610021185, + "grad_norm": 2.764256477355957, + "learning_rate": 1.6612360204185513e-05, + "loss": 0.5674, + "step": 411800 + }, + { + "epoch": 9.486966933775445, + "grad_norm": 4.042858123779297, + "learning_rate": 1.6605088640362995e-05, + "loss": 0.5726, + "step": 412000 + }, + { + "epoch": 9.491572257529704, + "grad_norm": 3.3755908012390137, + "learning_rate": 1.659781707654048e-05, + "loss": 0.5881, + "step": 412200 + }, + { + "epoch": 9.496177581283964, + "grad_norm": 3.193547487258911, + "learning_rate": 1.6590545512717967e-05, + "loss": 0.5649, + "step": 412400 + }, + { + "epoch": 9.500782905038225, + "grad_norm": 3.653513193130493, + "learning_rate": 1.658327394889545e-05, + "loss": 0.5807, + "step": 412600 + }, + { + "epoch": 9.505388228792484, + "grad_norm": 2.6081292629241943, + "learning_rate": 1.6576002385072935e-05, + "loss": 0.5756, + "step": 412800 + }, + { + "epoch": 9.509993552546744, + "grad_norm": 3.2407031059265137, + "learning_rate": 1.6568730821250418e-05, + "loss": 0.5617, + "step": 413000 + }, + { + "epoch": 9.514598876301005, + "grad_norm": 3.620607614517212, + "learning_rate": 1.6561459257427904e-05, + "loss": 0.5697, + "step": 413200 + }, + { + "epoch": 9.519204200055263, + "grad_norm": 3.521552085876465, + "learning_rate": 1.6554187693605386e-05, + "loss": 0.5804, + "step": 413400 + }, + { + "epoch": 9.523809523809524, + "grad_norm": 3.074605941772461, + "learning_rate": 1.6546916129782872e-05, + "loss": 0.5629, + "step": 413600 + }, + { + "epoch": 9.528414847563784, + "grad_norm": 3.518260955810547, + "learning_rate": 1.6539644565960358e-05, + "loss": 0.5772, + "step": 413800 + }, + { + "epoch": 9.533020171318043, + "grad_norm": 4.271399974822998, + "learning_rate": 1.653237300213784e-05, + "loss": 0.5713, + "step": 414000 + }, + { + "epoch": 9.537625495072303, + "grad_norm": 2.3440682888031006, + "learning_rate": 1.6525101438315323e-05, + "loss": 0.5715, + "step": 414200 + }, + { + "epoch": 9.542230818826564, + "grad_norm": 3.0813117027282715, + "learning_rate": 1.651782987449281e-05, + "loss": 0.5761, + "step": 414400 + }, + { + "epoch": 9.546836142580823, + "grad_norm": 3.250023126602173, + "learning_rate": 1.651055831067029e-05, + "loss": 0.5737, + "step": 414600 + }, + { + "epoch": 9.551441466335083, + "grad_norm": 3.33170485496521, + "learning_rate": 1.6503286746847778e-05, + "loss": 0.5758, + "step": 414800 + }, + { + "epoch": 9.556046790089344, + "grad_norm": 3.526592254638672, + "learning_rate": 1.6496015183025263e-05, + "loss": 0.5632, + "step": 415000 + }, + { + "epoch": 9.560652113843604, + "grad_norm": 3.5739214420318604, + "learning_rate": 1.6488743619202746e-05, + "loss": 0.5862, + "step": 415200 + }, + { + "epoch": 9.565257437597863, + "grad_norm": 3.0339114665985107, + "learning_rate": 1.648147205538023e-05, + "loss": 0.5733, + "step": 415400 + }, + { + "epoch": 9.569862761352123, + "grad_norm": 2.7391164302825928, + "learning_rate": 1.6474200491557714e-05, + "loss": 0.5671, + "step": 415600 + }, + { + "epoch": 9.574468085106384, + "grad_norm": 2.494318962097168, + "learning_rate": 1.64669289277352e-05, + "loss": 0.586, + "step": 415800 + }, + { + "epoch": 9.579073408860642, + "grad_norm": 2.9275710582733154, + "learning_rate": 1.6459657363912683e-05, + "loss": 0.5868, + "step": 416000 + }, + { + "epoch": 9.583678732614903, + "grad_norm": 2.980813980102539, + "learning_rate": 1.645238580009017e-05, + "loss": 0.5784, + "step": 416200 + }, + { + "epoch": 9.588284056369163, + "grad_norm": 2.879387617111206, + "learning_rate": 1.6445114236267655e-05, + "loss": 0.5534, + "step": 416400 + }, + { + "epoch": 9.592889380123422, + "grad_norm": 2.6000664234161377, + "learning_rate": 1.643787903026425e-05, + "loss": 0.5756, + "step": 416600 + }, + { + "epoch": 9.597494703877683, + "grad_norm": 2.941392660140991, + "learning_rate": 1.6430607466441736e-05, + "loss": 0.5787, + "step": 416800 + }, + { + "epoch": 9.602100027631943, + "grad_norm": 3.4506592750549316, + "learning_rate": 1.6423335902619215e-05, + "loss": 0.5694, + "step": 417000 + }, + { + "epoch": 9.606705351386202, + "grad_norm": 3.014477014541626, + "learning_rate": 1.64160643387967e-05, + "loss": 0.5785, + "step": 417200 + }, + { + "epoch": 9.611310675140462, + "grad_norm": 3.4871177673339844, + "learning_rate": 1.6408792774974187e-05, + "loss": 0.58, + "step": 417400 + }, + { + "epoch": 9.615915998894723, + "grad_norm": 3.0921521186828613, + "learning_rate": 1.6401521211151673e-05, + "loss": 0.5571, + "step": 417600 + }, + { + "epoch": 9.620521322648981, + "grad_norm": 3.4543838500976562, + "learning_rate": 1.6394249647329155e-05, + "loss": 0.5689, + "step": 417800 + }, + { + "epoch": 9.625126646403242, + "grad_norm": 3.799140453338623, + "learning_rate": 1.638697808350664e-05, + "loss": 0.5898, + "step": 418000 + }, + { + "epoch": 9.629731970157502, + "grad_norm": 2.701723575592041, + "learning_rate": 1.6379706519684124e-05, + "loss": 0.584, + "step": 418200 + }, + { + "epoch": 9.634337293911763, + "grad_norm": 3.6430578231811523, + "learning_rate": 1.6372434955861606e-05, + "loss": 0.5649, + "step": 418400 + }, + { + "epoch": 9.638942617666022, + "grad_norm": 3.381641387939453, + "learning_rate": 1.6365163392039092e-05, + "loss": 0.5755, + "step": 418600 + }, + { + "epoch": 9.643547941420282, + "grad_norm": 3.1972997188568115, + "learning_rate": 1.6357891828216578e-05, + "loss": 0.5767, + "step": 418800 + }, + { + "epoch": 9.648153265174543, + "grad_norm": 2.762204647064209, + "learning_rate": 1.635062026439406e-05, + "loss": 0.5659, + "step": 419000 + }, + { + "epoch": 9.652758588928801, + "grad_norm": 2.9068100452423096, + "learning_rate": 1.6343348700571546e-05, + "loss": 0.5822, + "step": 419200 + }, + { + "epoch": 9.657363912683062, + "grad_norm": 3.140155553817749, + "learning_rate": 1.633607713674903e-05, + "loss": 0.5617, + "step": 419400 + }, + { + "epoch": 9.661969236437322, + "grad_norm": 3.393486738204956, + "learning_rate": 1.632880557292651e-05, + "loss": 0.59, + "step": 419600 + }, + { + "epoch": 9.666574560191581, + "grad_norm": 3.3374438285827637, + "learning_rate": 1.6321534009103997e-05, + "loss": 0.5863, + "step": 419800 + }, + { + "epoch": 9.671179883945841, + "grad_norm": 2.8088021278381348, + "learning_rate": 1.6314262445281483e-05, + "loss": 0.5748, + "step": 420000 + }, + { + "epoch": 9.675785207700102, + "grad_norm": 3.9555068016052246, + "learning_rate": 1.630702723927808e-05, + "loss": 0.5809, + "step": 420200 + }, + { + "epoch": 9.68039053145436, + "grad_norm": 3.5197412967681885, + "learning_rate": 1.6299755675455564e-05, + "loss": 0.5796, + "step": 420400 + }, + { + "epoch": 9.684995855208621, + "grad_norm": 2.782156229019165, + "learning_rate": 1.629248411163305e-05, + "loss": 0.5738, + "step": 420600 + }, + { + "epoch": 9.689601178962882, + "grad_norm": 3.482156276702881, + "learning_rate": 1.6285212547810533e-05, + "loss": 0.5766, + "step": 420800 + }, + { + "epoch": 9.69420650271714, + "grad_norm": 2.7525062561035156, + "learning_rate": 1.6277940983988015e-05, + "loss": 0.5774, + "step": 421000 + }, + { + "epoch": 9.6988118264714, + "grad_norm": 3.4421098232269287, + "learning_rate": 1.62706694201655e-05, + "loss": 0.5709, + "step": 421200 + }, + { + "epoch": 9.703417150225661, + "grad_norm": 2.6279489994049072, + "learning_rate": 1.6263397856342984e-05, + "loss": 0.5647, + "step": 421400 + }, + { + "epoch": 9.70802247397992, + "grad_norm": 2.9035749435424805, + "learning_rate": 1.625612629252047e-05, + "loss": 0.5715, + "step": 421600 + }, + { + "epoch": 9.71262779773418, + "grad_norm": 3.324402093887329, + "learning_rate": 1.6248854728697956e-05, + "loss": 0.5828, + "step": 421800 + }, + { + "epoch": 9.71723312148844, + "grad_norm": 3.438358783721924, + "learning_rate": 1.6241583164875438e-05, + "loss": 0.5718, + "step": 422000 + }, + { + "epoch": 9.721838445242701, + "grad_norm": 3.8631820678710938, + "learning_rate": 1.623431160105292e-05, + "loss": 0.5787, + "step": 422200 + }, + { + "epoch": 9.72644376899696, + "grad_norm": 3.755342483520508, + "learning_rate": 1.6227040037230407e-05, + "loss": 0.5799, + "step": 422400 + }, + { + "epoch": 9.73104909275122, + "grad_norm": 4.419692039489746, + "learning_rate": 1.6219768473407893e-05, + "loss": 0.572, + "step": 422600 + }, + { + "epoch": 9.735654416505481, + "grad_norm": 3.5479319095611572, + "learning_rate": 1.6212496909585375e-05, + "loss": 0.5864, + "step": 422800 + }, + { + "epoch": 9.74025974025974, + "grad_norm": 2.9561350345611572, + "learning_rate": 1.620522534576286e-05, + "loss": 0.5806, + "step": 423000 + }, + { + "epoch": 9.744865064014, + "grad_norm": 2.8360917568206787, + "learning_rate": 1.6197953781940347e-05, + "loss": 0.5824, + "step": 423200 + }, + { + "epoch": 9.74947038776826, + "grad_norm": 3.090214967727661, + "learning_rate": 1.619068221811783e-05, + "loss": 0.5732, + "step": 423400 + }, + { + "epoch": 9.75407571152252, + "grad_norm": 3.477980613708496, + "learning_rate": 1.6183447012114428e-05, + "loss": 0.5688, + "step": 423600 + }, + { + "epoch": 9.75868103527678, + "grad_norm": 2.7006947994232178, + "learning_rate": 1.617617544829191e-05, + "loss": 0.5744, + "step": 423800 + }, + { + "epoch": 9.76328635903104, + "grad_norm": 3.080148696899414, + "learning_rate": 1.6168903884469393e-05, + "loss": 0.5751, + "step": 424000 + }, + { + "epoch": 9.767891682785299, + "grad_norm": 3.5952165126800537, + "learning_rate": 1.616163232064688e-05, + "loss": 0.5706, + "step": 424200 + }, + { + "epoch": 9.77249700653956, + "grad_norm": 3.141179323196411, + "learning_rate": 1.6154360756824365e-05, + "loss": 0.5758, + "step": 424400 + }, + { + "epoch": 9.77710233029382, + "grad_norm": 2.961484909057617, + "learning_rate": 1.6147089193001847e-05, + "loss": 0.5813, + "step": 424600 + }, + { + "epoch": 9.78170765404808, + "grad_norm": 3.8218839168548584, + "learning_rate": 1.6139817629179333e-05, + "loss": 0.5677, + "step": 424800 + }, + { + "epoch": 9.78631297780234, + "grad_norm": 2.3090195655822754, + "learning_rate": 1.6132546065356816e-05, + "loss": 0.5806, + "step": 425000 + }, + { + "epoch": 9.7909183015566, + "grad_norm": 2.9039793014526367, + "learning_rate": 1.61252745015343e-05, + "loss": 0.5748, + "step": 425200 + }, + { + "epoch": 9.79552362531086, + "grad_norm": 3.138223171234131, + "learning_rate": 1.6118002937711784e-05, + "loss": 0.5711, + "step": 425400 + }, + { + "epoch": 9.800128949065119, + "grad_norm": 3.0506985187530518, + "learning_rate": 1.611073137388927e-05, + "loss": 0.5705, + "step": 425600 + }, + { + "epoch": 9.80473427281938, + "grad_norm": 2.8752284049987793, + "learning_rate": 1.6103496167885865e-05, + "loss": 0.5763, + "step": 425800 + }, + { + "epoch": 9.80933959657364, + "grad_norm": 3.2525858879089355, + "learning_rate": 1.609622460406335e-05, + "loss": 0.5703, + "step": 426000 + }, + { + "epoch": 9.813944920327899, + "grad_norm": 3.1471686363220215, + "learning_rate": 1.6088953040240837e-05, + "loss": 0.5665, + "step": 426200 + }, + { + "epoch": 9.818550244082159, + "grad_norm": 2.207984685897827, + "learning_rate": 1.6081681476418316e-05, + "loss": 0.5853, + "step": 426400 + }, + { + "epoch": 9.82315556783642, + "grad_norm": 3.0319111347198486, + "learning_rate": 1.6074409912595802e-05, + "loss": 0.5681, + "step": 426600 + }, + { + "epoch": 9.827760891590678, + "grad_norm": 2.59087872505188, + "learning_rate": 1.6067138348773288e-05, + "loss": 0.5699, + "step": 426800 + }, + { + "epoch": 9.832366215344939, + "grad_norm": 2.8990542888641357, + "learning_rate": 1.605986678495077e-05, + "loss": 0.5683, + "step": 427000 + }, + { + "epoch": 9.8369715390992, + "grad_norm": 2.702047109603882, + "learning_rate": 1.6052595221128257e-05, + "loss": 0.5793, + "step": 427200 + }, + { + "epoch": 9.841576862853458, + "grad_norm": 3.4502487182617188, + "learning_rate": 1.6045323657305743e-05, + "loss": 0.5725, + "step": 427400 + }, + { + "epoch": 9.846182186607718, + "grad_norm": 2.59299635887146, + "learning_rate": 1.6038052093483225e-05, + "loss": 0.5812, + "step": 427600 + }, + { + "epoch": 9.850787510361979, + "grad_norm": 3.2523722648620605, + "learning_rate": 1.6030780529660708e-05, + "loss": 0.5743, + "step": 427800 + }, + { + "epoch": 9.855392834116238, + "grad_norm": 2.8594510555267334, + "learning_rate": 1.6023508965838194e-05, + "loss": 0.5623, + "step": 428000 + }, + { + "epoch": 9.859998157870498, + "grad_norm": 3.209397792816162, + "learning_rate": 1.6016237402015676e-05, + "loss": 0.5766, + "step": 428200 + }, + { + "epoch": 9.864603481624759, + "grad_norm": 3.1015477180480957, + "learning_rate": 1.6008965838193162e-05, + "loss": 0.5793, + "step": 428400 + }, + { + "epoch": 9.869208805379017, + "grad_norm": 2.951237440109253, + "learning_rate": 1.600173063218976e-05, + "loss": 0.5745, + "step": 428600 + }, + { + "epoch": 9.873814129133278, + "grad_norm": 3.060861825942993, + "learning_rate": 1.5994459068367243e-05, + "loss": 0.58, + "step": 428800 + }, + { + "epoch": 9.878419452887538, + "grad_norm": 2.895648956298828, + "learning_rate": 1.598718750454473e-05, + "loss": 0.5711, + "step": 429000 + }, + { + "epoch": 9.883024776641799, + "grad_norm": 3.3986594676971436, + "learning_rate": 1.597991594072221e-05, + "loss": 0.5773, + "step": 429200 + }, + { + "epoch": 9.887630100396057, + "grad_norm": 3.4484481811523438, + "learning_rate": 1.5972644376899694e-05, + "loss": 0.5774, + "step": 429400 + }, + { + "epoch": 9.892235424150318, + "grad_norm": 3.4718239307403564, + "learning_rate": 1.596537281307718e-05, + "loss": 0.577, + "step": 429600 + }, + { + "epoch": 9.896840747904578, + "grad_norm": 2.872313976287842, + "learning_rate": 1.5958101249254666e-05, + "loss": 0.5752, + "step": 429800 + }, + { + "epoch": 9.901446071658837, + "grad_norm": 3.031860589981079, + "learning_rate": 1.595082968543215e-05, + "loss": 0.5822, + "step": 430000 + }, + { + "epoch": 9.906051395413098, + "grad_norm": 3.6577560901641846, + "learning_rate": 1.5943558121609634e-05, + "loss": 0.5665, + "step": 430200 + }, + { + "epoch": 9.910656719167358, + "grad_norm": 2.5794246196746826, + "learning_rate": 1.593628655778712e-05, + "loss": 0.584, + "step": 430400 + }, + { + "epoch": 9.915262042921617, + "grad_norm": 3.3950586318969727, + "learning_rate": 1.5929014993964603e-05, + "loss": 0.5548, + "step": 430600 + }, + { + "epoch": 9.919867366675877, + "grad_norm": 2.942765474319458, + "learning_rate": 1.5921743430142085e-05, + "loss": 0.5763, + "step": 430800 + }, + { + "epoch": 9.924472690430138, + "grad_norm": 2.925250291824341, + "learning_rate": 1.591447186631957e-05, + "loss": 0.5572, + "step": 431000 + }, + { + "epoch": 9.929078014184396, + "grad_norm": 2.779973030090332, + "learning_rate": 1.5907200302497057e-05, + "loss": 0.5651, + "step": 431200 + }, + { + "epoch": 9.933683337938657, + "grad_norm": 2.9650769233703613, + "learning_rate": 1.589992873867454e-05, + "loss": 0.5765, + "step": 431400 + }, + { + "epoch": 9.938288661692917, + "grad_norm": 2.562681198120117, + "learning_rate": 1.5892657174852026e-05, + "loss": 0.5653, + "step": 431600 + }, + { + "epoch": 9.942893985447178, + "grad_norm": 3.5627505779266357, + "learning_rate": 1.5885385611029508e-05, + "loss": 0.5617, + "step": 431800 + }, + { + "epoch": 9.947499309201437, + "grad_norm": 3.4282848834991455, + "learning_rate": 1.5878150405026103e-05, + "loss": 0.5673, + "step": 432000 + }, + { + "epoch": 9.952104632955697, + "grad_norm": 3.556774139404297, + "learning_rate": 1.587087884120359e-05, + "loss": 0.5806, + "step": 432200 + }, + { + "epoch": 9.956709956709958, + "grad_norm": 3.773305654525757, + "learning_rate": 1.5863607277381075e-05, + "loss": 0.5731, + "step": 432400 + }, + { + "epoch": 9.961315280464216, + "grad_norm": 2.997891902923584, + "learning_rate": 1.5856335713558558e-05, + "loss": 0.5713, + "step": 432600 + }, + { + "epoch": 9.965920604218477, + "grad_norm": 3.1296310424804688, + "learning_rate": 1.5849064149736044e-05, + "loss": 0.5618, + "step": 432800 + }, + { + "epoch": 9.970525927972737, + "grad_norm": 3.2844841480255127, + "learning_rate": 1.584179258591353e-05, + "loss": 0.5774, + "step": 433000 + }, + { + "epoch": 9.975131251726996, + "grad_norm": 2.3057990074157715, + "learning_rate": 1.583452102209101e-05, + "loss": 0.5783, + "step": 433200 + }, + { + "epoch": 9.979736575481256, + "grad_norm": 2.7492430210113525, + "learning_rate": 1.5827249458268495e-05, + "loss": 0.5757, + "step": 433400 + }, + { + "epoch": 9.984341899235517, + "grad_norm": 2.937319278717041, + "learning_rate": 1.581997789444598e-05, + "loss": 0.5814, + "step": 433600 + }, + { + "epoch": 9.988947222989776, + "grad_norm": 2.906705141067505, + "learning_rate": 1.5812706330623463e-05, + "loss": 0.5783, + "step": 433800 + }, + { + "epoch": 9.993552546744036, + "grad_norm": 2.5989561080932617, + "learning_rate": 1.580543476680095e-05, + "loss": 0.5865, + "step": 434000 + }, + { + "epoch": 9.998157870498297, + "grad_norm": 3.9318482875823975, + "learning_rate": 1.5798163202978435e-05, + "loss": 0.5763, + "step": 434200 + }, + { + "epoch": 10.0, + "eval_loss": 0.5503791570663452, + "eval_runtime": 162.7661, + "eval_samples_per_second": 174.244, + "eval_steps_per_second": 10.893, + "step": 434280 + }, + { + "epoch": 10.002763194252555, + "grad_norm": 3.2857744693756104, + "learning_rate": 1.579092799697503e-05, + "loss": 0.584, + "step": 434400 + }, + { + "epoch": 10.007368518006816, + "grad_norm": 2.8815836906433105, + "learning_rate": 1.578369279097163e-05, + "loss": 0.5577, + "step": 434600 + }, + { + "epoch": 10.011973841761076, + "grad_norm": 2.8259472846984863, + "learning_rate": 1.577642122714911e-05, + "loss": 0.5706, + "step": 434800 + }, + { + "epoch": 10.016579165515335, + "grad_norm": 2.6986682415008545, + "learning_rate": 1.5769149663326594e-05, + "loss": 0.5662, + "step": 435000 + }, + { + "epoch": 10.021184489269595, + "grad_norm": 3.565535545349121, + "learning_rate": 1.576187809950408e-05, + "loss": 0.5678, + "step": 435200 + }, + { + "epoch": 10.025789813023856, + "grad_norm": 2.950361967086792, + "learning_rate": 1.5754606535681565e-05, + "loss": 0.5743, + "step": 435400 + }, + { + "epoch": 10.030395136778116, + "grad_norm": 2.67488694190979, + "learning_rate": 1.5747334971859048e-05, + "loss": 0.5798, + "step": 435600 + }, + { + "epoch": 10.035000460532375, + "grad_norm": 2.76383113861084, + "learning_rate": 1.5740063408036534e-05, + "loss": 0.5616, + "step": 435800 + }, + { + "epoch": 10.039605784286636, + "grad_norm": 2.7756881713867188, + "learning_rate": 1.573279184421402e-05, + "loss": 0.582, + "step": 436000 + }, + { + "epoch": 10.044211108040896, + "grad_norm": 3.5233657360076904, + "learning_rate": 1.57255202803915e-05, + "loss": 0.5697, + "step": 436200 + }, + { + "epoch": 10.048816431795155, + "grad_norm": 2.4349958896636963, + "learning_rate": 1.5718248716568985e-05, + "loss": 0.565, + "step": 436400 + }, + { + "epoch": 10.053421755549415, + "grad_norm": 2.8675363063812256, + "learning_rate": 1.571097715274647e-05, + "loss": 0.5669, + "step": 436600 + }, + { + "epoch": 10.058027079303676, + "grad_norm": 2.97993540763855, + "learning_rate": 1.5703705588923953e-05, + "loss": 0.5759, + "step": 436800 + }, + { + "epoch": 10.062632403057934, + "grad_norm": 3.144256353378296, + "learning_rate": 1.569643402510144e-05, + "loss": 0.5606, + "step": 437000 + }, + { + "epoch": 10.067237726812195, + "grad_norm": 2.9724130630493164, + "learning_rate": 1.5689162461278925e-05, + "loss": 0.5811, + "step": 437200 + }, + { + "epoch": 10.071843050566455, + "grad_norm": 3.0126194953918457, + "learning_rate": 1.5681890897456408e-05, + "loss": 0.5646, + "step": 437400 + }, + { + "epoch": 10.076448374320714, + "grad_norm": 2.4599967002868652, + "learning_rate": 1.567461933363389e-05, + "loss": 0.582, + "step": 437600 + }, + { + "epoch": 10.081053698074975, + "grad_norm": 2.899773120880127, + "learning_rate": 1.5667347769811376e-05, + "loss": 0.5706, + "step": 437800 + }, + { + "epoch": 10.085659021829235, + "grad_norm": 2.810063600540161, + "learning_rate": 1.566007620598886e-05, + "loss": 0.5678, + "step": 438000 + }, + { + "epoch": 10.090264345583494, + "grad_norm": 2.5231738090515137, + "learning_rate": 1.5652804642166345e-05, + "loss": 0.5784, + "step": 438200 + }, + { + "epoch": 10.094869669337754, + "grad_norm": 2.406407594680786, + "learning_rate": 1.564553307834383e-05, + "loss": 0.5653, + "step": 438400 + }, + { + "epoch": 10.099474993092015, + "grad_norm": 3.0736372470855713, + "learning_rate": 1.5638261514521316e-05, + "loss": 0.5703, + "step": 438600 + }, + { + "epoch": 10.104080316846275, + "grad_norm": 2.5752131938934326, + "learning_rate": 1.5630989950698796e-05, + "loss": 0.5667, + "step": 438800 + }, + { + "epoch": 10.108685640600534, + "grad_norm": 3.5142014026641846, + "learning_rate": 1.562371838687628e-05, + "loss": 0.5829, + "step": 439000 + }, + { + "epoch": 10.113290964354794, + "grad_norm": 3.4527642726898193, + "learning_rate": 1.5616446823053767e-05, + "loss": 0.5663, + "step": 439200 + }, + { + "epoch": 10.117896288109055, + "grad_norm": 3.4308338165283203, + "learning_rate": 1.560917525923125e-05, + "loss": 0.5725, + "step": 439400 + }, + { + "epoch": 10.122501611863314, + "grad_norm": 2.8218986988067627, + "learning_rate": 1.5601903695408736e-05, + "loss": 0.5723, + "step": 439600 + }, + { + "epoch": 10.127106935617574, + "grad_norm": 3.292224407196045, + "learning_rate": 1.559463213158622e-05, + "loss": 0.5753, + "step": 439800 + }, + { + "epoch": 10.131712259371835, + "grad_norm": 2.748878240585327, + "learning_rate": 1.5587396925582817e-05, + "loss": 0.5636, + "step": 440000 + }, + { + "epoch": 10.136317583126093, + "grad_norm": 3.109516143798828, + "learning_rate": 1.55801253617603e-05, + "loss": 0.5534, + "step": 440200 + }, + { + "epoch": 10.140922906880354, + "grad_norm": 3.0865907669067383, + "learning_rate": 1.5572853797937785e-05, + "loss": 0.5619, + "step": 440400 + }, + { + "epoch": 10.145528230634614, + "grad_norm": 3.90614914894104, + "learning_rate": 1.5565582234115268e-05, + "loss": 0.5706, + "step": 440600 + }, + { + "epoch": 10.150133554388873, + "grad_norm": 3.342874526977539, + "learning_rate": 1.5558310670292754e-05, + "loss": 0.5666, + "step": 440800 + }, + { + "epoch": 10.154738878143133, + "grad_norm": 3.8713204860687256, + "learning_rate": 1.555103910647024e-05, + "loss": 0.57, + "step": 441000 + }, + { + "epoch": 10.159344201897394, + "grad_norm": 3.356518507003784, + "learning_rate": 1.5543767542647722e-05, + "loss": 0.5671, + "step": 441200 + }, + { + "epoch": 10.163949525651653, + "grad_norm": 3.4761784076690674, + "learning_rate": 1.5536495978825208e-05, + "loss": 0.5756, + "step": 441400 + }, + { + "epoch": 10.168554849405913, + "grad_norm": 3.0958569049835205, + "learning_rate": 1.552922441500269e-05, + "loss": 0.5545, + "step": 441600 + }, + { + "epoch": 10.173160173160174, + "grad_norm": 3.5366578102111816, + "learning_rate": 1.5521952851180173e-05, + "loss": 0.5569, + "step": 441800 + }, + { + "epoch": 10.177765496914432, + "grad_norm": 3.0976240634918213, + "learning_rate": 1.551468128735766e-05, + "loss": 0.5787, + "step": 442000 + }, + { + "epoch": 10.182370820668693, + "grad_norm": 2.9487667083740234, + "learning_rate": 1.5507409723535145e-05, + "loss": 0.5719, + "step": 442200 + }, + { + "epoch": 10.186976144422953, + "grad_norm": 3.9748101234436035, + "learning_rate": 1.5500138159712628e-05, + "loss": 0.5772, + "step": 442400 + }, + { + "epoch": 10.191581468177214, + "grad_norm": 3.862164258956909, + "learning_rate": 1.5492866595890113e-05, + "loss": 0.5625, + "step": 442600 + }, + { + "epoch": 10.196186791931472, + "grad_norm": 3.5025646686553955, + "learning_rate": 1.5485595032067596e-05, + "loss": 0.5744, + "step": 442800 + }, + { + "epoch": 10.200792115685733, + "grad_norm": 2.911168336868286, + "learning_rate": 1.5478323468245082e-05, + "loss": 0.5606, + "step": 443000 + }, + { + "epoch": 10.205397439439993, + "grad_norm": 3.7616069316864014, + "learning_rate": 1.5471051904422564e-05, + "loss": 0.5661, + "step": 443200 + }, + { + "epoch": 10.210002763194252, + "grad_norm": 2.7809619903564453, + "learning_rate": 1.546378034060005e-05, + "loss": 0.5583, + "step": 443400 + }, + { + "epoch": 10.214608086948513, + "grad_norm": 4.306921005249023, + "learning_rate": 1.5456508776777536e-05, + "loss": 0.5722, + "step": 443600 + }, + { + "epoch": 10.219213410702773, + "grad_norm": 2.9962635040283203, + "learning_rate": 1.544923721295502e-05, + "loss": 0.5667, + "step": 443800 + }, + { + "epoch": 10.223818734457032, + "grad_norm": 2.8903093338012695, + "learning_rate": 1.54419656491325e-05, + "loss": 0.5655, + "step": 444000 + }, + { + "epoch": 10.228424058211292, + "grad_norm": 3.401038885116577, + "learning_rate": 1.5434694085309987e-05, + "loss": 0.5708, + "step": 444200 + }, + { + "epoch": 10.233029381965553, + "grad_norm": 2.52870774269104, + "learning_rate": 1.542742252148747e-05, + "loss": 0.564, + "step": 444400 + }, + { + "epoch": 10.237634705719811, + "grad_norm": 3.640711784362793, + "learning_rate": 1.542018731548407e-05, + "loss": 0.5723, + "step": 444600 + }, + { + "epoch": 10.242240029474072, + "grad_norm": 2.800980806350708, + "learning_rate": 1.5412915751661554e-05, + "loss": 0.5613, + "step": 444800 + }, + { + "epoch": 10.246845353228332, + "grad_norm": 3.3499021530151367, + "learning_rate": 1.5405644187839037e-05, + "loss": 0.5739, + "step": 445000 + }, + { + "epoch": 10.251450676982591, + "grad_norm": 2.415259838104248, + "learning_rate": 1.5398408981835635e-05, + "loss": 0.566, + "step": 445200 + }, + { + "epoch": 10.256056000736852, + "grad_norm": 3.0290565490722656, + "learning_rate": 1.5391137418013118e-05, + "loss": 0.5834, + "step": 445400 + }, + { + "epoch": 10.260661324491112, + "grad_norm": 2.8429722785949707, + "learning_rate": 1.5383865854190604e-05, + "loss": 0.5733, + "step": 445600 + }, + { + "epoch": 10.265266648245372, + "grad_norm": 2.889953136444092, + "learning_rate": 1.5376594290368086e-05, + "loss": 0.5635, + "step": 445800 + }, + { + "epoch": 10.269871971999631, + "grad_norm": 3.5304625034332275, + "learning_rate": 1.536932272654557e-05, + "loss": 0.5689, + "step": 446000 + }, + { + "epoch": 10.274477295753892, + "grad_norm": 3.2472240924835205, + "learning_rate": 1.5362051162723055e-05, + "loss": 0.5626, + "step": 446200 + }, + { + "epoch": 10.279082619508152, + "grad_norm": 3.443258047103882, + "learning_rate": 1.535477959890054e-05, + "loss": 0.5686, + "step": 446400 + }, + { + "epoch": 10.28368794326241, + "grad_norm": 3.0054471492767334, + "learning_rate": 1.5347508035078027e-05, + "loss": 0.563, + "step": 446600 + }, + { + "epoch": 10.288293267016671, + "grad_norm": 3.115903854370117, + "learning_rate": 1.534023647125551e-05, + "loss": 0.5618, + "step": 446800 + }, + { + "epoch": 10.292898590770932, + "grad_norm": 3.3110814094543457, + "learning_rate": 1.533296490743299e-05, + "loss": 0.5567, + "step": 447000 + }, + { + "epoch": 10.29750391452519, + "grad_norm": 2.7418558597564697, + "learning_rate": 1.5325693343610478e-05, + "loss": 0.5526, + "step": 447200 + }, + { + "epoch": 10.302109238279451, + "grad_norm": 3.7277021408081055, + "learning_rate": 1.531842177978796e-05, + "loss": 0.5549, + "step": 447400 + }, + { + "epoch": 10.306714562033712, + "grad_norm": 3.4062893390655518, + "learning_rate": 1.5311150215965446e-05, + "loss": 0.5749, + "step": 447600 + }, + { + "epoch": 10.31131988578797, + "grad_norm": 3.7285375595092773, + "learning_rate": 1.5303878652142932e-05, + "loss": 0.5635, + "step": 447800 + }, + { + "epoch": 10.31592520954223, + "grad_norm": 3.451294183731079, + "learning_rate": 1.5296643446139527e-05, + "loss": 0.5615, + "step": 448000 + }, + { + "epoch": 10.320530533296491, + "grad_norm": 2.3311824798583984, + "learning_rate": 1.5289371882317013e-05, + "loss": 0.5668, + "step": 448200 + }, + { + "epoch": 10.32513585705075, + "grad_norm": 3.451138734817505, + "learning_rate": 1.52821003184945e-05, + "loss": 0.5659, + "step": 448400 + }, + { + "epoch": 10.32974118080501, + "grad_norm": 2.754631757736206, + "learning_rate": 1.5274828754671978e-05, + "loss": 0.5746, + "step": 448600 + }, + { + "epoch": 10.33434650455927, + "grad_norm": 3.7678756713867188, + "learning_rate": 1.5267557190849464e-05, + "loss": 0.5647, + "step": 448800 + }, + { + "epoch": 10.33895182831353, + "grad_norm": 2.977546215057373, + "learning_rate": 1.526028562702695e-05, + "loss": 0.5675, + "step": 449000 + }, + { + "epoch": 10.34355715206779, + "grad_norm": 2.2833847999572754, + "learning_rate": 1.5253014063204434e-05, + "loss": 0.5571, + "step": 449200 + }, + { + "epoch": 10.34816247582205, + "grad_norm": 3.352088689804077, + "learning_rate": 1.5245742499381918e-05, + "loss": 0.5803, + "step": 449400 + }, + { + "epoch": 10.352767799576311, + "grad_norm": 3.5555100440979004, + "learning_rate": 1.5238470935559403e-05, + "loss": 0.5733, + "step": 449600 + }, + { + "epoch": 10.35737312333057, + "grad_norm": 2.3247013092041016, + "learning_rate": 1.5231199371736885e-05, + "loss": 0.5573, + "step": 449800 + }, + { + "epoch": 10.36197844708483, + "grad_norm": 2.8674051761627197, + "learning_rate": 1.5223964165733482e-05, + "loss": 0.5689, + "step": 450000 + }, + { + "epoch": 10.36658377083909, + "grad_norm": 3.7046046257019043, + "learning_rate": 1.5216692601910966e-05, + "loss": 0.5753, + "step": 450200 + }, + { + "epoch": 10.37118909459335, + "grad_norm": 3.3615944385528564, + "learning_rate": 1.520942103808845e-05, + "loss": 0.5823, + "step": 450400 + }, + { + "epoch": 10.37579441834761, + "grad_norm": 4.1015238761901855, + "learning_rate": 1.5202149474265936e-05, + "loss": 0.5746, + "step": 450600 + }, + { + "epoch": 10.38039974210187, + "grad_norm": 2.8031654357910156, + "learning_rate": 1.519487791044342e-05, + "loss": 0.5707, + "step": 450800 + }, + { + "epoch": 10.385005065856129, + "grad_norm": 3.370823860168457, + "learning_rate": 1.5187606346620907e-05, + "loss": 0.5652, + "step": 451000 + }, + { + "epoch": 10.38961038961039, + "grad_norm": 3.259626626968384, + "learning_rate": 1.5180371140617503e-05, + "loss": 0.5666, + "step": 451200 + }, + { + "epoch": 10.39421571336465, + "grad_norm": 4.263233184814453, + "learning_rate": 1.5173099576794984e-05, + "loss": 0.5697, + "step": 451400 + }, + { + "epoch": 10.398821037118909, + "grad_norm": 2.861125946044922, + "learning_rate": 1.5165828012972468e-05, + "loss": 0.5782, + "step": 451600 + }, + { + "epoch": 10.40342636087317, + "grad_norm": 3.5409388542175293, + "learning_rate": 1.5158556449149954e-05, + "loss": 0.5651, + "step": 451800 + }, + { + "epoch": 10.40803168462743, + "grad_norm": 3.4147825241088867, + "learning_rate": 1.5151284885327439e-05, + "loss": 0.5646, + "step": 452000 + }, + { + "epoch": 10.412637008381688, + "grad_norm": 2.7869393825531006, + "learning_rate": 1.5144013321504925e-05, + "loss": 0.5703, + "step": 452200 + }, + { + "epoch": 10.417242332135949, + "grad_norm": 3.2392425537109375, + "learning_rate": 1.5136741757682409e-05, + "loss": 0.5604, + "step": 452400 + }, + { + "epoch": 10.42184765589021, + "grad_norm": 3.0997209548950195, + "learning_rate": 1.5129470193859893e-05, + "loss": 0.5643, + "step": 452600 + }, + { + "epoch": 10.42645297964447, + "grad_norm": 2.82871150970459, + "learning_rate": 1.5122198630037375e-05, + "loss": 0.5727, + "step": 452800 + }, + { + "epoch": 10.431058303398729, + "grad_norm": 3.5015945434570312, + "learning_rate": 1.511492706621486e-05, + "loss": 0.566, + "step": 453000 + }, + { + "epoch": 10.435663627152989, + "grad_norm": 2.741576671600342, + "learning_rate": 1.5107655502392344e-05, + "loss": 0.5739, + "step": 453200 + }, + { + "epoch": 10.44026895090725, + "grad_norm": 3.6131269931793213, + "learning_rate": 1.510038393856983e-05, + "loss": 0.5577, + "step": 453400 + }, + { + "epoch": 10.444874274661508, + "grad_norm": 3.369382858276367, + "learning_rate": 1.5093112374747314e-05, + "loss": 0.5676, + "step": 453600 + }, + { + "epoch": 10.449479598415769, + "grad_norm": 2.8960883617401123, + "learning_rate": 1.50858408109248e-05, + "loss": 0.5564, + "step": 453800 + }, + { + "epoch": 10.45408492217003, + "grad_norm": 3.1670303344726562, + "learning_rate": 1.507856924710228e-05, + "loss": 0.5777, + "step": 454000 + }, + { + "epoch": 10.458690245924288, + "grad_norm": 2.9549036026000977, + "learning_rate": 1.5071297683279765e-05, + "loss": 0.5686, + "step": 454200 + }, + { + "epoch": 10.463295569678548, + "grad_norm": 3.2000997066497803, + "learning_rate": 1.5064026119457251e-05, + "loss": 0.5699, + "step": 454400 + }, + { + "epoch": 10.467900893432809, + "grad_norm": 3.4854366779327393, + "learning_rate": 1.5056754555634735e-05, + "loss": 0.5676, + "step": 454600 + }, + { + "epoch": 10.472506217187068, + "grad_norm": 3.309793710708618, + "learning_rate": 1.504948299181222e-05, + "loss": 0.5569, + "step": 454800 + }, + { + "epoch": 10.477111540941328, + "grad_norm": 2.8395392894744873, + "learning_rate": 1.5042211427989705e-05, + "loss": 0.569, + "step": 455000 + }, + { + "epoch": 10.481716864695588, + "grad_norm": 3.1417343616485596, + "learning_rate": 1.503493986416719e-05, + "loss": 0.5696, + "step": 455200 + }, + { + "epoch": 10.486322188449847, + "grad_norm": 3.1475086212158203, + "learning_rate": 1.5027668300344672e-05, + "loss": 0.5647, + "step": 455400 + }, + { + "epoch": 10.490927512204108, + "grad_norm": 3.277684450149536, + "learning_rate": 1.5020396736522156e-05, + "loss": 0.58, + "step": 455600 + }, + { + "epoch": 10.495532835958368, + "grad_norm": 3.4847934246063232, + "learning_rate": 1.501312517269964e-05, + "loss": 0.5613, + "step": 455800 + }, + { + "epoch": 10.500138159712627, + "grad_norm": 2.999443292617798, + "learning_rate": 1.5005853608877126e-05, + "loss": 0.5617, + "step": 456000 + }, + { + "epoch": 10.504743483466887, + "grad_norm": 2.7356905937194824, + "learning_rate": 1.4998582045054609e-05, + "loss": 0.5679, + "step": 456200 + }, + { + "epoch": 10.509348807221148, + "grad_norm": 3.4266111850738525, + "learning_rate": 1.4991310481232093e-05, + "loss": 0.5498, + "step": 456400 + }, + { + "epoch": 10.513954130975408, + "grad_norm": 3.2002484798431396, + "learning_rate": 1.4984038917409579e-05, + "loss": 0.5548, + "step": 456600 + }, + { + "epoch": 10.518559454729667, + "grad_norm": 2.8026347160339355, + "learning_rate": 1.4976767353587063e-05, + "loss": 0.5744, + "step": 456800 + }, + { + "epoch": 10.523164778483928, + "grad_norm": 3.1921815872192383, + "learning_rate": 1.4969495789764547e-05, + "loss": 0.5578, + "step": 457000 + }, + { + "epoch": 10.527770102238188, + "grad_norm": 3.4548532962799072, + "learning_rate": 1.4962224225942032e-05, + "loss": 0.5696, + "step": 457200 + }, + { + "epoch": 10.532375425992447, + "grad_norm": 2.9152286052703857, + "learning_rate": 1.4954952662119516e-05, + "loss": 0.5592, + "step": 457400 + }, + { + "epoch": 10.536980749746707, + "grad_norm": 2.514460802078247, + "learning_rate": 1.4947681098297e-05, + "loss": 0.5629, + "step": 457600 + }, + { + "epoch": 10.541586073500968, + "grad_norm": 3.525857448577881, + "learning_rate": 1.4940409534474484e-05, + "loss": 0.5682, + "step": 457800 + }, + { + "epoch": 10.546191397255226, + "grad_norm": 3.7510156631469727, + "learning_rate": 1.4933137970651969e-05, + "loss": 0.5724, + "step": 458000 + }, + { + "epoch": 10.550796721009487, + "grad_norm": 3.3571460247039795, + "learning_rate": 1.4925902764648565e-05, + "loss": 0.568, + "step": 458200 + }, + { + "epoch": 10.555402044763747, + "grad_norm": 2.8560373783111572, + "learning_rate": 1.491863120082605e-05, + "loss": 0.5629, + "step": 458400 + }, + { + "epoch": 10.560007368518006, + "grad_norm": 3.2522172927856445, + "learning_rate": 1.4911359637003534e-05, + "loss": 0.5643, + "step": 458600 + }, + { + "epoch": 10.564612692272267, + "grad_norm": 3.196812152862549, + "learning_rate": 1.490408807318102e-05, + "loss": 0.5603, + "step": 458800 + }, + { + "epoch": 10.569218016026527, + "grad_norm": 3.4622724056243896, + "learning_rate": 1.4896816509358502e-05, + "loss": 0.5625, + "step": 459000 + }, + { + "epoch": 10.573823339780787, + "grad_norm": 2.8027701377868652, + "learning_rate": 1.4889544945535987e-05, + "loss": 0.5714, + "step": 459200 + }, + { + "epoch": 10.578428663535046, + "grad_norm": 2.9170937538146973, + "learning_rate": 1.4882273381713472e-05, + "loss": 0.5723, + "step": 459400 + }, + { + "epoch": 10.583033987289307, + "grad_norm": 3.9101593494415283, + "learning_rate": 1.4875001817890955e-05, + "loss": 0.5641, + "step": 459600 + }, + { + "epoch": 10.587639311043567, + "grad_norm": 3.0541505813598633, + "learning_rate": 1.4867730254068441e-05, + "loss": 0.5576, + "step": 459800 + }, + { + "epoch": 10.592244634797826, + "grad_norm": 2.2873919010162354, + "learning_rate": 1.4860458690245925e-05, + "loss": 0.5682, + "step": 460000 + }, + { + "epoch": 10.596849958552086, + "grad_norm": 4.237215518951416, + "learning_rate": 1.485318712642341e-05, + "loss": 0.5709, + "step": 460200 + }, + { + "epoch": 10.601455282306347, + "grad_norm": 3.327172040939331, + "learning_rate": 1.4845915562600894e-05, + "loss": 0.5712, + "step": 460400 + }, + { + "epoch": 10.606060606060606, + "grad_norm": 3.6685967445373535, + "learning_rate": 1.4838643998778378e-05, + "loss": 0.5654, + "step": 460600 + }, + { + "epoch": 10.610665929814866, + "grad_norm": 3.0738847255706787, + "learning_rate": 1.4831372434955862e-05, + "loss": 0.5676, + "step": 460800 + }, + { + "epoch": 10.615271253569126, + "grad_norm": 2.7393765449523926, + "learning_rate": 1.4824100871133346e-05, + "loss": 0.5645, + "step": 461000 + }, + { + "epoch": 10.619876577323385, + "grad_norm": 2.6728312969207764, + "learning_rate": 1.481682930731083e-05, + "loss": 0.5598, + "step": 461200 + }, + { + "epoch": 10.624481901077646, + "grad_norm": 2.915443181991577, + "learning_rate": 1.4809557743488316e-05, + "loss": 0.5659, + "step": 461400 + }, + { + "epoch": 10.629087224831906, + "grad_norm": 3.617243528366089, + "learning_rate": 1.4802286179665799e-05, + "loss": 0.5706, + "step": 461600 + }, + { + "epoch": 10.633692548586165, + "grad_norm": 3.1753506660461426, + "learning_rate": 1.4795014615843283e-05, + "loss": 0.5551, + "step": 461800 + }, + { + "epoch": 10.638297872340425, + "grad_norm": 3.585939407348633, + "learning_rate": 1.4787743052020769e-05, + "loss": 0.5634, + "step": 462000 + }, + { + "epoch": 10.642903196094686, + "grad_norm": 2.8085319995880127, + "learning_rate": 1.4780471488198252e-05, + "loss": 0.572, + "step": 462200 + }, + { + "epoch": 10.647508519848945, + "grad_norm": 3.3687798976898193, + "learning_rate": 1.4773236282194848e-05, + "loss": 0.5583, + "step": 462400 + }, + { + "epoch": 10.652113843603205, + "grad_norm": 3.3423590660095215, + "learning_rate": 1.4765964718372333e-05, + "loss": 0.572, + "step": 462600 + }, + { + "epoch": 10.656719167357465, + "grad_norm": 2.939920425415039, + "learning_rate": 1.4758693154549819e-05, + "loss": 0.5757, + "step": 462800 + }, + { + "epoch": 10.661324491111726, + "grad_norm": 3.0086398124694824, + "learning_rate": 1.4751457948546416e-05, + "loss": 0.5608, + "step": 463000 + }, + { + "epoch": 10.665929814865985, + "grad_norm": 3.477020740509033, + "learning_rate": 1.4744186384723898e-05, + "loss": 0.5704, + "step": 463200 + }, + { + "epoch": 10.670535138620245, + "grad_norm": 3.078016996383667, + "learning_rate": 1.4736914820901384e-05, + "loss": 0.5596, + "step": 463400 + }, + { + "epoch": 10.675140462374506, + "grad_norm": 3.607464075088501, + "learning_rate": 1.4729643257078868e-05, + "loss": 0.5584, + "step": 463600 + }, + { + "epoch": 10.679745786128764, + "grad_norm": 2.997453212738037, + "learning_rate": 1.4722371693256352e-05, + "loss": 0.5566, + "step": 463800 + }, + { + "epoch": 10.684351109883025, + "grad_norm": 2.907045364379883, + "learning_rate": 1.4715100129433837e-05, + "loss": 0.5699, + "step": 464000 + }, + { + "epoch": 10.688956433637285, + "grad_norm": 2.692117929458618, + "learning_rate": 1.470782856561132e-05, + "loss": 0.5607, + "step": 464200 + }, + { + "epoch": 10.693561757391544, + "grad_norm": 3.213514804840088, + "learning_rate": 1.4700557001788805e-05, + "loss": 0.5622, + "step": 464400 + }, + { + "epoch": 10.698167081145805, + "grad_norm": 2.8752119541168213, + "learning_rate": 1.469328543796629e-05, + "loss": 0.5759, + "step": 464600 + }, + { + "epoch": 10.702772404900065, + "grad_norm": 2.7570505142211914, + "learning_rate": 1.4686013874143773e-05, + "loss": 0.5732, + "step": 464800 + }, + { + "epoch": 10.707377728654324, + "grad_norm": 2.800178050994873, + "learning_rate": 1.467877866814037e-05, + "loss": 0.5693, + "step": 465000 + }, + { + "epoch": 10.711983052408584, + "grad_norm": 2.608539342880249, + "learning_rate": 1.4671507104317856e-05, + "loss": 0.5683, + "step": 465200 + }, + { + "epoch": 10.716588376162845, + "grad_norm": 3.1758687496185303, + "learning_rate": 1.4664235540495339e-05, + "loss": 0.571, + "step": 465400 + }, + { + "epoch": 10.721193699917103, + "grad_norm": 2.868126392364502, + "learning_rate": 1.4656963976672823e-05, + "loss": 0.5706, + "step": 465600 + }, + { + "epoch": 10.725799023671364, + "grad_norm": 3.5965654850006104, + "learning_rate": 1.4649692412850309e-05, + "loss": 0.5794, + "step": 465800 + }, + { + "epoch": 10.730404347425624, + "grad_norm": 3.169706106185913, + "learning_rate": 1.4642420849027791e-05, + "loss": 0.5607, + "step": 466000 + }, + { + "epoch": 10.735009671179885, + "grad_norm": 3.449915647506714, + "learning_rate": 1.4635149285205276e-05, + "loss": 0.5568, + "step": 466200 + }, + { + "epoch": 10.739614994934144, + "grad_norm": 2.98413348197937, + "learning_rate": 1.4627877721382762e-05, + "loss": 0.5568, + "step": 466400 + }, + { + "epoch": 10.744220318688404, + "grad_norm": 3.158750534057617, + "learning_rate": 1.4620606157560244e-05, + "loss": 0.5663, + "step": 466600 + }, + { + "epoch": 10.748825642442664, + "grad_norm": 3.3182640075683594, + "learning_rate": 1.461333459373773e-05, + "loss": 0.547, + "step": 466800 + }, + { + "epoch": 10.753430966196923, + "grad_norm": 2.8168752193450928, + "learning_rate": 1.4606063029915214e-05, + "loss": 0.5632, + "step": 467000 + }, + { + "epoch": 10.758036289951184, + "grad_norm": 2.551912546157837, + "learning_rate": 1.4598791466092697e-05, + "loss": 0.564, + "step": 467200 + }, + { + "epoch": 10.762641613705444, + "grad_norm": 3.1398086547851562, + "learning_rate": 1.4591519902270183e-05, + "loss": 0.5656, + "step": 467400 + }, + { + "epoch": 10.767246937459703, + "grad_norm": 2.806156873703003, + "learning_rate": 1.4584248338447667e-05, + "loss": 0.5712, + "step": 467600 + }, + { + "epoch": 10.771852261213963, + "grad_norm": 4.065160751342773, + "learning_rate": 1.4576976774625151e-05, + "loss": 0.5717, + "step": 467800 + }, + { + "epoch": 10.776457584968224, + "grad_norm": 2.9523870944976807, + "learning_rate": 1.4569705210802635e-05, + "loss": 0.5639, + "step": 468000 + }, + { + "epoch": 10.781062908722483, + "grad_norm": 2.7149343490600586, + "learning_rate": 1.456243364698012e-05, + "loss": 0.5587, + "step": 468200 + }, + { + "epoch": 10.785668232476743, + "grad_norm": 3.6111390590667725, + "learning_rate": 1.4555162083157606e-05, + "loss": 0.5648, + "step": 468400 + }, + { + "epoch": 10.790273556231003, + "grad_norm": 3.606182098388672, + "learning_rate": 1.4547890519335088e-05, + "loss": 0.5726, + "step": 468600 + }, + { + "epoch": 10.794878879985262, + "grad_norm": 2.920095443725586, + "learning_rate": 1.4540618955512572e-05, + "loss": 0.5633, + "step": 468800 + }, + { + "epoch": 10.799484203739523, + "grad_norm": 3.206716299057007, + "learning_rate": 1.4533347391690058e-05, + "loss": 0.5613, + "step": 469000 + }, + { + "epoch": 10.804089527493783, + "grad_norm": 2.6802709102630615, + "learning_rate": 1.452607582786754e-05, + "loss": 0.5627, + "step": 469200 + }, + { + "epoch": 10.808694851248042, + "grad_norm": 2.581068754196167, + "learning_rate": 1.4518804264045027e-05, + "loss": 0.5645, + "step": 469400 + }, + { + "epoch": 10.813300175002302, + "grad_norm": 2.9014952182769775, + "learning_rate": 1.451153270022251e-05, + "loss": 0.5717, + "step": 469600 + }, + { + "epoch": 10.817905498756563, + "grad_norm": 3.2331104278564453, + "learning_rate": 1.4504261136399993e-05, + "loss": 0.5561, + "step": 469800 + }, + { + "epoch": 10.822510822510823, + "grad_norm": 3.1601359844207764, + "learning_rate": 1.449698957257748e-05, + "loss": 0.5606, + "step": 470000 + }, + { + "epoch": 10.827116146265082, + "grad_norm": 2.7113850116729736, + "learning_rate": 1.4489718008754963e-05, + "loss": 0.5678, + "step": 470200 + }, + { + "epoch": 10.831721470019342, + "grad_norm": 3.2414093017578125, + "learning_rate": 1.4482446444932446e-05, + "loss": 0.5614, + "step": 470400 + }, + { + "epoch": 10.836326793773603, + "grad_norm": 2.453719139099121, + "learning_rate": 1.4475174881109932e-05, + "loss": 0.5608, + "step": 470600 + }, + { + "epoch": 10.840932117527862, + "grad_norm": 3.3981575965881348, + "learning_rate": 1.4467903317287416e-05, + "loss": 0.5644, + "step": 470800 + }, + { + "epoch": 10.845537441282122, + "grad_norm": 3.9584078788757324, + "learning_rate": 1.4460631753464902e-05, + "loss": 0.5649, + "step": 471000 + }, + { + "epoch": 10.850142765036383, + "grad_norm": 2.609250545501709, + "learning_rate": 1.4453360189642385e-05, + "loss": 0.5737, + "step": 471200 + }, + { + "epoch": 10.854748088790641, + "grad_norm": 2.888627767562866, + "learning_rate": 1.4446088625819869e-05, + "loss": 0.5763, + "step": 471400 + }, + { + "epoch": 10.859353412544902, + "grad_norm": 3.1995785236358643, + "learning_rate": 1.4438817061997355e-05, + "loss": 0.5639, + "step": 471600 + }, + { + "epoch": 10.863958736299162, + "grad_norm": 3.3186826705932617, + "learning_rate": 1.4431545498174837e-05, + "loss": 0.5643, + "step": 471800 + }, + { + "epoch": 10.868564060053421, + "grad_norm": 3.200171709060669, + "learning_rate": 1.4424310292171434e-05, + "loss": 0.5615, + "step": 472000 + }, + { + "epoch": 10.873169383807681, + "grad_norm": 3.0748026371002197, + "learning_rate": 1.4417038728348918e-05, + "loss": 0.5585, + "step": 472200 + }, + { + "epoch": 10.877774707561942, + "grad_norm": 2.5233187675476074, + "learning_rate": 1.4409767164526404e-05, + "loss": 0.5693, + "step": 472400 + }, + { + "epoch": 10.882380031316202, + "grad_norm": 3.0431416034698486, + "learning_rate": 1.4402531958523001e-05, + "loss": 0.5627, + "step": 472600 + }, + { + "epoch": 10.886985355070461, + "grad_norm": 2.8230044841766357, + "learning_rate": 1.4395260394700484e-05, + "loss": 0.5725, + "step": 472800 + }, + { + "epoch": 10.891590678824722, + "grad_norm": 2.9966495037078857, + "learning_rate": 1.438798883087797e-05, + "loss": 0.566, + "step": 473000 + }, + { + "epoch": 10.896196002578982, + "grad_norm": 3.4623706340789795, + "learning_rate": 1.4380717267055454e-05, + "loss": 0.5664, + "step": 473200 + }, + { + "epoch": 10.90080132633324, + "grad_norm": 2.6742422580718994, + "learning_rate": 1.4373445703232936e-05, + "loss": 0.5632, + "step": 473400 + }, + { + "epoch": 10.905406650087501, + "grad_norm": 3.300477981567383, + "learning_rate": 1.4366174139410422e-05, + "loss": 0.5797, + "step": 473600 + }, + { + "epoch": 10.910011973841762, + "grad_norm": 2.9274837970733643, + "learning_rate": 1.4358902575587907e-05, + "loss": 0.5604, + "step": 473800 + }, + { + "epoch": 10.91461729759602, + "grad_norm": 2.8111343383789062, + "learning_rate": 1.4351631011765389e-05, + "loss": 0.5643, + "step": 474000 + }, + { + "epoch": 10.919222621350281, + "grad_norm": 2.678849458694458, + "learning_rate": 1.4344359447942875e-05, + "loss": 0.5646, + "step": 474200 + }, + { + "epoch": 10.923827945104541, + "grad_norm": 2.9185190200805664, + "learning_rate": 1.433708788412036e-05, + "loss": 0.5691, + "step": 474400 + }, + { + "epoch": 10.9284332688588, + "grad_norm": 3.45511794090271, + "learning_rate": 1.4329852678116956e-05, + "loss": 0.568, + "step": 474600 + }, + { + "epoch": 10.93303859261306, + "grad_norm": 3.163696527481079, + "learning_rate": 1.4322617472113553e-05, + "loss": 0.551, + "step": 474800 + }, + { + "epoch": 10.937643916367321, + "grad_norm": 3.591355323791504, + "learning_rate": 1.4315345908291039e-05, + "loss": 0.5525, + "step": 475000 + }, + { + "epoch": 10.94224924012158, + "grad_norm": 3.392062187194824, + "learning_rate": 1.4308074344468521e-05, + "loss": 0.562, + "step": 475200 + }, + { + "epoch": 10.94685456387584, + "grad_norm": 3.2929904460906982, + "learning_rate": 1.4300802780646006e-05, + "loss": 0.5711, + "step": 475400 + }, + { + "epoch": 10.9514598876301, + "grad_norm": 3.0882444381713867, + "learning_rate": 1.4293531216823492e-05, + "loss": 0.5573, + "step": 475600 + }, + { + "epoch": 10.95606521138436, + "grad_norm": 3.073636054992676, + "learning_rate": 1.4286259653000974e-05, + "loss": 0.5738, + "step": 475800 + }, + { + "epoch": 10.96067053513862, + "grad_norm": 3.676265001296997, + "learning_rate": 1.4278988089178458e-05, + "loss": 0.5551, + "step": 476000 + }, + { + "epoch": 10.96527585889288, + "grad_norm": 3.5196962356567383, + "learning_rate": 1.4271716525355944e-05, + "loss": 0.565, + "step": 476200 + }, + { + "epoch": 10.96988118264714, + "grad_norm": 3.1859378814697266, + "learning_rate": 1.4264444961533427e-05, + "loss": 0.565, + "step": 476400 + }, + { + "epoch": 10.9744865064014, + "grad_norm": 2.839648962020874, + "learning_rate": 1.4257173397710913e-05, + "loss": 0.5656, + "step": 476600 + }, + { + "epoch": 10.97909183015566, + "grad_norm": 3.086836338043213, + "learning_rate": 1.4249901833888397e-05, + "loss": 0.5653, + "step": 476800 + }, + { + "epoch": 10.98369715390992, + "grad_norm": 3.1683554649353027, + "learning_rate": 1.424263027006588e-05, + "loss": 0.5679, + "step": 477000 + }, + { + "epoch": 10.98830247766418, + "grad_norm": 3.869631052017212, + "learning_rate": 1.4235358706243365e-05, + "loss": 0.5605, + "step": 477200 + }, + { + "epoch": 10.99290780141844, + "grad_norm": 3.0082695484161377, + "learning_rate": 1.422808714242085e-05, + "loss": 0.5558, + "step": 477400 + }, + { + "epoch": 10.9975131251727, + "grad_norm": 2.398848295211792, + "learning_rate": 1.4220815578598334e-05, + "loss": 0.5681, + "step": 477600 + }, + { + "epoch": 11.0, + "eval_loss": 0.5423869490623474, + "eval_runtime": 166.7579, + "eval_samples_per_second": 170.073, + "eval_steps_per_second": 10.632, + "step": 477708 + }, + { + "epoch": 11.002118448926959, + "grad_norm": 3.4873993396759033, + "learning_rate": 1.4213544014775818e-05, + "loss": 0.5714, + "step": 477800 + }, + { + "epoch": 11.00672377268122, + "grad_norm": 2.91593337059021, + "learning_rate": 1.4206272450953302e-05, + "loss": 0.558, + "step": 478000 + }, + { + "epoch": 11.01132909643548, + "grad_norm": 3.278059959411621, + "learning_rate": 1.4199037244949899e-05, + "loss": 0.5672, + "step": 478200 + }, + { + "epoch": 11.015934420189739, + "grad_norm": 2.8523497581481934, + "learning_rate": 1.4191765681127385e-05, + "loss": 0.5712, + "step": 478400 + }, + { + "epoch": 11.020539743944, + "grad_norm": 2.9555845260620117, + "learning_rate": 1.4184494117304868e-05, + "loss": 0.5571, + "step": 478600 + }, + { + "epoch": 11.02514506769826, + "grad_norm": 3.0521042346954346, + "learning_rate": 1.4177222553482352e-05, + "loss": 0.5565, + "step": 478800 + }, + { + "epoch": 11.029750391452518, + "grad_norm": 3.1665375232696533, + "learning_rate": 1.4169950989659838e-05, + "loss": 0.5573, + "step": 479000 + }, + { + "epoch": 11.034355715206779, + "grad_norm": 2.929694890975952, + "learning_rate": 1.416267942583732e-05, + "loss": 0.5592, + "step": 479200 + }, + { + "epoch": 11.03896103896104, + "grad_norm": 3.597895860671997, + "learning_rate": 1.4155407862014806e-05, + "loss": 0.5658, + "step": 479400 + }, + { + "epoch": 11.043566362715298, + "grad_norm": 2.8076977729797363, + "learning_rate": 1.414813629819229e-05, + "loss": 0.554, + "step": 479600 + }, + { + "epoch": 11.048171686469558, + "grad_norm": 3.267301559448242, + "learning_rate": 1.4140864734369773e-05, + "loss": 0.568, + "step": 479800 + }, + { + "epoch": 11.052777010223819, + "grad_norm": 3.1478617191314697, + "learning_rate": 1.4133593170547259e-05, + "loss": 0.5702, + "step": 480000 + }, + { + "epoch": 11.05738233397808, + "grad_norm": 2.9337422847747803, + "learning_rate": 1.4126321606724743e-05, + "loss": 0.5565, + "step": 480200 + }, + { + "epoch": 11.061987657732338, + "grad_norm": 2.9447715282440186, + "learning_rate": 1.4119050042902226e-05, + "loss": 0.5574, + "step": 480400 + }, + { + "epoch": 11.066592981486599, + "grad_norm": 3.1373238563537598, + "learning_rate": 1.4111778479079711e-05, + "loss": 0.5539, + "step": 480600 + }, + { + "epoch": 11.07119830524086, + "grad_norm": 3.812917947769165, + "learning_rate": 1.4104506915257196e-05, + "loss": 0.5531, + "step": 480800 + }, + { + "epoch": 11.075803628995118, + "grad_norm": 2.356658935546875, + "learning_rate": 1.409723535143468e-05, + "loss": 0.5652, + "step": 481000 + }, + { + "epoch": 11.080408952749378, + "grad_norm": 3.141037940979004, + "learning_rate": 1.4089963787612164e-05, + "loss": 0.552, + "step": 481200 + }, + { + "epoch": 11.085014276503639, + "grad_norm": 3.6183485984802246, + "learning_rate": 1.4082692223789648e-05, + "loss": 0.5601, + "step": 481400 + }, + { + "epoch": 11.089619600257898, + "grad_norm": 2.416588306427002, + "learning_rate": 1.4075420659967134e-05, + "loss": 0.5571, + "step": 481600 + }, + { + "epoch": 11.094224924012158, + "grad_norm": 3.343844175338745, + "learning_rate": 1.4068149096144617e-05, + "loss": 0.5558, + "step": 481800 + }, + { + "epoch": 11.098830247766418, + "grad_norm": 2.7808258533477783, + "learning_rate": 1.4060877532322101e-05, + "loss": 0.5705, + "step": 482000 + }, + { + "epoch": 11.103435571520677, + "grad_norm": 2.9986512660980225, + "learning_rate": 1.4053605968499587e-05, + "loss": 0.5589, + "step": 482200 + }, + { + "epoch": 11.108040895274938, + "grad_norm": 3.053314685821533, + "learning_rate": 1.404633440467707e-05, + "loss": 0.5586, + "step": 482400 + }, + { + "epoch": 11.112646219029198, + "grad_norm": 2.5446858406066895, + "learning_rate": 1.4039062840854555e-05, + "loss": 0.5498, + "step": 482600 + }, + { + "epoch": 11.117251542783457, + "grad_norm": 3.446873664855957, + "learning_rate": 1.403179127703204e-05, + "loss": 0.5492, + "step": 482800 + }, + { + "epoch": 11.121856866537717, + "grad_norm": 2.8828375339508057, + "learning_rate": 1.4024519713209522e-05, + "loss": 0.5557, + "step": 483000 + }, + { + "epoch": 11.126462190291978, + "grad_norm": 2.941978693008423, + "learning_rate": 1.4017248149387008e-05, + "loss": 0.5594, + "step": 483200 + }, + { + "epoch": 11.131067514046238, + "grad_norm": 3.2115888595581055, + "learning_rate": 1.4009976585564492e-05, + "loss": 0.5628, + "step": 483400 + }, + { + "epoch": 11.135672837800497, + "grad_norm": 3.323775291442871, + "learning_rate": 1.4002705021741975e-05, + "loss": 0.5574, + "step": 483600 + }, + { + "epoch": 11.140278161554757, + "grad_norm": 3.129638195037842, + "learning_rate": 1.399543345791946e-05, + "loss": 0.5684, + "step": 483800 + }, + { + "epoch": 11.144883485309018, + "grad_norm": 3.986924886703491, + "learning_rate": 1.3988161894096945e-05, + "loss": 0.5628, + "step": 484000 + }, + { + "epoch": 11.149488809063277, + "grad_norm": 3.6282601356506348, + "learning_rate": 1.3980890330274429e-05, + "loss": 0.563, + "step": 484200 + }, + { + "epoch": 11.154094132817537, + "grad_norm": 3.6448795795440674, + "learning_rate": 1.3973618766451913e-05, + "loss": 0.5604, + "step": 484400 + }, + { + "epoch": 11.158699456571798, + "grad_norm": 4.1878662109375, + "learning_rate": 1.3966347202629397e-05, + "loss": 0.5612, + "step": 484600 + }, + { + "epoch": 11.163304780326056, + "grad_norm": 3.1330618858337402, + "learning_rate": 1.3959075638806883e-05, + "loss": 0.5677, + "step": 484800 + }, + { + "epoch": 11.167910104080317, + "grad_norm": 3.0046327114105225, + "learning_rate": 1.3951804074984366e-05, + "loss": 0.5619, + "step": 485000 + }, + { + "epoch": 11.172515427834577, + "grad_norm": 2.964937925338745, + "learning_rate": 1.3944532511161852e-05, + "loss": 0.5613, + "step": 485200 + }, + { + "epoch": 11.177120751588836, + "grad_norm": 3.570549249649048, + "learning_rate": 1.3937260947339336e-05, + "loss": 0.556, + "step": 485400 + }, + { + "epoch": 11.181726075343096, + "grad_norm": 2.8079946041107178, + "learning_rate": 1.3930025741335933e-05, + "loss": 0.5513, + "step": 485600 + }, + { + "epoch": 11.186331399097357, + "grad_norm": 2.9145305156707764, + "learning_rate": 1.3922754177513416e-05, + "loss": 0.5556, + "step": 485800 + }, + { + "epoch": 11.190936722851616, + "grad_norm": 3.74453067779541, + "learning_rate": 1.3915482613690901e-05, + "loss": 0.5637, + "step": 486000 + }, + { + "epoch": 11.195542046605876, + "grad_norm": 3.717133045196533, + "learning_rate": 1.3908211049868386e-05, + "loss": 0.5556, + "step": 486200 + }, + { + "epoch": 11.200147370360137, + "grad_norm": 3.242644786834717, + "learning_rate": 1.3900939486045868e-05, + "loss": 0.5659, + "step": 486400 + }, + { + "epoch": 11.204752694114397, + "grad_norm": 3.173064708709717, + "learning_rate": 1.3893667922223354e-05, + "loss": 0.5541, + "step": 486600 + }, + { + "epoch": 11.209358017868656, + "grad_norm": 2.6633315086364746, + "learning_rate": 1.3886396358400838e-05, + "loss": 0.575, + "step": 486800 + }, + { + "epoch": 11.213963341622916, + "grad_norm": 3.097576856613159, + "learning_rate": 1.3879124794578323e-05, + "loss": 0.5574, + "step": 487000 + }, + { + "epoch": 11.218568665377177, + "grad_norm": 3.0718281269073486, + "learning_rate": 1.3871853230755807e-05, + "loss": 0.5525, + "step": 487200 + }, + { + "epoch": 11.223173989131435, + "grad_norm": 4.379641056060791, + "learning_rate": 1.3864581666933291e-05, + "loss": 0.5663, + "step": 487400 + }, + { + "epoch": 11.227779312885696, + "grad_norm": 2.682002305984497, + "learning_rate": 1.3857310103110775e-05, + "loss": 0.5677, + "step": 487600 + }, + { + "epoch": 11.232384636639956, + "grad_norm": 3.9355664253234863, + "learning_rate": 1.385003853928826e-05, + "loss": 0.5645, + "step": 487800 + }, + { + "epoch": 11.236989960394215, + "grad_norm": 3.0530924797058105, + "learning_rate": 1.3842766975465744e-05, + "loss": 0.5584, + "step": 488000 + }, + { + "epoch": 11.241595284148476, + "grad_norm": 3.311056613922119, + "learning_rate": 1.383549541164323e-05, + "loss": 0.5569, + "step": 488200 + }, + { + "epoch": 11.246200607902736, + "grad_norm": 3.7261691093444824, + "learning_rate": 1.3828260205639826e-05, + "loss": 0.5502, + "step": 488400 + }, + { + "epoch": 11.250805931656995, + "grad_norm": 2.7717695236206055, + "learning_rate": 1.3820988641817309e-05, + "loss": 0.566, + "step": 488600 + }, + { + "epoch": 11.255411255411255, + "grad_norm": 2.931807041168213, + "learning_rate": 1.3813717077994795e-05, + "loss": 0.5555, + "step": 488800 + }, + { + "epoch": 11.260016579165516, + "grad_norm": 3.2359492778778076, + "learning_rate": 1.3806445514172279e-05, + "loss": 0.5608, + "step": 489000 + }, + { + "epoch": 11.264621902919774, + "grad_norm": 2.8134891986846924, + "learning_rate": 1.3799173950349762e-05, + "loss": 0.5694, + "step": 489200 + }, + { + "epoch": 11.269227226674035, + "grad_norm": 3.166933059692383, + "learning_rate": 1.3791902386527248e-05, + "loss": 0.554, + "step": 489400 + }, + { + "epoch": 11.273832550428295, + "grad_norm": 3.2484281063079834, + "learning_rate": 1.3784630822704732e-05, + "loss": 0.5577, + "step": 489600 + }, + { + "epoch": 11.278437874182554, + "grad_norm": 3.1392292976379395, + "learning_rate": 1.3777359258882214e-05, + "loss": 0.5552, + "step": 489800 + }, + { + "epoch": 11.283043197936815, + "grad_norm": 3.2946557998657227, + "learning_rate": 1.37700876950597e-05, + "loss": 0.5578, + "step": 490000 + }, + { + "epoch": 11.287648521691075, + "grad_norm": 3.325127124786377, + "learning_rate": 1.3762816131237184e-05, + "loss": 0.546, + "step": 490200 + }, + { + "epoch": 11.292253845445336, + "grad_norm": 2.8645405769348145, + "learning_rate": 1.3755544567414669e-05, + "loss": 0.5588, + "step": 490400 + }, + { + "epoch": 11.296859169199594, + "grad_norm": 4.306154727935791, + "learning_rate": 1.3748273003592153e-05, + "loss": 0.5669, + "step": 490600 + }, + { + "epoch": 11.301464492953855, + "grad_norm": 3.181588649749756, + "learning_rate": 1.3741001439769637e-05, + "loss": 0.5579, + "step": 490800 + }, + { + "epoch": 11.306069816708115, + "grad_norm": 2.690722942352295, + "learning_rate": 1.3733729875947121e-05, + "loss": 0.5648, + "step": 491000 + }, + { + "epoch": 11.310675140462374, + "grad_norm": 3.0725932121276855, + "learning_rate": 1.3726458312124606e-05, + "loss": 0.552, + "step": 491200 + }, + { + "epoch": 11.315280464216634, + "grad_norm": 3.268697500228882, + "learning_rate": 1.371918674830209e-05, + "loss": 0.5616, + "step": 491400 + }, + { + "epoch": 11.319885787970895, + "grad_norm": 3.456531524658203, + "learning_rate": 1.3711915184479574e-05, + "loss": 0.5647, + "step": 491600 + }, + { + "epoch": 11.324491111725154, + "grad_norm": 2.9139323234558105, + "learning_rate": 1.3704643620657058e-05, + "loss": 0.5547, + "step": 491800 + }, + { + "epoch": 11.329096435479414, + "grad_norm": 3.0424976348876953, + "learning_rate": 1.3697372056834544e-05, + "loss": 0.5564, + "step": 492000 + }, + { + "epoch": 11.333701759233675, + "grad_norm": 3.656421661376953, + "learning_rate": 1.3690100493012028e-05, + "loss": 0.5547, + "step": 492200 + }, + { + "epoch": 11.338307082987933, + "grad_norm": 2.804675340652466, + "learning_rate": 1.368282892918951e-05, + "loss": 0.5504, + "step": 492400 + }, + { + "epoch": 11.342912406742194, + "grad_norm": 2.907458543777466, + "learning_rate": 1.3675557365366997e-05, + "loss": 0.5557, + "step": 492600 + }, + { + "epoch": 11.347517730496454, + "grad_norm": 3.368783712387085, + "learning_rate": 1.3668322159363594e-05, + "loss": 0.5629, + "step": 492800 + }, + { + "epoch": 11.352123054250713, + "grad_norm": 2.7372817993164062, + "learning_rate": 1.366108695336019e-05, + "loss": 0.5667, + "step": 493000 + }, + { + "epoch": 11.356728378004973, + "grad_norm": 3.26176381111145, + "learning_rate": 1.3653815389537675e-05, + "loss": 0.5581, + "step": 493200 + }, + { + "epoch": 11.361333701759234, + "grad_norm": 3.8855667114257812, + "learning_rate": 1.3646543825715157e-05, + "loss": 0.5589, + "step": 493400 + }, + { + "epoch": 11.365939025513494, + "grad_norm": 2.7255947589874268, + "learning_rate": 1.3639272261892643e-05, + "loss": 0.5483, + "step": 493600 + }, + { + "epoch": 11.370544349267753, + "grad_norm": 3.3710615634918213, + "learning_rate": 1.3632000698070127e-05, + "loss": 0.5443, + "step": 493800 + }, + { + "epoch": 11.375149673022014, + "grad_norm": 3.4504647254943848, + "learning_rate": 1.3624729134247612e-05, + "loss": 0.5645, + "step": 494000 + }, + { + "epoch": 11.379754996776274, + "grad_norm": 2.8603811264038086, + "learning_rate": 1.3617493928244209e-05, + "loss": 0.5617, + "step": 494200 + }, + { + "epoch": 11.384360320530533, + "grad_norm": 3.494835376739502, + "learning_rate": 1.3610222364421693e-05, + "loss": 0.5618, + "step": 494400 + }, + { + "epoch": 11.388965644284793, + "grad_norm": 2.891660213470459, + "learning_rate": 1.3602950800599177e-05, + "loss": 0.5602, + "step": 494600 + }, + { + "epoch": 11.393570968039054, + "grad_norm": 3.5556230545043945, + "learning_rate": 1.3595679236776661e-05, + "loss": 0.5636, + "step": 494800 + }, + { + "epoch": 11.398176291793312, + "grad_norm": 2.6204235553741455, + "learning_rate": 1.3588407672954145e-05, + "loss": 0.5588, + "step": 495000 + }, + { + "epoch": 11.402781615547573, + "grad_norm": 3.596064805984497, + "learning_rate": 1.3581136109131631e-05, + "loss": 0.5591, + "step": 495200 + }, + { + "epoch": 11.407386939301833, + "grad_norm": 3.3406505584716797, + "learning_rate": 1.3573864545309114e-05, + "loss": 0.5631, + "step": 495400 + }, + { + "epoch": 11.411992263056092, + "grad_norm": 3.357797145843506, + "learning_rate": 1.3566592981486598e-05, + "loss": 0.5515, + "step": 495600 + }, + { + "epoch": 11.416597586810353, + "grad_norm": 3.499816656112671, + "learning_rate": 1.3559321417664084e-05, + "loss": 0.5618, + "step": 495800 + }, + { + "epoch": 11.421202910564613, + "grad_norm": 2.6165995597839355, + "learning_rate": 1.3552049853841568e-05, + "loss": 0.5569, + "step": 496000 + }, + { + "epoch": 11.425808234318872, + "grad_norm": 2.801710844039917, + "learning_rate": 1.354477829001905e-05, + "loss": 0.5481, + "step": 496200 + }, + { + "epoch": 11.430413558073132, + "grad_norm": 3.2075862884521484, + "learning_rate": 1.3537506726196537e-05, + "loss": 0.5672, + "step": 496400 + }, + { + "epoch": 11.435018881827393, + "grad_norm": 2.7603681087493896, + "learning_rate": 1.3530235162374021e-05, + "loss": 0.5631, + "step": 496600 + }, + { + "epoch": 11.439624205581651, + "grad_norm": 2.9977965354919434, + "learning_rate": 1.3522963598551505e-05, + "loss": 0.5488, + "step": 496800 + }, + { + "epoch": 11.444229529335912, + "grad_norm": 2.803744316101074, + "learning_rate": 1.351569203472899e-05, + "loss": 0.5578, + "step": 497000 + }, + { + "epoch": 11.448834853090172, + "grad_norm": 3.423523187637329, + "learning_rate": 1.3508420470906474e-05, + "loss": 0.563, + "step": 497200 + }, + { + "epoch": 11.453440176844433, + "grad_norm": 3.3576598167419434, + "learning_rate": 1.3501148907083958e-05, + "loss": 0.5555, + "step": 497400 + }, + { + "epoch": 11.458045500598692, + "grad_norm": 3.5315699577331543, + "learning_rate": 1.3493877343261442e-05, + "loss": 0.564, + "step": 497600 + }, + { + "epoch": 11.462650824352952, + "grad_norm": 2.96769642829895, + "learning_rate": 1.3486605779438926e-05, + "loss": 0.5566, + "step": 497800 + }, + { + "epoch": 11.467256148107213, + "grad_norm": 2.768280267715454, + "learning_rate": 1.347933421561641e-05, + "loss": 0.5495, + "step": 498000 + }, + { + "epoch": 11.471861471861471, + "grad_norm": 3.71108078956604, + "learning_rate": 1.3472062651793895e-05, + "loss": 0.561, + "step": 498200 + }, + { + "epoch": 11.476466795615732, + "grad_norm": 3.0207583904266357, + "learning_rate": 1.346479108797138e-05, + "loss": 0.5592, + "step": 498400 + }, + { + "epoch": 11.481072119369992, + "grad_norm": 3.7575089931488037, + "learning_rate": 1.3457519524148863e-05, + "loss": 0.556, + "step": 498600 + }, + { + "epoch": 11.485677443124251, + "grad_norm": 3.050069570541382, + "learning_rate": 1.3450247960326347e-05, + "loss": 0.5618, + "step": 498800 + }, + { + "epoch": 11.490282766878511, + "grad_norm": 3.386132001876831, + "learning_rate": 1.3443012754322944e-05, + "loss": 0.5526, + "step": 499000 + }, + { + "epoch": 11.494888090632772, + "grad_norm": 2.874272584915161, + "learning_rate": 1.343574119050043e-05, + "loss": 0.561, + "step": 499200 + }, + { + "epoch": 11.49949341438703, + "grad_norm": 2.706022262573242, + "learning_rate": 1.3428469626677914e-05, + "loss": 0.5654, + "step": 499400 + }, + { + "epoch": 11.504098738141291, + "grad_norm": 3.2156574726104736, + "learning_rate": 1.3421234420674511e-05, + "loss": 0.561, + "step": 499600 + }, + { + "epoch": 11.508704061895552, + "grad_norm": 2.3544604778289795, + "learning_rate": 1.3413962856851994e-05, + "loss": 0.5725, + "step": 499800 + }, + { + "epoch": 11.513309385649812, + "grad_norm": 3.362711191177368, + "learning_rate": 1.340669129302948e-05, + "loss": 0.5669, + "step": 500000 + }, + { + "epoch": 11.51791470940407, + "grad_norm": 3.2059872150421143, + "learning_rate": 1.3399419729206964e-05, + "loss": 0.5644, + "step": 500200 + }, + { + "epoch": 11.522520033158331, + "grad_norm": 2.8236091136932373, + "learning_rate": 1.3392148165384448e-05, + "loss": 0.5625, + "step": 500400 + }, + { + "epoch": 11.527125356912592, + "grad_norm": 3.4916694164276123, + "learning_rate": 1.3384912959381045e-05, + "loss": 0.5556, + "step": 500600 + }, + { + "epoch": 11.53173068066685, + "grad_norm": 2.976130962371826, + "learning_rate": 1.337764139555853e-05, + "loss": 0.5611, + "step": 500800 + }, + { + "epoch": 11.536336004421111, + "grad_norm": 3.8852763175964355, + "learning_rate": 1.3370369831736013e-05, + "loss": 0.5676, + "step": 501000 + }, + { + "epoch": 11.540941328175371, + "grad_norm": 3.322741985321045, + "learning_rate": 1.3363098267913498e-05, + "loss": 0.5669, + "step": 501200 + }, + { + "epoch": 11.54554665192963, + "grad_norm": 2.873216390609741, + "learning_rate": 1.3355826704090982e-05, + "loss": 0.5711, + "step": 501400 + }, + { + "epoch": 11.55015197568389, + "grad_norm": 2.740243911743164, + "learning_rate": 1.3348555140268466e-05, + "loss": 0.5584, + "step": 501600 + }, + { + "epoch": 11.554757299438151, + "grad_norm": 2.4768307209014893, + "learning_rate": 1.334128357644595e-05, + "loss": 0.5652, + "step": 501800 + }, + { + "epoch": 11.55936262319241, + "grad_norm": 2.8483810424804688, + "learning_rate": 1.3334012012623435e-05, + "loss": 0.5571, + "step": 502000 + }, + { + "epoch": 11.56396794694667, + "grad_norm": 2.515305280685425, + "learning_rate": 1.332674044880092e-05, + "loss": 0.5515, + "step": 502200 + }, + { + "epoch": 11.56857327070093, + "grad_norm": 2.972041606903076, + "learning_rate": 1.3319468884978403e-05, + "loss": 0.5558, + "step": 502400 + }, + { + "epoch": 11.57317859445519, + "grad_norm": 2.9375104904174805, + "learning_rate": 1.3312197321155887e-05, + "loss": 0.5514, + "step": 502600 + }, + { + "epoch": 11.57778391820945, + "grad_norm": 2.7295291423797607, + "learning_rate": 1.3304925757333373e-05, + "loss": 0.5623, + "step": 502800 + }, + { + "epoch": 11.58238924196371, + "grad_norm": 3.959212064743042, + "learning_rate": 1.3297654193510857e-05, + "loss": 0.5642, + "step": 503000 + }, + { + "epoch": 11.58699456571797, + "grad_norm": 3.5699644088745117, + "learning_rate": 1.329038262968834e-05, + "loss": 0.553, + "step": 503200 + }, + { + "epoch": 11.59159988947223, + "grad_norm": 2.7130260467529297, + "learning_rate": 1.3283111065865826e-05, + "loss": 0.5518, + "step": 503400 + }, + { + "epoch": 11.59620521322649, + "grad_norm": 2.8701913356781006, + "learning_rate": 1.327583950204331e-05, + "loss": 0.5529, + "step": 503600 + }, + { + "epoch": 11.600810536980749, + "grad_norm": 2.8965420722961426, + "learning_rate": 1.3268567938220794e-05, + "loss": 0.556, + "step": 503800 + }, + { + "epoch": 11.60541586073501, + "grad_norm": 2.749356269836426, + "learning_rate": 1.3261296374398278e-05, + "loss": 0.5555, + "step": 504000 + }, + { + "epoch": 11.61002118448927, + "grad_norm": 3.4139721393585205, + "learning_rate": 1.3254024810575763e-05, + "loss": 0.5527, + "step": 504200 + }, + { + "epoch": 11.61462650824353, + "grad_norm": 2.8031933307647705, + "learning_rate": 1.3246753246753247e-05, + "loss": 0.5462, + "step": 504400 + }, + { + "epoch": 11.619231831997789, + "grad_norm": 3.3376245498657227, + "learning_rate": 1.3239481682930731e-05, + "loss": 0.563, + "step": 504600 + }, + { + "epoch": 11.62383715575205, + "grad_norm": 2.577552318572998, + "learning_rate": 1.3232210119108217e-05, + "loss": 0.551, + "step": 504800 + }, + { + "epoch": 11.62844247950631, + "grad_norm": 2.8429970741271973, + "learning_rate": 1.32249385552857e-05, + "loss": 0.5543, + "step": 505000 + }, + { + "epoch": 11.633047803260569, + "grad_norm": 2.562479019165039, + "learning_rate": 1.3217666991463184e-05, + "loss": 0.5515, + "step": 505200 + }, + { + "epoch": 11.63765312701483, + "grad_norm": 3.0546891689300537, + "learning_rate": 1.321039542764067e-05, + "loss": 0.5623, + "step": 505400 + }, + { + "epoch": 11.64225845076909, + "grad_norm": 3.4961397647857666, + "learning_rate": 1.3203123863818152e-05, + "loss": 0.552, + "step": 505600 + }, + { + "epoch": 11.646863774523348, + "grad_norm": 2.9656572341918945, + "learning_rate": 1.3195852299995636e-05, + "loss": 0.5574, + "step": 505800 + }, + { + "epoch": 11.651469098277609, + "grad_norm": 2.4916203022003174, + "learning_rate": 1.3188617093992233e-05, + "loss": 0.5648, + "step": 506000 + }, + { + "epoch": 11.65607442203187, + "grad_norm": 3.366502046585083, + "learning_rate": 1.318134553016972e-05, + "loss": 0.5664, + "step": 506200 + }, + { + "epoch": 11.660679745786128, + "grad_norm": 2.6675634384155273, + "learning_rate": 1.3174073966347203e-05, + "loss": 0.561, + "step": 506400 + }, + { + "epoch": 11.665285069540388, + "grad_norm": 2.798330068588257, + "learning_rate": 1.3166802402524688e-05, + "loss": 0.5609, + "step": 506600 + }, + { + "epoch": 11.669890393294649, + "grad_norm": 3.1210689544677734, + "learning_rate": 1.3159530838702172e-05, + "loss": 0.5702, + "step": 506800 + }, + { + "epoch": 11.67449571704891, + "grad_norm": 4.939085006713867, + "learning_rate": 1.3152259274879656e-05, + "loss": 0.559, + "step": 507000 + }, + { + "epoch": 11.679101040803168, + "grad_norm": 3.4388821125030518, + "learning_rate": 1.314498771105714e-05, + "loss": 0.5616, + "step": 507200 + }, + { + "epoch": 11.683706364557429, + "grad_norm": 2.974487066268921, + "learning_rate": 1.3137716147234625e-05, + "loss": 0.5539, + "step": 507400 + }, + { + "epoch": 11.688311688311689, + "grad_norm": 2.9296836853027344, + "learning_rate": 1.3130444583412109e-05, + "loss": 0.5657, + "step": 507600 + }, + { + "epoch": 11.692917012065948, + "grad_norm": 3.680119276046753, + "learning_rate": 1.3123173019589593e-05, + "loss": 0.5624, + "step": 507800 + }, + { + "epoch": 11.697522335820208, + "grad_norm": 2.4661529064178467, + "learning_rate": 1.3115901455767077e-05, + "loss": 0.5654, + "step": 508000 + }, + { + "epoch": 11.702127659574469, + "grad_norm": 3.046780586242676, + "learning_rate": 1.3108629891944563e-05, + "loss": 0.5738, + "step": 508200 + }, + { + "epoch": 11.706732983328727, + "grad_norm": 2.5584285259246826, + "learning_rate": 1.3101358328122046e-05, + "loss": 0.554, + "step": 508400 + }, + { + "epoch": 11.711338307082988, + "grad_norm": 3.345762014389038, + "learning_rate": 1.309408676429953e-05, + "loss": 0.5675, + "step": 508600 + }, + { + "epoch": 11.715943630837248, + "grad_norm": 2.661505937576294, + "learning_rate": 1.3086815200477016e-05, + "loss": 0.5649, + "step": 508800 + }, + { + "epoch": 11.720548954591507, + "grad_norm": 3.2517101764678955, + "learning_rate": 1.3079543636654498e-05, + "loss": 0.5518, + "step": 509000 + }, + { + "epoch": 11.725154278345768, + "grad_norm": 2.780080556869507, + "learning_rate": 1.3072272072831983e-05, + "loss": 0.5524, + "step": 509200 + }, + { + "epoch": 11.729759602100028, + "grad_norm": 3.5682613849639893, + "learning_rate": 1.3065000509009468e-05, + "loss": 0.5642, + "step": 509400 + }, + { + "epoch": 11.734364925854287, + "grad_norm": 2.8819103240966797, + "learning_rate": 1.3057728945186953e-05, + "loss": 0.5553, + "step": 509600 + }, + { + "epoch": 11.738970249608547, + "grad_norm": 3.0837063789367676, + "learning_rate": 1.3050457381364437e-05, + "loss": 0.5659, + "step": 509800 + }, + { + "epoch": 11.743575573362808, + "grad_norm": 2.94468355178833, + "learning_rate": 1.3043185817541921e-05, + "loss": 0.5447, + "step": 510000 + }, + { + "epoch": 11.748180897117066, + "grad_norm": 2.4969544410705566, + "learning_rate": 1.3035914253719405e-05, + "loss": 0.5503, + "step": 510200 + }, + { + "epoch": 11.752786220871327, + "grad_norm": 2.9021387100219727, + "learning_rate": 1.302864268989689e-05, + "loss": 0.5498, + "step": 510400 + }, + { + "epoch": 11.757391544625587, + "grad_norm": 3.434443473815918, + "learning_rate": 1.3021371126074374e-05, + "loss": 0.5611, + "step": 510600 + }, + { + "epoch": 11.761996868379848, + "grad_norm": 2.887711763381958, + "learning_rate": 1.3014099562251858e-05, + "loss": 0.5348, + "step": 510800 + }, + { + "epoch": 11.766602192134107, + "grad_norm": 3.3442883491516113, + "learning_rate": 1.3006827998429342e-05, + "loss": 0.5537, + "step": 511000 + }, + { + "epoch": 11.771207515888367, + "grad_norm": 2.8919873237609863, + "learning_rate": 1.2999592792425939e-05, + "loss": 0.5731, + "step": 511200 + }, + { + "epoch": 11.775812839642628, + "grad_norm": 3.013439178466797, + "learning_rate": 1.2992321228603423e-05, + "loss": 0.5496, + "step": 511400 + }, + { + "epoch": 11.780418163396886, + "grad_norm": 2.499289035797119, + "learning_rate": 1.298508602260002e-05, + "loss": 0.5678, + "step": 511600 + }, + { + "epoch": 11.785023487151147, + "grad_norm": 3.30153226852417, + "learning_rate": 1.2977814458777506e-05, + "loss": 0.557, + "step": 511800 + }, + { + "epoch": 11.789628810905407, + "grad_norm": 3.1825666427612305, + "learning_rate": 1.2970542894954989e-05, + "loss": 0.5488, + "step": 512000 + }, + { + "epoch": 11.794234134659666, + "grad_norm": 3.229902744293213, + "learning_rate": 1.2963271331132473e-05, + "loss": 0.567, + "step": 512200 + }, + { + "epoch": 11.798839458413926, + "grad_norm": 2.68745493888855, + "learning_rate": 1.2955999767309959e-05, + "loss": 0.5674, + "step": 512400 + }, + { + "epoch": 11.803444782168187, + "grad_norm": 3.270493507385254, + "learning_rate": 1.2948728203487441e-05, + "loss": 0.5487, + "step": 512600 + }, + { + "epoch": 11.808050105922446, + "grad_norm": 3.161329507827759, + "learning_rate": 1.2941456639664927e-05, + "loss": 0.5625, + "step": 512800 + }, + { + "epoch": 11.812655429676706, + "grad_norm": 2.8435397148132324, + "learning_rate": 1.2934185075842411e-05, + "loss": 0.5453, + "step": 513000 + }, + { + "epoch": 11.817260753430967, + "grad_norm": 2.739443778991699, + "learning_rate": 1.2926913512019896e-05, + "loss": 0.5546, + "step": 513200 + }, + { + "epoch": 11.821866077185225, + "grad_norm": 2.649695634841919, + "learning_rate": 1.291964194819738e-05, + "loss": 0.5519, + "step": 513400 + }, + { + "epoch": 11.826471400939486, + "grad_norm": 3.265084981918335, + "learning_rate": 1.2912370384374864e-05, + "loss": 0.5485, + "step": 513600 + }, + { + "epoch": 11.831076724693746, + "grad_norm": 3.4886581897735596, + "learning_rate": 1.2905098820552348e-05, + "loss": 0.5591, + "step": 513800 + }, + { + "epoch": 11.835682048448007, + "grad_norm": 3.9588351249694824, + "learning_rate": 1.2897827256729833e-05, + "loss": 0.5473, + "step": 514000 + }, + { + "epoch": 11.840287372202265, + "grad_norm": 3.0174105167388916, + "learning_rate": 1.2890555692907317e-05, + "loss": 0.5637, + "step": 514200 + }, + { + "epoch": 11.844892695956526, + "grad_norm": 2.9916231632232666, + "learning_rate": 1.2883284129084803e-05, + "loss": 0.5631, + "step": 514400 + }, + { + "epoch": 11.849498019710786, + "grad_norm": 3.1581456661224365, + "learning_rate": 1.2876012565262285e-05, + "loss": 0.5553, + "step": 514600 + }, + { + "epoch": 11.854103343465045, + "grad_norm": 2.6253857612609863, + "learning_rate": 1.286874100143977e-05, + "loss": 0.5628, + "step": 514800 + }, + { + "epoch": 11.858708667219306, + "grad_norm": 3.4651215076446533, + "learning_rate": 1.2861469437617255e-05, + "loss": 0.555, + "step": 515000 + }, + { + "epoch": 11.863313990973566, + "grad_norm": 5.004696369171143, + "learning_rate": 1.2854197873794738e-05, + "loss": 0.5577, + "step": 515200 + }, + { + "epoch": 11.867919314727825, + "grad_norm": 3.6759226322174072, + "learning_rate": 1.2846926309972222e-05, + "loss": 0.5608, + "step": 515400 + }, + { + "epoch": 11.872524638482085, + "grad_norm": 4.682463645935059, + "learning_rate": 1.2839654746149708e-05, + "loss": 0.5639, + "step": 515600 + }, + { + "epoch": 11.877129962236346, + "grad_norm": 3.1118075847625732, + "learning_rate": 1.2832419540146305e-05, + "loss": 0.5593, + "step": 515800 + }, + { + "epoch": 11.881735285990604, + "grad_norm": 3.632077693939209, + "learning_rate": 1.2825147976323787e-05, + "loss": 0.5623, + "step": 516000 + }, + { + "epoch": 11.886340609744865, + "grad_norm": 3.2938430309295654, + "learning_rate": 1.2817876412501273e-05, + "loss": 0.5567, + "step": 516200 + }, + { + "epoch": 11.890945933499125, + "grad_norm": 3.3671703338623047, + "learning_rate": 1.2810604848678758e-05, + "loss": 0.558, + "step": 516400 + }, + { + "epoch": 11.895551257253384, + "grad_norm": 2.796440839767456, + "learning_rate": 1.2803333284856242e-05, + "loss": 0.5565, + "step": 516600 + }, + { + "epoch": 11.900156581007645, + "grad_norm": 4.450211524963379, + "learning_rate": 1.2796061721033726e-05, + "loss": 0.551, + "step": 516800 + }, + { + "epoch": 11.904761904761905, + "grad_norm": 2.8839023113250732, + "learning_rate": 1.278879015721121e-05, + "loss": 0.5542, + "step": 517000 + }, + { + "epoch": 11.909367228516164, + "grad_norm": 3.0486433506011963, + "learning_rate": 1.2781518593388694e-05, + "loss": 0.5539, + "step": 517200 + }, + { + "epoch": 11.913972552270424, + "grad_norm": 3.802643299102783, + "learning_rate": 1.2774247029566179e-05, + "loss": 0.552, + "step": 517400 + }, + { + "epoch": 11.918577876024685, + "grad_norm": 3.0999131202697754, + "learning_rate": 1.2766975465743663e-05, + "loss": 0.548, + "step": 517600 + }, + { + "epoch": 11.923183199778945, + "grad_norm": 2.016890525817871, + "learning_rate": 1.2759703901921149e-05, + "loss": 0.5554, + "step": 517800 + }, + { + "epoch": 11.927788523533204, + "grad_norm": 3.3377721309661865, + "learning_rate": 1.2752432338098631e-05, + "loss": 0.5529, + "step": 518000 + }, + { + "epoch": 11.932393847287464, + "grad_norm": 3.55546498298645, + "learning_rate": 1.2745197132095228e-05, + "loss": 0.5649, + "step": 518200 + }, + { + "epoch": 11.936999171041725, + "grad_norm": 2.818472146987915, + "learning_rate": 1.2737961926091825e-05, + "loss": 0.5533, + "step": 518400 + }, + { + "epoch": 11.941604494795984, + "grad_norm": 3.24600887298584, + "learning_rate": 1.273069036226931e-05, + "loss": 0.5611, + "step": 518600 + }, + { + "epoch": 11.946209818550244, + "grad_norm": 2.4776716232299805, + "learning_rate": 1.2723418798446795e-05, + "loss": 0.5607, + "step": 518800 + }, + { + "epoch": 11.950815142304505, + "grad_norm": 3.1430325508117676, + "learning_rate": 1.2716147234624278e-05, + "loss": 0.5527, + "step": 519000 + }, + { + "epoch": 11.955420466058763, + "grad_norm": 3.204774856567383, + "learning_rate": 1.2708875670801762e-05, + "loss": 0.5664, + "step": 519200 + }, + { + "epoch": 11.960025789813024, + "grad_norm": 4.431719779968262, + "learning_rate": 1.2701604106979248e-05, + "loss": 0.5497, + "step": 519400 + }, + { + "epoch": 11.964631113567284, + "grad_norm": 3.592142343521118, + "learning_rate": 1.269433254315673e-05, + "loss": 0.5578, + "step": 519600 + }, + { + "epoch": 11.969236437321543, + "grad_norm": 3.99924373626709, + "learning_rate": 1.2687060979334216e-05, + "loss": 0.5628, + "step": 519800 + }, + { + "epoch": 11.973841761075803, + "grad_norm": 3.2276337146759033, + "learning_rate": 1.26797894155117e-05, + "loss": 0.5689, + "step": 520000 + }, + { + "epoch": 11.978447084830064, + "grad_norm": 3.4805285930633545, + "learning_rate": 1.2672517851689185e-05, + "loss": 0.5545, + "step": 520200 + }, + { + "epoch": 11.983052408584324, + "grad_norm": 3.0665435791015625, + "learning_rate": 1.2665246287866669e-05, + "loss": 0.5564, + "step": 520400 + }, + { + "epoch": 11.987657732338583, + "grad_norm": 3.2864396572113037, + "learning_rate": 1.2657974724044153e-05, + "loss": 0.5689, + "step": 520600 + }, + { + "epoch": 11.992263056092844, + "grad_norm": 4.101680278778076, + "learning_rate": 1.2650703160221637e-05, + "loss": 0.5586, + "step": 520800 + }, + { + "epoch": 11.996868379847104, + "grad_norm": 3.1322414875030518, + "learning_rate": 1.2643431596399122e-05, + "loss": 0.5665, + "step": 521000 + }, + { + "epoch": 12.0, + "eval_loss": 0.5357550382614136, + "eval_runtime": 168.6634, + "eval_samples_per_second": 168.152, + "eval_steps_per_second": 10.512, + "step": 521136 + }, + { + "epoch": 12.001473703601363, + "grad_norm": 3.8854141235351562, + "learning_rate": 1.2636160032576606e-05, + "loss": 0.5461, + "step": 521200 + }, + { + "epoch": 12.006079027355623, + "grad_norm": 3.1888136863708496, + "learning_rate": 1.2628888468754092e-05, + "loss": 0.5459, + "step": 521400 + }, + { + "epoch": 12.010684351109884, + "grad_norm": 2.910918951034546, + "learning_rate": 1.2621653262750689e-05, + "loss": 0.5549, + "step": 521600 + }, + { + "epoch": 12.015289674864142, + "grad_norm": 3.348071813583374, + "learning_rate": 1.2614381698928171e-05, + "loss": 0.5604, + "step": 521800 + }, + { + "epoch": 12.019894998618403, + "grad_norm": 2.9339218139648438, + "learning_rate": 1.2607110135105655e-05, + "loss": 0.5526, + "step": 522000 + }, + { + "epoch": 12.024500322372663, + "grad_norm": 2.8683624267578125, + "learning_rate": 1.2599838571283141e-05, + "loss": 0.5467, + "step": 522200 + }, + { + "epoch": 12.029105646126922, + "grad_norm": 3.5013625621795654, + "learning_rate": 1.2592567007460624e-05, + "loss": 0.5531, + "step": 522400 + }, + { + "epoch": 12.033710969881183, + "grad_norm": 2.941629409790039, + "learning_rate": 1.258529544363811e-05, + "loss": 0.553, + "step": 522600 + }, + { + "epoch": 12.038316293635443, + "grad_norm": 4.11761999130249, + "learning_rate": 1.2578023879815594e-05, + "loss": 0.5439, + "step": 522800 + }, + { + "epoch": 12.042921617389702, + "grad_norm": 2.471980094909668, + "learning_rate": 1.2570752315993077e-05, + "loss": 0.5569, + "step": 523000 + }, + { + "epoch": 12.047526941143962, + "grad_norm": 3.2188100814819336, + "learning_rate": 1.2563480752170562e-05, + "loss": 0.5494, + "step": 523200 + }, + { + "epoch": 12.052132264898223, + "grad_norm": 3.172586679458618, + "learning_rate": 1.2556209188348047e-05, + "loss": 0.5461, + "step": 523400 + }, + { + "epoch": 12.056737588652481, + "grad_norm": 2.9670534133911133, + "learning_rate": 1.254893762452553e-05, + "loss": 0.5479, + "step": 523600 + }, + { + "epoch": 12.061342912406742, + "grad_norm": 2.948775053024292, + "learning_rate": 1.2541666060703015e-05, + "loss": 0.5428, + "step": 523800 + }, + { + "epoch": 12.065948236161002, + "grad_norm": 2.3510968685150146, + "learning_rate": 1.25343944968805e-05, + "loss": 0.5526, + "step": 524000 + }, + { + "epoch": 12.070553559915263, + "grad_norm": 4.567888259887695, + "learning_rate": 1.2527122933057985e-05, + "loss": 0.5531, + "step": 524200 + }, + { + "epoch": 12.075158883669522, + "grad_norm": 3.3408639430999756, + "learning_rate": 1.2519851369235468e-05, + "loss": 0.545, + "step": 524400 + }, + { + "epoch": 12.079764207423782, + "grad_norm": 3.8344717025756836, + "learning_rate": 1.2512579805412952e-05, + "loss": 0.5609, + "step": 524600 + }, + { + "epoch": 12.084369531178043, + "grad_norm": 2.317721366882324, + "learning_rate": 1.2505308241590438e-05, + "loss": 0.5464, + "step": 524800 + }, + { + "epoch": 12.088974854932301, + "grad_norm": 2.980634927749634, + "learning_rate": 1.249803667776792e-05, + "loss": 0.5394, + "step": 525000 + }, + { + "epoch": 12.093580178686562, + "grad_norm": 3.3348686695098877, + "learning_rate": 1.2490765113945405e-05, + "loss": 0.5551, + "step": 525200 + }, + { + "epoch": 12.098185502440822, + "grad_norm": 3.0347678661346436, + "learning_rate": 1.248349355012289e-05, + "loss": 0.551, + "step": 525400 + }, + { + "epoch": 12.102790826195081, + "grad_norm": 3.286358118057251, + "learning_rate": 1.2476221986300373e-05, + "loss": 0.5652, + "step": 525600 + }, + { + "epoch": 12.107396149949341, + "grad_norm": 3.339553117752075, + "learning_rate": 1.2468950422477859e-05, + "loss": 0.5548, + "step": 525800 + }, + { + "epoch": 12.112001473703602, + "grad_norm": 2.7706658840179443, + "learning_rate": 1.2461715216474456e-05, + "loss": 0.5553, + "step": 526000 + }, + { + "epoch": 12.11660679745786, + "grad_norm": 2.956782817840576, + "learning_rate": 1.245444365265194e-05, + "loss": 0.5499, + "step": 526200 + }, + { + "epoch": 12.121212121212121, + "grad_norm": 3.1089818477630615, + "learning_rate": 1.2447172088829423e-05, + "loss": 0.5438, + "step": 526400 + }, + { + "epoch": 12.125817444966382, + "grad_norm": 2.3799941539764404, + "learning_rate": 1.2439900525006909e-05, + "loss": 0.5455, + "step": 526600 + }, + { + "epoch": 12.13042276872064, + "grad_norm": 4.066889762878418, + "learning_rate": 1.2432628961184393e-05, + "loss": 0.554, + "step": 526800 + }, + { + "epoch": 12.1350280924749, + "grad_norm": 3.080200672149658, + "learning_rate": 1.2425357397361875e-05, + "loss": 0.5558, + "step": 527000 + }, + { + "epoch": 12.139633416229161, + "grad_norm": 2.6959545612335205, + "learning_rate": 1.2418085833539361e-05, + "loss": 0.5523, + "step": 527200 + }, + { + "epoch": 12.14423873998342, + "grad_norm": 2.727076292037964, + "learning_rate": 1.2410814269716845e-05, + "loss": 0.5532, + "step": 527400 + }, + { + "epoch": 12.14884406373768, + "grad_norm": 4.091317653656006, + "learning_rate": 1.2403579063713442e-05, + "loss": 0.5506, + "step": 527600 + }, + { + "epoch": 12.153449387491941, + "grad_norm": 3.140193462371826, + "learning_rate": 1.2396307499890928e-05, + "loss": 0.5585, + "step": 527800 + }, + { + "epoch": 12.158054711246201, + "grad_norm": 3.9469101428985596, + "learning_rate": 1.238903593606841e-05, + "loss": 0.5526, + "step": 528000 + }, + { + "epoch": 12.16266003500046, + "grad_norm": 3.1913647651672363, + "learning_rate": 1.2381764372245895e-05, + "loss": 0.5434, + "step": 528200 + }, + { + "epoch": 12.16726535875472, + "grad_norm": 2.5850043296813965, + "learning_rate": 1.2374492808423381e-05, + "loss": 0.5574, + "step": 528400 + }, + { + "epoch": 12.171870682508981, + "grad_norm": 2.807478904724121, + "learning_rate": 1.2367221244600863e-05, + "loss": 0.5513, + "step": 528600 + }, + { + "epoch": 12.17647600626324, + "grad_norm": 3.168511390686035, + "learning_rate": 1.2359949680778348e-05, + "loss": 0.5539, + "step": 528800 + }, + { + "epoch": 12.1810813300175, + "grad_norm": 3.547654867172241, + "learning_rate": 1.2352678116955834e-05, + "loss": 0.5485, + "step": 529000 + }, + { + "epoch": 12.18568665377176, + "grad_norm": 2.912477493286133, + "learning_rate": 1.2345406553133316e-05, + "loss": 0.5397, + "step": 529200 + }, + { + "epoch": 12.19029197752602, + "grad_norm": 3.6767055988311768, + "learning_rate": 1.2338134989310802e-05, + "loss": 0.5581, + "step": 529400 + }, + { + "epoch": 12.19489730128028, + "grad_norm": 3.366831064224243, + "learning_rate": 1.2330863425488286e-05, + "loss": 0.5556, + "step": 529600 + }, + { + "epoch": 12.19950262503454, + "grad_norm": 3.2023255825042725, + "learning_rate": 1.2323628219484883e-05, + "loss": 0.5535, + "step": 529800 + }, + { + "epoch": 12.2041079487888, + "grad_norm": 2.8660030364990234, + "learning_rate": 1.2316356655662366e-05, + "loss": 0.5565, + "step": 530000 + }, + { + "epoch": 12.20871327254306, + "grad_norm": 2.7351365089416504, + "learning_rate": 1.2309085091839852e-05, + "loss": 0.5524, + "step": 530200 + }, + { + "epoch": 12.21331859629732, + "grad_norm": 3.0572710037231445, + "learning_rate": 1.2301813528017336e-05, + "loss": 0.5538, + "step": 530400 + }, + { + "epoch": 12.217923920051579, + "grad_norm": 3.4272561073303223, + "learning_rate": 1.229454196419482e-05, + "loss": 0.5479, + "step": 530600 + }, + { + "epoch": 12.22252924380584, + "grad_norm": 2.7951786518096924, + "learning_rate": 1.2287270400372304e-05, + "loss": 0.5439, + "step": 530800 + }, + { + "epoch": 12.2271345675601, + "grad_norm": 3.222205638885498, + "learning_rate": 1.2279998836549789e-05, + "loss": 0.5448, + "step": 531000 + }, + { + "epoch": 12.23173989131436, + "grad_norm": 2.7026097774505615, + "learning_rate": 1.2272727272727274e-05, + "loss": 0.5442, + "step": 531200 + }, + { + "epoch": 12.236345215068619, + "grad_norm": 3.955737590789795, + "learning_rate": 1.2265455708904757e-05, + "loss": 0.5558, + "step": 531400 + }, + { + "epoch": 12.24095053882288, + "grad_norm": 3.2188639640808105, + "learning_rate": 1.2258184145082241e-05, + "loss": 0.5434, + "step": 531600 + }, + { + "epoch": 12.24555586257714, + "grad_norm": 2.410057783126831, + "learning_rate": 1.2250912581259727e-05, + "loss": 0.5655, + "step": 531800 + }, + { + "epoch": 12.250161186331399, + "grad_norm": 2.9907710552215576, + "learning_rate": 1.224364101743721e-05, + "loss": 0.5488, + "step": 532000 + }, + { + "epoch": 12.254766510085659, + "grad_norm": 3.5246376991271973, + "learning_rate": 1.2236369453614696e-05, + "loss": 0.5616, + "step": 532200 + }, + { + "epoch": 12.25937183383992, + "grad_norm": 3.297806978225708, + "learning_rate": 1.222909788979218e-05, + "loss": 0.5688, + "step": 532400 + }, + { + "epoch": 12.263977157594178, + "grad_norm": 2.848536729812622, + "learning_rate": 1.2221826325969662e-05, + "loss": 0.5417, + "step": 532600 + }, + { + "epoch": 12.268582481348439, + "grad_norm": 2.740184783935547, + "learning_rate": 1.2214554762147148e-05, + "loss": 0.558, + "step": 532800 + }, + { + "epoch": 12.2731878051027, + "grad_norm": 3.5397231578826904, + "learning_rate": 1.2207283198324632e-05, + "loss": 0.5495, + "step": 533000 + }, + { + "epoch": 12.277793128856958, + "grad_norm": 2.4408740997314453, + "learning_rate": 1.220004799232123e-05, + "loss": 0.5511, + "step": 533200 + }, + { + "epoch": 12.282398452611218, + "grad_norm": 3.476637363433838, + "learning_rate": 1.2192776428498712e-05, + "loss": 0.5498, + "step": 533400 + }, + { + "epoch": 12.287003776365479, + "grad_norm": 3.668010711669922, + "learning_rate": 1.2185504864676198e-05, + "loss": 0.549, + "step": 533600 + }, + { + "epoch": 12.291609100119738, + "grad_norm": 3.334040641784668, + "learning_rate": 1.2178233300853682e-05, + "loss": 0.5352, + "step": 533800 + }, + { + "epoch": 12.296214423873998, + "grad_norm": 2.8002822399139404, + "learning_rate": 1.2170961737031166e-05, + "loss": 0.5501, + "step": 534000 + }, + { + "epoch": 12.300819747628259, + "grad_norm": 2.5833489894866943, + "learning_rate": 1.216369017320865e-05, + "loss": 0.5483, + "step": 534200 + }, + { + "epoch": 12.305425071382519, + "grad_norm": 3.02528977394104, + "learning_rate": 1.2156418609386135e-05, + "loss": 0.5439, + "step": 534400 + }, + { + "epoch": 12.310030395136778, + "grad_norm": 3.139838933944702, + "learning_rate": 1.214914704556362e-05, + "loss": 0.5504, + "step": 534600 + }, + { + "epoch": 12.314635718891038, + "grad_norm": 2.3408186435699463, + "learning_rate": 1.2141875481741103e-05, + "loss": 0.5431, + "step": 534800 + }, + { + "epoch": 12.319241042645299, + "grad_norm": 3.896301746368408, + "learning_rate": 1.2134603917918587e-05, + "loss": 0.5497, + "step": 535000 + }, + { + "epoch": 12.323846366399557, + "grad_norm": 3.0924932956695557, + "learning_rate": 1.2127332354096073e-05, + "loss": 0.5636, + "step": 535200 + }, + { + "epoch": 12.328451690153818, + "grad_norm": 3.347902297973633, + "learning_rate": 1.2120060790273556e-05, + "loss": 0.5617, + "step": 535400 + }, + { + "epoch": 12.333057013908078, + "grad_norm": 3.351632833480835, + "learning_rate": 1.2112789226451042e-05, + "loss": 0.5495, + "step": 535600 + }, + { + "epoch": 12.337662337662337, + "grad_norm": 3.4726133346557617, + "learning_rate": 1.2105517662628526e-05, + "loss": 0.563, + "step": 535800 + }, + { + "epoch": 12.342267661416598, + "grad_norm": 3.684715509414673, + "learning_rate": 1.2098246098806008e-05, + "loss": 0.548, + "step": 536000 + }, + { + "epoch": 12.346872985170858, + "grad_norm": 2.476365327835083, + "learning_rate": 1.2091010892802605e-05, + "loss": 0.5503, + "step": 536200 + }, + { + "epoch": 12.351478308925117, + "grad_norm": 3.3055906295776367, + "learning_rate": 1.2083739328980091e-05, + "loss": 0.5412, + "step": 536400 + }, + { + "epoch": 12.356083632679377, + "grad_norm": 3.4834065437316895, + "learning_rate": 1.2076467765157575e-05, + "loss": 0.5416, + "step": 536600 + }, + { + "epoch": 12.360688956433638, + "grad_norm": 3.004270553588867, + "learning_rate": 1.2069196201335058e-05, + "loss": 0.5557, + "step": 536800 + }, + { + "epoch": 12.365294280187896, + "grad_norm": 2.7978525161743164, + "learning_rate": 1.2061924637512544e-05, + "loss": 0.5507, + "step": 537000 + }, + { + "epoch": 12.369899603942157, + "grad_norm": 3.2909445762634277, + "learning_rate": 1.2054653073690028e-05, + "loss": 0.5573, + "step": 537200 + }, + { + "epoch": 12.374504927696417, + "grad_norm": 3.068385124206543, + "learning_rate": 1.2047381509867512e-05, + "loss": 0.561, + "step": 537400 + }, + { + "epoch": 12.379110251450676, + "grad_norm": 3.6887574195861816, + "learning_rate": 1.2040109946044997e-05, + "loss": 0.5581, + "step": 537600 + }, + { + "epoch": 12.383715575204937, + "grad_norm": 3.1257615089416504, + "learning_rate": 1.203283838222248e-05, + "loss": 0.5471, + "step": 537800 + }, + { + "epoch": 12.388320898959197, + "grad_norm": 2.885606527328491, + "learning_rate": 1.2025566818399967e-05, + "loss": 0.558, + "step": 538000 + }, + { + "epoch": 12.392926222713458, + "grad_norm": 4.5534138679504395, + "learning_rate": 1.201829525457745e-05, + "loss": 0.5434, + "step": 538200 + }, + { + "epoch": 12.397531546467716, + "grad_norm": 4.204901695251465, + "learning_rate": 1.2011023690754933e-05, + "loss": 0.5512, + "step": 538400 + }, + { + "epoch": 12.402136870221977, + "grad_norm": 3.7075207233428955, + "learning_rate": 1.200375212693242e-05, + "loss": 0.5446, + "step": 538600 + }, + { + "epoch": 12.406742193976237, + "grad_norm": 2.795241117477417, + "learning_rate": 1.1996480563109902e-05, + "loss": 0.5551, + "step": 538800 + }, + { + "epoch": 12.411347517730496, + "grad_norm": 3.3998024463653564, + "learning_rate": 1.1989208999287388e-05, + "loss": 0.5498, + "step": 539000 + }, + { + "epoch": 12.415952841484756, + "grad_norm": 2.8881683349609375, + "learning_rate": 1.1981937435464872e-05, + "loss": 0.5478, + "step": 539200 + }, + { + "epoch": 12.420558165239017, + "grad_norm": 3.143094062805176, + "learning_rate": 1.1974665871642354e-05, + "loss": 0.5445, + "step": 539400 + }, + { + "epoch": 12.425163488993276, + "grad_norm": 3.2156903743743896, + "learning_rate": 1.196739430781984e-05, + "loss": 0.5572, + "step": 539600 + }, + { + "epoch": 12.429768812747536, + "grad_norm": 2.0139708518981934, + "learning_rate": 1.1960122743997325e-05, + "loss": 0.5578, + "step": 539800 + }, + { + "epoch": 12.434374136501797, + "grad_norm": 2.9531643390655518, + "learning_rate": 1.1952851180174809e-05, + "loss": 0.5555, + "step": 540000 + }, + { + "epoch": 12.438979460256055, + "grad_norm": 2.786808490753174, + "learning_rate": 1.1945579616352293e-05, + "loss": 0.5339, + "step": 540200 + }, + { + "epoch": 12.443584784010316, + "grad_norm": 3.243690013885498, + "learning_rate": 1.1938308052529777e-05, + "loss": 0.5459, + "step": 540400 + }, + { + "epoch": 12.448190107764576, + "grad_norm": 3.303994655609131, + "learning_rate": 1.1931036488707262e-05, + "loss": 0.5634, + "step": 540600 + }, + { + "epoch": 12.452795431518835, + "grad_norm": 4.073940277099609, + "learning_rate": 1.1923764924884746e-05, + "loss": 0.5491, + "step": 540800 + }, + { + "epoch": 12.457400755273095, + "grad_norm": 2.980377435684204, + "learning_rate": 1.191649336106223e-05, + "loss": 0.5501, + "step": 541000 + }, + { + "epoch": 12.462006079027356, + "grad_norm": 3.772876501083374, + "learning_rate": 1.1909221797239716e-05, + "loss": 0.5487, + "step": 541200 + }, + { + "epoch": 12.466611402781616, + "grad_norm": 2.804576873779297, + "learning_rate": 1.1901950233417198e-05, + "loss": 0.5479, + "step": 541400 + }, + { + "epoch": 12.471216726535875, + "grad_norm": 3.1938371658325195, + "learning_rate": 1.1894678669594684e-05, + "loss": 0.5541, + "step": 541600 + }, + { + "epoch": 12.475822050290136, + "grad_norm": 2.5504708290100098, + "learning_rate": 1.1887407105772169e-05, + "loss": 0.5465, + "step": 541800 + }, + { + "epoch": 12.480427374044396, + "grad_norm": 2.7534172534942627, + "learning_rate": 1.1880135541949651e-05, + "loss": 0.5542, + "step": 542000 + }, + { + "epoch": 12.485032697798655, + "grad_norm": 3.2549421787261963, + "learning_rate": 1.1872863978127137e-05, + "loss": 0.5521, + "step": 542200 + }, + { + "epoch": 12.489638021552915, + "grad_norm": 3.0159831047058105, + "learning_rate": 1.1865592414304621e-05, + "loss": 0.5503, + "step": 542400 + }, + { + "epoch": 12.494243345307176, + "grad_norm": 2.7797956466674805, + "learning_rate": 1.1858357208301218e-05, + "loss": 0.5564, + "step": 542600 + }, + { + "epoch": 12.498848669061434, + "grad_norm": 3.071063995361328, + "learning_rate": 1.18510856444787e-05, + "loss": 0.5411, + "step": 542800 + }, + { + "epoch": 12.503453992815695, + "grad_norm": 4.068755626678467, + "learning_rate": 1.1843814080656187e-05, + "loss": 0.5572, + "step": 543000 + }, + { + "epoch": 12.508059316569955, + "grad_norm": 3.225297689437866, + "learning_rate": 1.183654251683367e-05, + "loss": 0.5508, + "step": 543200 + }, + { + "epoch": 12.512664640324214, + "grad_norm": 2.821578025817871, + "learning_rate": 1.1829307310830268e-05, + "loss": 0.5474, + "step": 543400 + }, + { + "epoch": 12.517269964078475, + "grad_norm": 5.399388313293457, + "learning_rate": 1.1822035747007752e-05, + "loss": 0.5504, + "step": 543600 + }, + { + "epoch": 12.521875287832735, + "grad_norm": 3.5580897331237793, + "learning_rate": 1.1814764183185236e-05, + "loss": 0.5641, + "step": 543800 + }, + { + "epoch": 12.526480611586994, + "grad_norm": 3.240264892578125, + "learning_rate": 1.180749261936272e-05, + "loss": 0.5511, + "step": 544000 + }, + { + "epoch": 12.531085935341254, + "grad_norm": 3.2633004188537598, + "learning_rate": 1.1800221055540205e-05, + "loss": 0.5504, + "step": 544200 + }, + { + "epoch": 12.535691259095515, + "grad_norm": 3.4117608070373535, + "learning_rate": 1.1792949491717689e-05, + "loss": 0.5429, + "step": 544400 + }, + { + "epoch": 12.540296582849773, + "grad_norm": 3.1878907680511475, + "learning_rate": 1.1785677927895173e-05, + "loss": 0.5544, + "step": 544600 + }, + { + "epoch": 12.544901906604034, + "grad_norm": 3.49251389503479, + "learning_rate": 1.1778406364072659e-05, + "loss": 0.5402, + "step": 544800 + }, + { + "epoch": 12.549507230358294, + "grad_norm": 3.136068820953369, + "learning_rate": 1.1771134800250141e-05, + "loss": 0.5617, + "step": 545000 + }, + { + "epoch": 12.554112554112555, + "grad_norm": 3.5267179012298584, + "learning_rate": 1.1763863236427627e-05, + "loss": 0.5583, + "step": 545200 + }, + { + "epoch": 12.558717877866814, + "grad_norm": 2.639244794845581, + "learning_rate": 1.1756591672605112e-05, + "loss": 0.5556, + "step": 545400 + }, + { + "epoch": 12.563323201621074, + "grad_norm": 2.4130160808563232, + "learning_rate": 1.1749356466601708e-05, + "loss": 0.5378, + "step": 545600 + }, + { + "epoch": 12.567928525375335, + "grad_norm": 3.0723562240600586, + "learning_rate": 1.1742084902779191e-05, + "loss": 0.5493, + "step": 545800 + }, + { + "epoch": 12.572533849129593, + "grad_norm": 3.0826873779296875, + "learning_rate": 1.1734813338956677e-05, + "loss": 0.5478, + "step": 546000 + }, + { + "epoch": 12.577139172883854, + "grad_norm": 3.6943721771240234, + "learning_rate": 1.1727541775134161e-05, + "loss": 0.552, + "step": 546200 + }, + { + "epoch": 12.581744496638114, + "grad_norm": 3.1983044147491455, + "learning_rate": 1.1720270211311644e-05, + "loss": 0.5533, + "step": 546400 + }, + { + "epoch": 12.586349820392373, + "grad_norm": 2.8163883686065674, + "learning_rate": 1.171299864748913e-05, + "loss": 0.5551, + "step": 546600 + }, + { + "epoch": 12.590955144146633, + "grad_norm": 3.1819756031036377, + "learning_rate": 1.1705727083666614e-05, + "loss": 0.5637, + "step": 546800 + }, + { + "epoch": 12.595560467900894, + "grad_norm": 3.5395870208740234, + "learning_rate": 1.1698455519844098e-05, + "loss": 0.5517, + "step": 547000 + }, + { + "epoch": 12.600165791655153, + "grad_norm": 2.773688793182373, + "learning_rate": 1.1691183956021582e-05, + "loss": 0.5692, + "step": 547200 + }, + { + "epoch": 12.604771115409413, + "grad_norm": 3.0060763359069824, + "learning_rate": 1.1683912392199066e-05, + "loss": 0.5571, + "step": 547400 + }, + { + "epoch": 12.609376439163674, + "grad_norm": 3.25728702545166, + "learning_rate": 1.167664082837655e-05, + "loss": 0.5608, + "step": 547600 + }, + { + "epoch": 12.613981762917934, + "grad_norm": 3.356100559234619, + "learning_rate": 1.1669369264554035e-05, + "loss": 0.5368, + "step": 547800 + }, + { + "epoch": 12.618587086672193, + "grad_norm": 3.625145435333252, + "learning_rate": 1.166209770073152e-05, + "loss": 0.538, + "step": 548000 + }, + { + "epoch": 12.623192410426453, + "grad_norm": 2.9204094409942627, + "learning_rate": 1.1654826136909003e-05, + "loss": 0.546, + "step": 548200 + }, + { + "epoch": 12.627797734180714, + "grad_norm": 3.6132349967956543, + "learning_rate": 1.1647590930905602e-05, + "loss": 0.5592, + "step": 548400 + }, + { + "epoch": 12.632403057934972, + "grad_norm": 3.495785713195801, + "learning_rate": 1.1640319367083084e-05, + "loss": 0.539, + "step": 548600 + }, + { + "epoch": 12.637008381689233, + "grad_norm": 3.632181167602539, + "learning_rate": 1.163304780326057e-05, + "loss": 0.5363, + "step": 548800 + }, + { + "epoch": 12.641613705443493, + "grad_norm": 3.3912816047668457, + "learning_rate": 1.1625776239438055e-05, + "loss": 0.554, + "step": 549000 + }, + { + "epoch": 12.646219029197752, + "grad_norm": 3.4610326290130615, + "learning_rate": 1.1618504675615537e-05, + "loss": 0.5537, + "step": 549200 + }, + { + "epoch": 12.650824352952013, + "grad_norm": 3.4539434909820557, + "learning_rate": 1.1611233111793023e-05, + "loss": 0.5621, + "step": 549400 + }, + { + "epoch": 12.655429676706273, + "grad_norm": 3.00600528717041, + "learning_rate": 1.160399790578962e-05, + "loss": 0.5558, + "step": 549600 + }, + { + "epoch": 12.660035000460532, + "grad_norm": 3.060939311981201, + "learning_rate": 1.1596726341967104e-05, + "loss": 0.561, + "step": 549800 + }, + { + "epoch": 12.664640324214792, + "grad_norm": 3.2902770042419434, + "learning_rate": 1.1589454778144588e-05, + "loss": 0.5428, + "step": 550000 + }, + { + "epoch": 12.669245647969053, + "grad_norm": 3.1374452114105225, + "learning_rate": 1.1582183214322073e-05, + "loss": 0.5523, + "step": 550200 + }, + { + "epoch": 12.673850971723311, + "grad_norm": 2.6760544776916504, + "learning_rate": 1.1574911650499557e-05, + "loss": 0.555, + "step": 550400 + }, + { + "epoch": 12.678456295477572, + "grad_norm": 3.863650321960449, + "learning_rate": 1.1567640086677041e-05, + "loss": 0.5525, + "step": 550600 + }, + { + "epoch": 12.683061619231832, + "grad_norm": 2.675981044769287, + "learning_rate": 1.1560368522854525e-05, + "loss": 0.5486, + "step": 550800 + }, + { + "epoch": 12.687666942986091, + "grad_norm": 3.3591291904449463, + "learning_rate": 1.155309695903201e-05, + "loss": 0.5461, + "step": 551000 + }, + { + "epoch": 12.692272266740352, + "grad_norm": 2.852849006652832, + "learning_rate": 1.1545825395209494e-05, + "loss": 0.5527, + "step": 551200 + }, + { + "epoch": 12.696877590494612, + "grad_norm": 2.850482940673828, + "learning_rate": 1.1538553831386978e-05, + "loss": 0.5474, + "step": 551400 + }, + { + "epoch": 12.70148291424887, + "grad_norm": 3.2086989879608154, + "learning_rate": 1.1531282267564464e-05, + "loss": 0.549, + "step": 551600 + }, + { + "epoch": 12.706088238003131, + "grad_norm": 3.074154853820801, + "learning_rate": 1.152404706156106e-05, + "loss": 0.559, + "step": 551800 + }, + { + "epoch": 12.710693561757392, + "grad_norm": 3.1534781455993652, + "learning_rate": 1.1516775497738545e-05, + "loss": 0.5647, + "step": 552000 + }, + { + "epoch": 12.715298885511652, + "grad_norm": 3.174724817276001, + "learning_rate": 1.1509503933916027e-05, + "loss": 0.5527, + "step": 552200 + }, + { + "epoch": 12.719904209265911, + "grad_norm": 2.7721035480499268, + "learning_rate": 1.1502232370093513e-05, + "loss": 0.5488, + "step": 552400 + }, + { + "epoch": 12.724509533020171, + "grad_norm": 3.360368251800537, + "learning_rate": 1.1494960806270998e-05, + "loss": 0.5643, + "step": 552600 + }, + { + "epoch": 12.729114856774432, + "grad_norm": 3.283878803253174, + "learning_rate": 1.148768924244848e-05, + "loss": 0.5432, + "step": 552800 + }, + { + "epoch": 12.73372018052869, + "grad_norm": 3.110537052154541, + "learning_rate": 1.1480417678625966e-05, + "loss": 0.555, + "step": 553000 + }, + { + "epoch": 12.738325504282951, + "grad_norm": 4.010375499725342, + "learning_rate": 1.147314611480345e-05, + "loss": 0.5433, + "step": 553200 + }, + { + "epoch": 12.742930828037212, + "grad_norm": 3.154646873474121, + "learning_rate": 1.1465874550980934e-05, + "loss": 0.5519, + "step": 553400 + }, + { + "epoch": 12.74753615179147, + "grad_norm": 3.806518077850342, + "learning_rate": 1.1458602987158419e-05, + "loss": 0.5567, + "step": 553600 + }, + { + "epoch": 12.75214147554573, + "grad_norm": 3.772310972213745, + "learning_rate": 1.1451331423335903e-05, + "loss": 0.5504, + "step": 553800 + }, + { + "epoch": 12.756746799299991, + "grad_norm": 3.2461366653442383, + "learning_rate": 1.1444059859513387e-05, + "loss": 0.5575, + "step": 554000 + }, + { + "epoch": 12.76135212305425, + "grad_norm": 3.1284291744232178, + "learning_rate": 1.1436788295690871e-05, + "loss": 0.5541, + "step": 554200 + }, + { + "epoch": 12.76595744680851, + "grad_norm": 3.069737434387207, + "learning_rate": 1.1429516731868356e-05, + "loss": 0.5529, + "step": 554400 + }, + { + "epoch": 12.770562770562771, + "grad_norm": 3.285787582397461, + "learning_rate": 1.142224516804584e-05, + "loss": 0.5531, + "step": 554600 + }, + { + "epoch": 12.775168094317031, + "grad_norm": 2.772050142288208, + "learning_rate": 1.1415009962042437e-05, + "loss": 0.5438, + "step": 554800 + }, + { + "epoch": 12.77977341807129, + "grad_norm": 2.2192680835723877, + "learning_rate": 1.1407738398219921e-05, + "loss": 0.5319, + "step": 555000 + }, + { + "epoch": 12.78437874182555, + "grad_norm": 3.2888576984405518, + "learning_rate": 1.1400466834397407e-05, + "loss": 0.5462, + "step": 555200 + }, + { + "epoch": 12.788984065579811, + "grad_norm": 3.075023889541626, + "learning_rate": 1.139319527057489e-05, + "loss": 0.5449, + "step": 555400 + }, + { + "epoch": 12.79358938933407, + "grad_norm": 3.2394826412200928, + "learning_rate": 1.1385923706752374e-05, + "loss": 0.5461, + "step": 555600 + }, + { + "epoch": 12.79819471308833, + "grad_norm": 3.1171531677246094, + "learning_rate": 1.137865214292986e-05, + "loss": 0.5433, + "step": 555800 + }, + { + "epoch": 12.80280003684259, + "grad_norm": 2.5984578132629395, + "learning_rate": 1.1371380579107344e-05, + "loss": 0.5531, + "step": 556000 + }, + { + "epoch": 12.80740536059685, + "grad_norm": 4.881665229797363, + "learning_rate": 1.1364109015284826e-05, + "loss": 0.5432, + "step": 556200 + }, + { + "epoch": 12.81201068435111, + "grad_norm": 2.8773512840270996, + "learning_rate": 1.1356837451462312e-05, + "loss": 0.5667, + "step": 556400 + }, + { + "epoch": 12.81661600810537, + "grad_norm": 3.3617618083953857, + "learning_rate": 1.1349565887639796e-05, + "loss": 0.5393, + "step": 556600 + }, + { + "epoch": 12.821221331859629, + "grad_norm": 2.5510413646698, + "learning_rate": 1.134229432381728e-05, + "loss": 0.5449, + "step": 556800 + }, + { + "epoch": 12.82582665561389, + "grad_norm": 3.1804873943328857, + "learning_rate": 1.1335022759994765e-05, + "loss": 0.5515, + "step": 557000 + }, + { + "epoch": 12.83043197936815, + "grad_norm": 3.242882490158081, + "learning_rate": 1.1327751196172249e-05, + "loss": 0.5713, + "step": 557200 + }, + { + "epoch": 12.835037303122409, + "grad_norm": 2.9455573558807373, + "learning_rate": 1.1320479632349733e-05, + "loss": 0.5588, + "step": 557400 + }, + { + "epoch": 12.83964262687667, + "grad_norm": 3.158345937728882, + "learning_rate": 1.131324442634633e-05, + "loss": 0.5454, + "step": 557600 + }, + { + "epoch": 12.84424795063093, + "grad_norm": 3.215794563293457, + "learning_rate": 1.1305972862523814e-05, + "loss": 0.543, + "step": 557800 + }, + { + "epoch": 12.848853274385188, + "grad_norm": 3.158463954925537, + "learning_rate": 1.12987012987013e-05, + "loss": 0.5543, + "step": 558000 + }, + { + "epoch": 12.853458598139449, + "grad_norm": 2.367264747619629, + "learning_rate": 1.1291429734878783e-05, + "loss": 0.5444, + "step": 558200 + }, + { + "epoch": 12.85806392189371, + "grad_norm": 2.9616682529449463, + "learning_rate": 1.1284158171056267e-05, + "loss": 0.5505, + "step": 558400 + }, + { + "epoch": 12.86266924564797, + "grad_norm": 4.787532329559326, + "learning_rate": 1.1276886607233753e-05, + "loss": 0.542, + "step": 558600 + }, + { + "epoch": 12.867274569402229, + "grad_norm": 2.9759278297424316, + "learning_rate": 1.1269615043411235e-05, + "loss": 0.5531, + "step": 558800 + }, + { + "epoch": 12.871879893156489, + "grad_norm": 3.0211524963378906, + "learning_rate": 1.126234347958872e-05, + "loss": 0.5526, + "step": 559000 + }, + { + "epoch": 12.87648521691075, + "grad_norm": 3.0512332916259766, + "learning_rate": 1.1255071915766206e-05, + "loss": 0.5651, + "step": 559200 + }, + { + "epoch": 12.881090540665008, + "grad_norm": 3.1640422344207764, + "learning_rate": 1.124780035194369e-05, + "loss": 0.5473, + "step": 559400 + }, + { + "epoch": 12.885695864419269, + "grad_norm": 2.9496026039123535, + "learning_rate": 1.1240528788121174e-05, + "loss": 0.5571, + "step": 559600 + }, + { + "epoch": 12.89030118817353, + "grad_norm": 2.9672327041625977, + "learning_rate": 1.1233293582117771e-05, + "loss": 0.5536, + "step": 559800 + }, + { + "epoch": 12.894906511927788, + "grad_norm": 3.0200912952423096, + "learning_rate": 1.1226022018295255e-05, + "loss": 0.5478, + "step": 560000 + }, + { + "epoch": 12.899511835682048, + "grad_norm": 3.494211196899414, + "learning_rate": 1.121875045447274e-05, + "loss": 0.5472, + "step": 560200 + }, + { + "epoch": 12.904117159436309, + "grad_norm": 3.695265293121338, + "learning_rate": 1.1211478890650224e-05, + "loss": 0.5525, + "step": 560400 + }, + { + "epoch": 12.908722483190568, + "grad_norm": 2.6711409091949463, + "learning_rate": 1.1204207326827708e-05, + "loss": 0.5473, + "step": 560600 + }, + { + "epoch": 12.913327806944828, + "grad_norm": 3.254509925842285, + "learning_rate": 1.1196935763005192e-05, + "loss": 0.5604, + "step": 560800 + }, + { + "epoch": 12.917933130699089, + "grad_norm": 3.699026107788086, + "learning_rate": 1.1189664199182676e-05, + "loss": 0.5427, + "step": 561000 + }, + { + "epoch": 12.922538454453347, + "grad_norm": 3.1557929515838623, + "learning_rate": 1.118239263536016e-05, + "loss": 0.5558, + "step": 561200 + }, + { + "epoch": 12.927143778207608, + "grad_norm": 3.200070381164551, + "learning_rate": 1.1175121071537646e-05, + "loss": 0.5462, + "step": 561400 + }, + { + "epoch": 12.931749101961868, + "grad_norm": 2.912868022918701, + "learning_rate": 1.1167849507715129e-05, + "loss": 0.5445, + "step": 561600 + }, + { + "epoch": 12.936354425716129, + "grad_norm": 3.3618550300598145, + "learning_rate": 1.1160577943892613e-05, + "loss": 0.5478, + "step": 561800 + }, + { + "epoch": 12.940959749470387, + "grad_norm": 2.8849663734436035, + "learning_rate": 1.1153306380070099e-05, + "loss": 0.5437, + "step": 562000 + }, + { + "epoch": 12.945565073224648, + "grad_norm": 2.9502649307250977, + "learning_rate": 1.1146034816247582e-05, + "loss": 0.5529, + "step": 562200 + }, + { + "epoch": 12.950170396978908, + "grad_norm": 3.4335813522338867, + "learning_rate": 1.1138763252425066e-05, + "loss": 0.5609, + "step": 562400 + }, + { + "epoch": 12.954775720733167, + "grad_norm": 2.878819227218628, + "learning_rate": 1.1131491688602552e-05, + "loss": 0.5495, + "step": 562600 + }, + { + "epoch": 12.959381044487428, + "grad_norm": 3.1835694313049316, + "learning_rate": 1.1124220124780036e-05, + "loss": 0.5511, + "step": 562800 + }, + { + "epoch": 12.963986368241688, + "grad_norm": 3.6071760654449463, + "learning_rate": 1.111694856095752e-05, + "loss": 0.5501, + "step": 563000 + }, + { + "epoch": 12.968591691995947, + "grad_norm": 2.51938533782959, + "learning_rate": 1.1109676997135004e-05, + "loss": 0.5579, + "step": 563200 + }, + { + "epoch": 12.973197015750207, + "grad_norm": 3.228456497192383, + "learning_rate": 1.1102405433312489e-05, + "loss": 0.5607, + "step": 563400 + }, + { + "epoch": 12.977802339504468, + "grad_norm": 3.919980764389038, + "learning_rate": 1.1095133869489973e-05, + "loss": 0.5575, + "step": 563600 + }, + { + "epoch": 12.982407663258726, + "grad_norm": 3.511781930923462, + "learning_rate": 1.1087862305667457e-05, + "loss": 0.5527, + "step": 563800 + }, + { + "epoch": 12.987012987012987, + "grad_norm": 3.6616077423095703, + "learning_rate": 1.1080590741844941e-05, + "loss": 0.5486, + "step": 564000 + }, + { + "epoch": 12.991618310767247, + "grad_norm": 3.4475338459014893, + "learning_rate": 1.1073355535841538e-05, + "loss": 0.5548, + "step": 564200 + }, + { + "epoch": 12.996223634521506, + "grad_norm": 3.6094648838043213, + "learning_rate": 1.1066083972019022e-05, + "loss": 0.5447, + "step": 564400 + }, + { + "epoch": 13.0, + "eval_loss": 0.535080075263977, + "eval_runtime": 174.1646, + "eval_samples_per_second": 162.84, + "eval_steps_per_second": 10.18, + "step": 564564 + }, + { + "epoch": 13.000828958275767, + "grad_norm": 3.2227447032928467, + "learning_rate": 1.1058812408196507e-05, + "loss": 0.5408, + "step": 564600 + }, + { + "epoch": 13.005434282030027, + "grad_norm": 2.8370840549468994, + "learning_rate": 1.1051540844373992e-05, + "loss": 0.5382, + "step": 564800 + }, + { + "epoch": 13.010039605784286, + "grad_norm": 3.57401442527771, + "learning_rate": 1.1044269280551475e-05, + "loss": 0.5435, + "step": 565000 + }, + { + "epoch": 13.014644929538546, + "grad_norm": 3.077897310256958, + "learning_rate": 1.1037034074548072e-05, + "loss": 0.5478, + "step": 565200 + }, + { + "epoch": 13.019250253292807, + "grad_norm": 3.1777496337890625, + "learning_rate": 1.1029762510725556e-05, + "loss": 0.5281, + "step": 565400 + }, + { + "epoch": 13.023855577047067, + "grad_norm": 3.8741352558135986, + "learning_rate": 1.1022490946903042e-05, + "loss": 0.5499, + "step": 565600 + }, + { + "epoch": 13.028460900801326, + "grad_norm": 2.772387742996216, + "learning_rate": 1.1015219383080525e-05, + "loss": 0.5341, + "step": 565800 + }, + { + "epoch": 13.033066224555586, + "grad_norm": 2.9487922191619873, + "learning_rate": 1.1007947819258009e-05, + "loss": 0.5334, + "step": 566000 + }, + { + "epoch": 13.037671548309847, + "grad_norm": 2.795191764831543, + "learning_rate": 1.1000676255435495e-05, + "loss": 0.55, + "step": 566200 + }, + { + "epoch": 13.042276872064106, + "grad_norm": 2.9274868965148926, + "learning_rate": 1.0993404691612979e-05, + "loss": 0.5449, + "step": 566400 + }, + { + "epoch": 13.046882195818366, + "grad_norm": 2.8535702228546143, + "learning_rate": 1.0986133127790463e-05, + "loss": 0.5465, + "step": 566600 + }, + { + "epoch": 13.051487519572627, + "grad_norm": 2.9377224445343018, + "learning_rate": 1.0978861563967947e-05, + "loss": 0.5542, + "step": 566800 + }, + { + "epoch": 13.056092843326885, + "grad_norm": 3.308588743209839, + "learning_rate": 1.0971590000145432e-05, + "loss": 0.5451, + "step": 567000 + }, + { + "epoch": 13.060698167081146, + "grad_norm": 3.0085132122039795, + "learning_rate": 1.0964354794142028e-05, + "loss": 0.5511, + "step": 567200 + }, + { + "epoch": 13.065303490835406, + "grad_norm": 2.9090094566345215, + "learning_rate": 1.0957083230319513e-05, + "loss": 0.5345, + "step": 567400 + }, + { + "epoch": 13.069908814589665, + "grad_norm": 2.8537399768829346, + "learning_rate": 1.0949811666496997e-05, + "loss": 0.5553, + "step": 567600 + }, + { + "epoch": 13.074514138343925, + "grad_norm": 2.471475124359131, + "learning_rate": 1.0942540102674483e-05, + "loss": 0.5526, + "step": 567800 + }, + { + "epoch": 13.079119462098186, + "grad_norm": 3.00661301612854, + "learning_rate": 1.0935268538851965e-05, + "loss": 0.5402, + "step": 568000 + }, + { + "epoch": 13.083724785852445, + "grad_norm": 3.291968822479248, + "learning_rate": 1.092799697502945e-05, + "loss": 0.5389, + "step": 568200 + }, + { + "epoch": 13.088330109606705, + "grad_norm": 5.072547912597656, + "learning_rate": 1.0920725411206936e-05, + "loss": 0.5422, + "step": 568400 + }, + { + "epoch": 13.092935433360966, + "grad_norm": 2.9961328506469727, + "learning_rate": 1.0913453847384418e-05, + "loss": 0.5345, + "step": 568600 + }, + { + "epoch": 13.097540757115226, + "grad_norm": 2.3409829139709473, + "learning_rate": 1.0906182283561902e-05, + "loss": 0.5516, + "step": 568800 + }, + { + "epoch": 13.102146080869485, + "grad_norm": 2.485668897628784, + "learning_rate": 1.0898910719739388e-05, + "loss": 0.563, + "step": 569000 + }, + { + "epoch": 13.106751404623745, + "grad_norm": 2.9506261348724365, + "learning_rate": 1.089163915591687e-05, + "loss": 0.5452, + "step": 569200 + }, + { + "epoch": 13.111356728378006, + "grad_norm": 3.2650866508483887, + "learning_rate": 1.0884367592094357e-05, + "loss": 0.547, + "step": 569400 + }, + { + "epoch": 13.115962052132264, + "grad_norm": 4.075747489929199, + "learning_rate": 1.087709602827184e-05, + "loss": 0.5352, + "step": 569600 + }, + { + "epoch": 13.120567375886525, + "grad_norm": 3.905189275741577, + "learning_rate": 1.0869824464449325e-05, + "loss": 0.5458, + "step": 569800 + }, + { + "epoch": 13.125172699640785, + "grad_norm": 3.0449321269989014, + "learning_rate": 1.086255290062681e-05, + "loss": 0.5476, + "step": 570000 + }, + { + "epoch": 13.129778023395044, + "grad_norm": 5.308364391326904, + "learning_rate": 1.0855281336804293e-05, + "loss": 0.5482, + "step": 570200 + }, + { + "epoch": 13.134383347149305, + "grad_norm": 3.4296975135803223, + "learning_rate": 1.0848009772981778e-05, + "loss": 0.5381, + "step": 570400 + }, + { + "epoch": 13.138988670903565, + "grad_norm": 3.1632046699523926, + "learning_rate": 1.0840738209159262e-05, + "loss": 0.5482, + "step": 570600 + }, + { + "epoch": 13.143593994657824, + "grad_norm": 2.9227309226989746, + "learning_rate": 1.0833466645336746e-05, + "loss": 0.5479, + "step": 570800 + }, + { + "epoch": 13.148199318412084, + "grad_norm": 2.676102638244629, + "learning_rate": 1.0826195081514232e-05, + "loss": 0.5539, + "step": 571000 + }, + { + "epoch": 13.152804642166345, + "grad_norm": 3.266191244125366, + "learning_rate": 1.0818923517691715e-05, + "loss": 0.5409, + "step": 571200 + }, + { + "epoch": 13.157409965920603, + "grad_norm": 2.1656315326690674, + "learning_rate": 1.0811651953869199e-05, + "loss": 0.5465, + "step": 571400 + }, + { + "epoch": 13.162015289674864, + "grad_norm": 2.710066318511963, + "learning_rate": 1.0804380390046685e-05, + "loss": 0.5401, + "step": 571600 + }, + { + "epoch": 13.166620613429124, + "grad_norm": 2.9499149322509766, + "learning_rate": 1.0797108826224167e-05, + "loss": 0.5369, + "step": 571800 + }, + { + "epoch": 13.171225937183385, + "grad_norm": 3.0565083026885986, + "learning_rate": 1.0789837262401651e-05, + "loss": 0.5431, + "step": 572000 + }, + { + "epoch": 13.175831260937644, + "grad_norm": 2.4222073554992676, + "learning_rate": 1.0782602056398248e-05, + "loss": 0.5354, + "step": 572200 + }, + { + "epoch": 13.180436584691904, + "grad_norm": 3.2243409156799316, + "learning_rate": 1.0775330492575734e-05, + "loss": 0.5388, + "step": 572400 + }, + { + "epoch": 13.185041908446165, + "grad_norm": 3.4836127758026123, + "learning_rate": 1.0768058928753217e-05, + "loss": 0.5449, + "step": 572600 + }, + { + "epoch": 13.189647232200423, + "grad_norm": 2.60473370552063, + "learning_rate": 1.0760787364930703e-05, + "loss": 0.5426, + "step": 572800 + }, + { + "epoch": 13.194252555954684, + "grad_norm": 2.716947317123413, + "learning_rate": 1.0753515801108187e-05, + "loss": 0.5478, + "step": 573000 + }, + { + "epoch": 13.198857879708944, + "grad_norm": 3.650529623031616, + "learning_rate": 1.0746244237285671e-05, + "loss": 0.5549, + "step": 573200 + }, + { + "epoch": 13.203463203463203, + "grad_norm": 2.8756980895996094, + "learning_rate": 1.0738972673463155e-05, + "loss": 0.5316, + "step": 573400 + }, + { + "epoch": 13.208068527217463, + "grad_norm": 2.6897635459899902, + "learning_rate": 1.0731737467459752e-05, + "loss": 0.5562, + "step": 573600 + }, + { + "epoch": 13.212673850971724, + "grad_norm": 3.219444751739502, + "learning_rate": 1.0724465903637237e-05, + "loss": 0.5417, + "step": 573800 + }, + { + "epoch": 13.217279174725983, + "grad_norm": 3.117840051651001, + "learning_rate": 1.071719433981472e-05, + "loss": 0.5398, + "step": 574000 + }, + { + "epoch": 13.221884498480243, + "grad_norm": 4.030891418457031, + "learning_rate": 1.0709922775992205e-05, + "loss": 0.5512, + "step": 574200 + }, + { + "epoch": 13.226489822234504, + "grad_norm": 3.657562494277954, + "learning_rate": 1.070265121216969e-05, + "loss": 0.5485, + "step": 574400 + }, + { + "epoch": 13.231095145988762, + "grad_norm": 2.986097574234009, + "learning_rate": 1.0695379648347175e-05, + "loss": 0.5443, + "step": 574600 + }, + { + "epoch": 13.235700469743023, + "grad_norm": 3.578052043914795, + "learning_rate": 1.0688108084524658e-05, + "loss": 0.5537, + "step": 574800 + }, + { + "epoch": 13.240305793497283, + "grad_norm": 3.427138328552246, + "learning_rate": 1.0680836520702142e-05, + "loss": 0.5402, + "step": 575000 + }, + { + "epoch": 13.244911117251542, + "grad_norm": 3.066641330718994, + "learning_rate": 1.0673564956879628e-05, + "loss": 0.5465, + "step": 575200 + }, + { + "epoch": 13.249516441005802, + "grad_norm": 2.710906505584717, + "learning_rate": 1.066629339305711e-05, + "loss": 0.5569, + "step": 575400 + }, + { + "epoch": 13.254121764760063, + "grad_norm": 2.721404790878296, + "learning_rate": 1.0659021829234596e-05, + "loss": 0.5504, + "step": 575600 + }, + { + "epoch": 13.258727088514323, + "grad_norm": 4.378937244415283, + "learning_rate": 1.065175026541208e-05, + "loss": 0.5419, + "step": 575800 + }, + { + "epoch": 13.263332412268582, + "grad_norm": 2.9746248722076416, + "learning_rate": 1.0644478701589563e-05, + "loss": 0.5399, + "step": 576000 + }, + { + "epoch": 13.267937736022843, + "grad_norm": 3.093886613845825, + "learning_rate": 1.0637207137767049e-05, + "loss": 0.5506, + "step": 576200 + }, + { + "epoch": 13.272543059777103, + "grad_norm": 3.421165943145752, + "learning_rate": 1.0629935573944533e-05, + "loss": 0.5569, + "step": 576400 + }, + { + "epoch": 13.277148383531362, + "grad_norm": 2.879319906234741, + "learning_rate": 1.0622664010122017e-05, + "loss": 0.5379, + "step": 576600 + }, + { + "epoch": 13.281753707285622, + "grad_norm": 4.0503363609313965, + "learning_rate": 1.0615392446299501e-05, + "loss": 0.5518, + "step": 576800 + }, + { + "epoch": 13.286359031039883, + "grad_norm": 3.7506306171417236, + "learning_rate": 1.0608120882476986e-05, + "loss": 0.5454, + "step": 577000 + }, + { + "epoch": 13.290964354794141, + "grad_norm": 3.531977891921997, + "learning_rate": 1.0600849318654472e-05, + "loss": 0.5499, + "step": 577200 + }, + { + "epoch": 13.295569678548402, + "grad_norm": 3.0157346725463867, + "learning_rate": 1.0593614112651069e-05, + "loss": 0.5529, + "step": 577400 + }, + { + "epoch": 13.300175002302662, + "grad_norm": 2.793652057647705, + "learning_rate": 1.0586342548828551e-05, + "loss": 0.5458, + "step": 577600 + }, + { + "epoch": 13.304780326056921, + "grad_norm": 2.5756351947784424, + "learning_rate": 1.0579070985006035e-05, + "loss": 0.5509, + "step": 577800 + }, + { + "epoch": 13.309385649811182, + "grad_norm": 2.810288906097412, + "learning_rate": 1.0571799421183521e-05, + "loss": 0.5426, + "step": 578000 + }, + { + "epoch": 13.313990973565442, + "grad_norm": 2.495398759841919, + "learning_rate": 1.0564527857361004e-05, + "loss": 0.5526, + "step": 578200 + }, + { + "epoch": 13.3185962973197, + "grad_norm": 4.127913475036621, + "learning_rate": 1.0557256293538488e-05, + "loss": 0.5498, + "step": 578400 + }, + { + "epoch": 13.323201621073961, + "grad_norm": 3.074737071990967, + "learning_rate": 1.0549984729715974e-05, + "loss": 0.543, + "step": 578600 + }, + { + "epoch": 13.327806944828222, + "grad_norm": 2.928070306777954, + "learning_rate": 1.0542713165893456e-05, + "loss": 0.5449, + "step": 578800 + }, + { + "epoch": 13.332412268582482, + "grad_norm": 3.301532745361328, + "learning_rate": 1.0535441602070942e-05, + "loss": 0.5471, + "step": 579000 + }, + { + "epoch": 13.337017592336741, + "grad_norm": 3.355616331100464, + "learning_rate": 1.0528170038248427e-05, + "loss": 0.5447, + "step": 579200 + }, + { + "epoch": 13.341622916091001, + "grad_norm": 2.8991310596466064, + "learning_rate": 1.0520898474425909e-05, + "loss": 0.5405, + "step": 579400 + }, + { + "epoch": 13.346228239845262, + "grad_norm": 3.3360657691955566, + "learning_rate": 1.0513626910603395e-05, + "loss": 0.5585, + "step": 579600 + }, + { + "epoch": 13.35083356359952, + "grad_norm": 4.818231105804443, + "learning_rate": 1.050635534678088e-05, + "loss": 0.5512, + "step": 579800 + }, + { + "epoch": 13.355438887353781, + "grad_norm": 4.0637078285217285, + "learning_rate": 1.0499083782958362e-05, + "loss": 0.5499, + "step": 580000 + }, + { + "epoch": 13.360044211108042, + "grad_norm": 4.462865352630615, + "learning_rate": 1.0491812219135848e-05, + "loss": 0.5425, + "step": 580200 + }, + { + "epoch": 13.3646495348623, + "grad_norm": 2.5351686477661133, + "learning_rate": 1.0484540655313332e-05, + "loss": 0.5363, + "step": 580400 + }, + { + "epoch": 13.36925485861656, + "grad_norm": 3.533302068710327, + "learning_rate": 1.0477269091490818e-05, + "loss": 0.5424, + "step": 580600 + }, + { + "epoch": 13.373860182370821, + "grad_norm": 3.593005418777466, + "learning_rate": 1.04699975276683e-05, + "loss": 0.5491, + "step": 580800 + }, + { + "epoch": 13.37846550612508, + "grad_norm": 3.0207858085632324, + "learning_rate": 1.0462725963845784e-05, + "loss": 0.5451, + "step": 581000 + }, + { + "epoch": 13.38307082987934, + "grad_norm": 3.600573778152466, + "learning_rate": 1.045545440002327e-05, + "loss": 0.5409, + "step": 581200 + }, + { + "epoch": 13.3876761536336, + "grad_norm": 3.4477622509002686, + "learning_rate": 1.0448182836200753e-05, + "loss": 0.5515, + "step": 581400 + }, + { + "epoch": 13.39228147738786, + "grad_norm": 4.029882431030273, + "learning_rate": 1.044094763019735e-05, + "loss": 0.557, + "step": 581600 + }, + { + "epoch": 13.39688680114212, + "grad_norm": 3.970107316970825, + "learning_rate": 1.0433676066374834e-05, + "loss": 0.545, + "step": 581800 + }, + { + "epoch": 13.40149212489638, + "grad_norm": 3.111429214477539, + "learning_rate": 1.042640450255232e-05, + "loss": 0.5443, + "step": 582000 + }, + { + "epoch": 13.406097448650641, + "grad_norm": 3.304234027862549, + "learning_rate": 1.0419132938729802e-05, + "loss": 0.5425, + "step": 582200 + }, + { + "epoch": 13.4107027724049, + "grad_norm": 3.118661642074585, + "learning_rate": 1.0411861374907288e-05, + "loss": 0.5426, + "step": 582400 + }, + { + "epoch": 13.41530809615916, + "grad_norm": 3.274920701980591, + "learning_rate": 1.0404589811084773e-05, + "loss": 0.5526, + "step": 582600 + }, + { + "epoch": 13.41991341991342, + "grad_norm": 2.8108534812927246, + "learning_rate": 1.0397318247262255e-05, + "loss": 0.5496, + "step": 582800 + }, + { + "epoch": 13.42451874366768, + "grad_norm": 3.452916383743286, + "learning_rate": 1.0390046683439741e-05, + "loss": 0.5546, + "step": 583000 + }, + { + "epoch": 13.42912406742194, + "grad_norm": 2.889641284942627, + "learning_rate": 1.0382775119617225e-05, + "loss": 0.5399, + "step": 583200 + }, + { + "epoch": 13.4337293911762, + "grad_norm": 2.9781227111816406, + "learning_rate": 1.037550355579471e-05, + "loss": 0.5442, + "step": 583400 + }, + { + "epoch": 13.438334714930459, + "grad_norm": 2.3209664821624756, + "learning_rate": 1.0368231991972194e-05, + "loss": 0.5563, + "step": 583600 + }, + { + "epoch": 13.44294003868472, + "grad_norm": 3.272317409515381, + "learning_rate": 1.036099678596879e-05, + "loss": 0.5424, + "step": 583800 + }, + { + "epoch": 13.44754536243898, + "grad_norm": 3.984229803085327, + "learning_rate": 1.0353725222146275e-05, + "loss": 0.5377, + "step": 584000 + }, + { + "epoch": 13.452150686193239, + "grad_norm": 2.723288059234619, + "learning_rate": 1.034645365832376e-05, + "loss": 0.5353, + "step": 584200 + }, + { + "epoch": 13.4567560099475, + "grad_norm": 2.800933361053467, + "learning_rate": 1.0339218452320358e-05, + "loss": 0.5375, + "step": 584400 + }, + { + "epoch": 13.46136133370176, + "grad_norm": 8.197662353515625, + "learning_rate": 1.033194688849784e-05, + "loss": 0.5404, + "step": 584600 + }, + { + "epoch": 13.465966657456018, + "grad_norm": 2.963289737701416, + "learning_rate": 1.0324675324675324e-05, + "loss": 0.5413, + "step": 584800 + }, + { + "epoch": 13.470571981210279, + "grad_norm": 3.9458508491516113, + "learning_rate": 1.031740376085281e-05, + "loss": 0.5431, + "step": 585000 + }, + { + "epoch": 13.47517730496454, + "grad_norm": 2.880087375640869, + "learning_rate": 1.0310132197030293e-05, + "loss": 0.5559, + "step": 585200 + }, + { + "epoch": 13.479782628718798, + "grad_norm": 2.817814826965332, + "learning_rate": 1.0302860633207779e-05, + "loss": 0.5403, + "step": 585400 + }, + { + "epoch": 13.484387952473059, + "grad_norm": 3.2126080989837646, + "learning_rate": 1.0295589069385263e-05, + "loss": 0.536, + "step": 585600 + }, + { + "epoch": 13.488993276227319, + "grad_norm": 2.991262674331665, + "learning_rate": 1.0288317505562746e-05, + "loss": 0.5428, + "step": 585800 + }, + { + "epoch": 13.49359859998158, + "grad_norm": 2.65507435798645, + "learning_rate": 1.0281045941740231e-05, + "loss": 0.5384, + "step": 586000 + }, + { + "epoch": 13.498203923735838, + "grad_norm": 3.388256788253784, + "learning_rate": 1.0273774377917716e-05, + "loss": 0.5431, + "step": 586200 + }, + { + "epoch": 13.502809247490099, + "grad_norm": 3.2963063716888428, + "learning_rate": 1.0266502814095198e-05, + "loss": 0.5506, + "step": 586400 + }, + { + "epoch": 13.50741457124436, + "grad_norm": 3.947801351547241, + "learning_rate": 1.0259231250272684e-05, + "loss": 0.5419, + "step": 586600 + }, + { + "epoch": 13.512019894998618, + "grad_norm": 3.5557472705841064, + "learning_rate": 1.0251959686450168e-05, + "loss": 0.5538, + "step": 586800 + }, + { + "epoch": 13.516625218752878, + "grad_norm": 3.8408093452453613, + "learning_rate": 1.0244688122627653e-05, + "loss": 0.5394, + "step": 587000 + }, + { + "epoch": 13.521230542507139, + "grad_norm": 3.790393352508545, + "learning_rate": 1.0237416558805137e-05, + "loss": 0.5567, + "step": 587200 + }, + { + "epoch": 13.525835866261398, + "grad_norm": 2.450249195098877, + "learning_rate": 1.0230144994982621e-05, + "loss": 0.5475, + "step": 587400 + }, + { + "epoch": 13.530441190015658, + "grad_norm": 3.272376537322998, + "learning_rate": 1.0222873431160107e-05, + "loss": 0.5481, + "step": 587600 + }, + { + "epoch": 13.535046513769919, + "grad_norm": 3.2694225311279297, + "learning_rate": 1.021560186733759e-05, + "loss": 0.5467, + "step": 587800 + }, + { + "epoch": 13.539651837524177, + "grad_norm": 2.739555835723877, + "learning_rate": 1.0208330303515074e-05, + "loss": 0.541, + "step": 588000 + }, + { + "epoch": 13.544257161278438, + "grad_norm": 3.357004404067993, + "learning_rate": 1.020105873969256e-05, + "loss": 0.5549, + "step": 588200 + }, + { + "epoch": 13.548862485032698, + "grad_norm": 3.012406349182129, + "learning_rate": 1.0193823533689156e-05, + "loss": 0.5503, + "step": 588400 + }, + { + "epoch": 13.553467808786957, + "grad_norm": 2.7790162563323975, + "learning_rate": 1.0186551969866639e-05, + "loss": 0.5447, + "step": 588600 + }, + { + "epoch": 13.558073132541217, + "grad_norm": 3.0401885509490967, + "learning_rate": 1.0179280406044125e-05, + "loss": 0.5352, + "step": 588800 + }, + { + "epoch": 13.562678456295478, + "grad_norm": 3.2460315227508545, + "learning_rate": 1.0172008842221609e-05, + "loss": 0.5387, + "step": 589000 + }, + { + "epoch": 13.567283780049738, + "grad_norm": 3.198957681655884, + "learning_rate": 1.0164737278399092e-05, + "loss": 0.5492, + "step": 589200 + }, + { + "epoch": 13.571889103803997, + "grad_norm": 3.1791601181030273, + "learning_rate": 1.0157465714576578e-05, + "loss": 0.5488, + "step": 589400 + }, + { + "epoch": 13.576494427558258, + "grad_norm": 3.927855968475342, + "learning_rate": 1.0150194150754062e-05, + "loss": 0.5459, + "step": 589600 + }, + { + "epoch": 13.581099751312518, + "grad_norm": 3.2088983058929443, + "learning_rate": 1.0142922586931544e-05, + "loss": 0.5425, + "step": 589800 + }, + { + "epoch": 13.585705075066777, + "grad_norm": 3.1823244094848633, + "learning_rate": 1.013565102310903e-05, + "loss": 0.5521, + "step": 590000 + }, + { + "epoch": 13.590310398821037, + "grad_norm": 3.0992443561553955, + "learning_rate": 1.0128379459286514e-05, + "loss": 0.5455, + "step": 590200 + }, + { + "epoch": 13.594915722575298, + "grad_norm": 3.4294822216033936, + "learning_rate": 1.0121107895463999e-05, + "loss": 0.54, + "step": 590400 + }, + { + "epoch": 13.599521046329556, + "grad_norm": 3.575033187866211, + "learning_rate": 1.0113836331641483e-05, + "loss": 0.553, + "step": 590600 + }, + { + "epoch": 13.604126370083817, + "grad_norm": 2.823061466217041, + "learning_rate": 1.0106564767818967e-05, + "loss": 0.5422, + "step": 590800 + }, + { + "epoch": 13.608731693838077, + "grad_norm": 3.334573268890381, + "learning_rate": 1.0099293203996453e-05, + "loss": 0.5412, + "step": 591000 + }, + { + "epoch": 13.613337017592336, + "grad_norm": 3.7110838890075684, + "learning_rate": 1.0092021640173936e-05, + "loss": 0.5439, + "step": 591200 + }, + { + "epoch": 13.617942341346597, + "grad_norm": 2.9446306228637695, + "learning_rate": 1.008475007635142e-05, + "loss": 0.551, + "step": 591400 + }, + { + "epoch": 13.622547665100857, + "grad_norm": 3.2592220306396484, + "learning_rate": 1.0077478512528906e-05, + "loss": 0.546, + "step": 591600 + }, + { + "epoch": 13.627152988855116, + "grad_norm": 2.9190750122070312, + "learning_rate": 1.0070206948706388e-05, + "loss": 0.5515, + "step": 591800 + }, + { + "epoch": 13.631758312609376, + "grad_norm": 3.803069829940796, + "learning_rate": 1.0062935384883874e-05, + "loss": 0.5438, + "step": 592000 + }, + { + "epoch": 13.636363636363637, + "grad_norm": 3.252239942550659, + "learning_rate": 1.0055663821061358e-05, + "loss": 0.547, + "step": 592200 + }, + { + "epoch": 13.640968960117895, + "grad_norm": 3.070131778717041, + "learning_rate": 1.004839225723884e-05, + "loss": 0.5504, + "step": 592400 + }, + { + "epoch": 13.645574283872156, + "grad_norm": 3.4407927989959717, + "learning_rate": 1.0041120693416327e-05, + "loss": 0.5491, + "step": 592600 + }, + { + "epoch": 13.650179607626416, + "grad_norm": 3.2702760696411133, + "learning_rate": 1.0033849129593811e-05, + "loss": 0.5437, + "step": 592800 + }, + { + "epoch": 13.654784931380677, + "grad_norm": 3.0138094425201416, + "learning_rate": 1.0026577565771295e-05, + "loss": 0.5406, + "step": 593000 + }, + { + "epoch": 13.659390255134936, + "grad_norm": 3.4657397270202637, + "learning_rate": 1.0019342359767892e-05, + "loss": 0.5364, + "step": 593200 + }, + { + "epoch": 13.663995578889196, + "grad_norm": 2.7210536003112793, + "learning_rate": 1.0012070795945376e-05, + "loss": 0.5348, + "step": 593400 + }, + { + "epoch": 13.668600902643457, + "grad_norm": 2.796694278717041, + "learning_rate": 1.000479923212286e-05, + "loss": 0.5356, + "step": 593600 + }, + { + "epoch": 13.673206226397715, + "grad_norm": 3.0048344135284424, + "learning_rate": 9.997527668300345e-06, + "loss": 0.547, + "step": 593800 + }, + { + "epoch": 13.677811550151976, + "grad_norm": 3.452331781387329, + "learning_rate": 9.990292462296942e-06, + "loss": 0.5483, + "step": 594000 + }, + { + "epoch": 13.682416873906236, + "grad_norm": 2.8550798892974854, + "learning_rate": 9.983020898474426e-06, + "loss": 0.5466, + "step": 594200 + }, + { + "epoch": 13.687022197660495, + "grad_norm": 3.329484701156616, + "learning_rate": 9.97574933465191e-06, + "loss": 0.5404, + "step": 594400 + }, + { + "epoch": 13.691627521414755, + "grad_norm": 2.8781020641326904, + "learning_rate": 9.968477770829396e-06, + "loss": 0.5518, + "step": 594600 + }, + { + "epoch": 13.696232845169016, + "grad_norm": 3.1248257160186768, + "learning_rate": 9.961206207006879e-06, + "loss": 0.5421, + "step": 594800 + }, + { + "epoch": 13.700838168923275, + "grad_norm": 3.2857155799865723, + "learning_rate": 9.953934643184364e-06, + "loss": 0.5479, + "step": 595000 + }, + { + "epoch": 13.705443492677535, + "grad_norm": 3.693159818649292, + "learning_rate": 9.946663079361849e-06, + "loss": 0.5427, + "step": 595200 + }, + { + "epoch": 13.710048816431796, + "grad_norm": 3.3811700344085693, + "learning_rate": 9.939391515539331e-06, + "loss": 0.5495, + "step": 595400 + }, + { + "epoch": 13.714654140186056, + "grad_norm": 3.110826253890991, + "learning_rate": 9.932119951716817e-06, + "loss": 0.5327, + "step": 595600 + }, + { + "epoch": 13.719259463940315, + "grad_norm": 3.164827346801758, + "learning_rate": 9.924848387894301e-06, + "loss": 0.5373, + "step": 595800 + }, + { + "epoch": 13.723864787694575, + "grad_norm": 3.971219062805176, + "learning_rate": 9.917576824071784e-06, + "loss": 0.5416, + "step": 596000 + }, + { + "epoch": 13.728470111448836, + "grad_norm": 3.429321527481079, + "learning_rate": 9.91030526024927e-06, + "loss": 0.5397, + "step": 596200 + }, + { + "epoch": 13.733075435203094, + "grad_norm": 2.479889392852783, + "learning_rate": 9.903033696426754e-06, + "loss": 0.5494, + "step": 596400 + }, + { + "epoch": 13.737680758957355, + "grad_norm": 2.7341201305389404, + "learning_rate": 9.895762132604238e-06, + "loss": 0.5354, + "step": 596600 + }, + { + "epoch": 13.742286082711615, + "grad_norm": 3.743123769760132, + "learning_rate": 9.888490568781722e-06, + "loss": 0.5502, + "step": 596800 + }, + { + "epoch": 13.746891406465874, + "grad_norm": 2.8174068927764893, + "learning_rate": 9.88125536277832e-06, + "loss": 0.5447, + "step": 597000 + }, + { + "epoch": 13.751496730220135, + "grad_norm": 2.5518455505371094, + "learning_rate": 9.873983798955804e-06, + "loss": 0.5406, + "step": 597200 + }, + { + "epoch": 13.756102053974395, + "grad_norm": 2.6613545417785645, + "learning_rate": 9.866712235133288e-06, + "loss": 0.5436, + "step": 597400 + }, + { + "epoch": 13.760707377728654, + "grad_norm": 3.2584588527679443, + "learning_rate": 9.859440671310772e-06, + "loss": 0.5454, + "step": 597600 + }, + { + "epoch": 13.765312701482914, + "grad_norm": 3.6804134845733643, + "learning_rate": 9.852169107488256e-06, + "loss": 0.5415, + "step": 597800 + }, + { + "epoch": 13.769918025237175, + "grad_norm": 3.93707013130188, + "learning_rate": 9.844933901484853e-06, + "loss": 0.5405, + "step": 598000 + }, + { + "epoch": 13.774523348991433, + "grad_norm": 2.835728645324707, + "learning_rate": 9.837662337662339e-06, + "loss": 0.5355, + "step": 598200 + }, + { + "epoch": 13.779128672745694, + "grad_norm": 2.6848623752593994, + "learning_rate": 9.830390773839822e-06, + "loss": 0.5337, + "step": 598400 + }, + { + "epoch": 13.783733996499954, + "grad_norm": 3.4841365814208984, + "learning_rate": 9.823119210017307e-06, + "loss": 0.531, + "step": 598600 + }, + { + "epoch": 13.788339320254213, + "grad_norm": 2.816138982772827, + "learning_rate": 9.815847646194792e-06, + "loss": 0.5315, + "step": 598800 + }, + { + "epoch": 13.792944644008474, + "grad_norm": 2.922630786895752, + "learning_rate": 9.808576082372274e-06, + "loss": 0.5528, + "step": 599000 + }, + { + "epoch": 13.797549967762734, + "grad_norm": 3.093204975128174, + "learning_rate": 9.80130451854976e-06, + "loss": 0.5525, + "step": 599200 + }, + { + "epoch": 13.802155291516993, + "grad_norm": 3.1738221645355225, + "learning_rate": 9.794032954727244e-06, + "loss": 0.5269, + "step": 599400 + }, + { + "epoch": 13.806760615271253, + "grad_norm": 2.908632755279541, + "learning_rate": 9.786761390904727e-06, + "loss": 0.5307, + "step": 599600 + }, + { + "epoch": 13.811365939025514, + "grad_norm": 3.4265527725219727, + "learning_rate": 9.779489827082213e-06, + "loss": 0.5327, + "step": 599800 + }, + { + "epoch": 13.815971262779774, + "grad_norm": 3.252572774887085, + "learning_rate": 9.772218263259697e-06, + "loss": 0.5453, + "step": 600000 + }, + { + "epoch": 13.820576586534033, + "grad_norm": 3.13742733001709, + "learning_rate": 9.764946699437181e-06, + "loss": 0.546, + "step": 600200 + }, + { + "epoch": 13.825181910288293, + "grad_norm": 2.9839015007019043, + "learning_rate": 9.757675135614665e-06, + "loss": 0.5444, + "step": 600400 + }, + { + "epoch": 13.829787234042554, + "grad_norm": 2.5981061458587646, + "learning_rate": 9.75040357179215e-06, + "loss": 0.5294, + "step": 600600 + }, + { + "epoch": 13.834392557796813, + "grad_norm": 2.8224525451660156, + "learning_rate": 9.743132007969634e-06, + "loss": 0.5433, + "step": 600800 + }, + { + "epoch": 13.838997881551073, + "grad_norm": 2.841012477874756, + "learning_rate": 9.735860444147118e-06, + "loss": 0.5346, + "step": 601000 + }, + { + "epoch": 13.843603205305334, + "grad_norm": 5.748844146728516, + "learning_rate": 9.728588880324602e-06, + "loss": 0.5509, + "step": 601200 + }, + { + "epoch": 13.848208529059592, + "grad_norm": 2.9748475551605225, + "learning_rate": 9.721317316502088e-06, + "loss": 0.5386, + "step": 601400 + }, + { + "epoch": 13.852813852813853, + "grad_norm": 3.0845344066619873, + "learning_rate": 9.71404575267957e-06, + "loss": 0.5288, + "step": 601600 + }, + { + "epoch": 13.857419176568113, + "grad_norm": 2.9533259868621826, + "learning_rate": 9.706774188857057e-06, + "loss": 0.5563, + "step": 601800 + }, + { + "epoch": 13.862024500322372, + "grad_norm": 4.201800346374512, + "learning_rate": 9.699502625034541e-06, + "loss": 0.5445, + "step": 602000 + }, + { + "epoch": 13.866629824076632, + "grad_norm": 2.9285476207733154, + "learning_rate": 9.692231061212023e-06, + "loss": 0.5451, + "step": 602200 + }, + { + "epoch": 13.871235147830893, + "grad_norm": 4.119074821472168, + "learning_rate": 9.68495949738951e-06, + "loss": 0.5433, + "step": 602400 + }, + { + "epoch": 13.875840471585153, + "grad_norm": 4.315847396850586, + "learning_rate": 9.677687933566994e-06, + "loss": 0.5429, + "step": 602600 + }, + { + "epoch": 13.880445795339412, + "grad_norm": 3.370650053024292, + "learning_rate": 9.670416369744478e-06, + "loss": 0.538, + "step": 602800 + }, + { + "epoch": 13.885051119093673, + "grad_norm": 3.967122793197632, + "learning_rate": 9.663144805921962e-06, + "loss": 0.5346, + "step": 603000 + }, + { + "epoch": 13.889656442847933, + "grad_norm": 3.766850233078003, + "learning_rate": 9.655873242099446e-06, + "loss": 0.5345, + "step": 603200 + }, + { + "epoch": 13.894261766602192, + "grad_norm": 2.5367722511291504, + "learning_rate": 9.64860167827693e-06, + "loss": 0.5518, + "step": 603400 + }, + { + "epoch": 13.898867090356452, + "grad_norm": 3.3279225826263428, + "learning_rate": 9.641330114454415e-06, + "loss": 0.5508, + "step": 603600 + }, + { + "epoch": 13.903472414110713, + "grad_norm": 2.8402557373046875, + "learning_rate": 9.634058550631899e-06, + "loss": 0.5419, + "step": 603800 + }, + { + "epoch": 13.908077737864971, + "grad_norm": 2.7564351558685303, + "learning_rate": 9.626786986809383e-06, + "loss": 0.545, + "step": 604000 + }, + { + "epoch": 13.912683061619232, + "grad_norm": 2.57075572013855, + "learning_rate": 9.61955178080598e-06, + "loss": 0.5444, + "step": 604200 + }, + { + "epoch": 13.917288385373492, + "grad_norm": 3.027113676071167, + "learning_rate": 9.612280216983464e-06, + "loss": 0.5438, + "step": 604400 + }, + { + "epoch": 13.921893709127751, + "grad_norm": 3.521167516708374, + "learning_rate": 9.60500865316095e-06, + "loss": 0.5461, + "step": 604600 + }, + { + "epoch": 13.926499032882012, + "grad_norm": 3.0881049633026123, + "learning_rate": 9.597737089338434e-06, + "loss": 0.5481, + "step": 604800 + }, + { + "epoch": 13.931104356636272, + "grad_norm": 2.756943941116333, + "learning_rate": 9.590465525515917e-06, + "loss": 0.551, + "step": 605000 + }, + { + "epoch": 13.93570968039053, + "grad_norm": 2.7377870082855225, + "learning_rate": 9.583193961693403e-06, + "loss": 0.5483, + "step": 605200 + }, + { + "epoch": 13.940315004144791, + "grad_norm": 3.1983232498168945, + "learning_rate": 9.575922397870887e-06, + "loss": 0.5511, + "step": 605400 + }, + { + "epoch": 13.944920327899052, + "grad_norm": 2.8796114921569824, + "learning_rate": 9.56865083404837e-06, + "loss": 0.5355, + "step": 605600 + }, + { + "epoch": 13.94952565165331, + "grad_norm": 3.955488443374634, + "learning_rate": 9.561379270225855e-06, + "loss": 0.5489, + "step": 605800 + }, + { + "epoch": 13.95413097540757, + "grad_norm": 3.366757869720459, + "learning_rate": 9.55410770640334e-06, + "loss": 0.5402, + "step": 606000 + }, + { + "epoch": 13.958736299161831, + "grad_norm": 3.5042965412139893, + "learning_rate": 9.546836142580824e-06, + "loss": 0.5414, + "step": 606200 + }, + { + "epoch": 13.963341622916092, + "grad_norm": 4.029479026794434, + "learning_rate": 9.539564578758308e-06, + "loss": 0.5524, + "step": 606400 + }, + { + "epoch": 13.96794694667035, + "grad_norm": 3.883596897125244, + "learning_rate": 9.532293014935792e-06, + "loss": 0.5432, + "step": 606600 + }, + { + "epoch": 13.972552270424611, + "grad_norm": 3.188150644302368, + "learning_rate": 9.52505780893239e-06, + "loss": 0.5396, + "step": 606800 + }, + { + "epoch": 13.977157594178871, + "grad_norm": 2.3923470973968506, + "learning_rate": 9.517786245109873e-06, + "loss": 0.5358, + "step": 607000 + }, + { + "epoch": 13.98176291793313, + "grad_norm": 2.5199859142303467, + "learning_rate": 9.510514681287358e-06, + "loss": 0.5378, + "step": 607200 + }, + { + "epoch": 13.98636824168739, + "grad_norm": 3.514549970626831, + "learning_rate": 9.503279475283955e-06, + "loss": 0.5471, + "step": 607400 + }, + { + "epoch": 13.990973565441651, + "grad_norm": 3.0256316661834717, + "learning_rate": 9.496007911461439e-06, + "loss": 0.5447, + "step": 607600 + }, + { + "epoch": 13.99557888919591, + "grad_norm": 2.8945305347442627, + "learning_rate": 9.488736347638923e-06, + "loss": 0.5516, + "step": 607800 + }, + { + "epoch": 14.0, + "eval_loss": 0.5243151783943176, + "eval_runtime": 171.7907, + "eval_samples_per_second": 165.09, + "eval_steps_per_second": 10.321, + "step": 607992 + }, + { + "epoch": 14.00018421295017, + "grad_norm": 3.234140634536743, + "learning_rate": 9.481464783816407e-06, + "loss": 0.5465, + "step": 608000 + }, + { + "epoch": 14.00478953670443, + "grad_norm": 3.9970383644104004, + "learning_rate": 9.474193219993893e-06, + "loss": 0.5352, + "step": 608200 + }, + { + "epoch": 14.00939486045869, + "grad_norm": 2.708507537841797, + "learning_rate": 9.466921656171377e-06, + "loss": 0.5386, + "step": 608400 + }, + { + "epoch": 14.01400018421295, + "grad_norm": 3.3150854110717773, + "learning_rate": 9.45965009234886e-06, + "loss": 0.5356, + "step": 608600 + }, + { + "epoch": 14.01860550796721, + "grad_norm": 2.4977152347564697, + "learning_rate": 9.452378528526346e-06, + "loss": 0.5303, + "step": 608800 + }, + { + "epoch": 14.02321083172147, + "grad_norm": 2.6058592796325684, + "learning_rate": 9.44510696470383e-06, + "loss": 0.5452, + "step": 609000 + }, + { + "epoch": 14.02781615547573, + "grad_norm": 3.491718292236328, + "learning_rate": 9.437835400881313e-06, + "loss": 0.5431, + "step": 609200 + }, + { + "epoch": 14.03242147922999, + "grad_norm": 3.100562810897827, + "learning_rate": 9.430563837058798e-06, + "loss": 0.534, + "step": 609400 + }, + { + "epoch": 14.03702680298425, + "grad_norm": 3.014887809753418, + "learning_rate": 9.423292273236283e-06, + "loss": 0.5268, + "step": 609600 + }, + { + "epoch": 14.04163212673851, + "grad_norm": 4.120047569274902, + "learning_rate": 9.416020709413767e-06, + "loss": 0.5492, + "step": 609800 + }, + { + "epoch": 14.04623745049277, + "grad_norm": 3.292614698410034, + "learning_rate": 9.408749145591251e-06, + "loss": 0.5406, + "step": 610000 + }, + { + "epoch": 14.05084277424703, + "grad_norm": 2.325287342071533, + "learning_rate": 9.401477581768735e-06, + "loss": 0.5308, + "step": 610200 + }, + { + "epoch": 14.055448098001289, + "grad_norm": 2.72371244430542, + "learning_rate": 9.394242375765332e-06, + "loss": 0.5306, + "step": 610400 + }, + { + "epoch": 14.06005342175555, + "grad_norm": 2.5140206813812256, + "learning_rate": 9.386970811942816e-06, + "loss": 0.5412, + "step": 610600 + }, + { + "epoch": 14.06465874550981, + "grad_norm": 2.95316481590271, + "learning_rate": 9.3796992481203e-06, + "loss": 0.5408, + "step": 610800 + }, + { + "epoch": 14.069264069264069, + "grad_norm": 3.183243989944458, + "learning_rate": 9.372427684297785e-06, + "loss": 0.5371, + "step": 611000 + }, + { + "epoch": 14.07386939301833, + "grad_norm": 2.708509683609009, + "learning_rate": 9.365156120475269e-06, + "loss": 0.5479, + "step": 611200 + }, + { + "epoch": 14.07847471677259, + "grad_norm": 3.455313205718994, + "learning_rate": 9.357884556652753e-06, + "loss": 0.5429, + "step": 611400 + }, + { + "epoch": 14.083080040526848, + "grad_norm": 3.839599609375, + "learning_rate": 9.35061299283024e-06, + "loss": 0.5455, + "step": 611600 + }, + { + "epoch": 14.087685364281109, + "grad_norm": 3.0706214904785156, + "learning_rate": 9.343341429007722e-06, + "loss": 0.5382, + "step": 611800 + }, + { + "epoch": 14.09229068803537, + "grad_norm": 2.855938673019409, + "learning_rate": 9.336069865185206e-06, + "loss": 0.5346, + "step": 612000 + }, + { + "epoch": 14.096896011789628, + "grad_norm": 2.6702816486358643, + "learning_rate": 9.328798301362692e-06, + "loss": 0.5304, + "step": 612200 + }, + { + "epoch": 14.101501335543889, + "grad_norm": 2.650740146636963, + "learning_rate": 9.321526737540176e-06, + "loss": 0.549, + "step": 612400 + }, + { + "epoch": 14.106106659298149, + "grad_norm": 4.2644124031066895, + "learning_rate": 9.31425517371766e-06, + "loss": 0.5448, + "step": 612600 + }, + { + "epoch": 14.110711983052408, + "grad_norm": 4.463499546051025, + "learning_rate": 9.306983609895145e-06, + "loss": 0.5472, + "step": 612800 + }, + { + "epoch": 14.115317306806668, + "grad_norm": 3.099343776702881, + "learning_rate": 9.299712046072629e-06, + "loss": 0.5316, + "step": 613000 + }, + { + "epoch": 14.119922630560929, + "grad_norm": 3.3200466632843018, + "learning_rate": 9.292440482250113e-06, + "loss": 0.5424, + "step": 613200 + }, + { + "epoch": 14.12452795431519, + "grad_norm": 4.582034587860107, + "learning_rate": 9.285168918427597e-06, + "loss": 0.5322, + "step": 613400 + }, + { + "epoch": 14.129133278069448, + "grad_norm": 2.7270848751068115, + "learning_rate": 9.277897354605081e-06, + "loss": 0.5405, + "step": 613600 + }, + { + "epoch": 14.133738601823708, + "grad_norm": 3.561002731323242, + "learning_rate": 9.270625790782566e-06, + "loss": 0.5403, + "step": 613800 + }, + { + "epoch": 14.138343925577969, + "grad_norm": 2.916008710861206, + "learning_rate": 9.26335422696005e-06, + "loss": 0.5363, + "step": 614000 + }, + { + "epoch": 14.142949249332228, + "grad_norm": 3.4459245204925537, + "learning_rate": 9.256082663137536e-06, + "loss": 0.5467, + "step": 614200 + }, + { + "epoch": 14.147554573086488, + "grad_norm": 4.279280662536621, + "learning_rate": 9.248811099315018e-06, + "loss": 0.5397, + "step": 614400 + }, + { + "epoch": 14.152159896840748, + "grad_norm": 3.383220672607422, + "learning_rate": 9.241539535492503e-06, + "loss": 0.5442, + "step": 614600 + }, + { + "epoch": 14.156765220595007, + "grad_norm": 2.799022912979126, + "learning_rate": 9.234267971669988e-06, + "loss": 0.5455, + "step": 614800 + }, + { + "epoch": 14.161370544349268, + "grad_norm": 3.1966896057128906, + "learning_rate": 9.226996407847471e-06, + "loss": 0.5399, + "step": 615000 + }, + { + "epoch": 14.165975868103528, + "grad_norm": 3.2406864166259766, + "learning_rate": 9.219724844024955e-06, + "loss": 0.543, + "step": 615200 + }, + { + "epoch": 14.170581191857787, + "grad_norm": 2.572612762451172, + "learning_rate": 9.212453280202441e-06, + "loss": 0.5397, + "step": 615400 + }, + { + "epoch": 14.175186515612047, + "grad_norm": 2.9783430099487305, + "learning_rate": 9.205181716379925e-06, + "loss": 0.5372, + "step": 615600 + }, + { + "epoch": 14.179791839366308, + "grad_norm": 3.033306837081909, + "learning_rate": 9.19791015255741e-06, + "loss": 0.5424, + "step": 615800 + }, + { + "epoch": 14.184397163120567, + "grad_norm": 3.275662660598755, + "learning_rate": 9.190638588734894e-06, + "loss": 0.5333, + "step": 616000 + }, + { + "epoch": 14.189002486874827, + "grad_norm": 2.4978132247924805, + "learning_rate": 9.183367024912378e-06, + "loss": 0.5341, + "step": 616200 + }, + { + "epoch": 14.193607810629087, + "grad_norm": 2.4442851543426514, + "learning_rate": 9.176095461089862e-06, + "loss": 0.5327, + "step": 616400 + }, + { + "epoch": 14.198213134383348, + "grad_norm": 3.3033714294433594, + "learning_rate": 9.168823897267346e-06, + "loss": 0.527, + "step": 616600 + }, + { + "epoch": 14.202818458137607, + "grad_norm": 3.127436399459839, + "learning_rate": 9.16155233344483e-06, + "loss": 0.5371, + "step": 616800 + }, + { + "epoch": 14.207423781891867, + "grad_norm": 3.2301642894744873, + "learning_rate": 9.154280769622315e-06, + "loss": 0.537, + "step": 617000 + }, + { + "epoch": 14.212029105646128, + "grad_norm": 3.059586763381958, + "learning_rate": 9.147009205799799e-06, + "loss": 0.5396, + "step": 617200 + }, + { + "epoch": 14.216634429400386, + "grad_norm": 3.4187586307525635, + "learning_rate": 9.139773999796396e-06, + "loss": 0.5429, + "step": 617400 + }, + { + "epoch": 14.221239753154647, + "grad_norm": 2.5766658782958984, + "learning_rate": 9.132502435973882e-06, + "loss": 0.5241, + "step": 617600 + }, + { + "epoch": 14.225845076908907, + "grad_norm": 3.2574641704559326, + "learning_rate": 9.125230872151364e-06, + "loss": 0.5498, + "step": 617800 + }, + { + "epoch": 14.230450400663166, + "grad_norm": 3.0240838527679443, + "learning_rate": 9.117995666147961e-06, + "loss": 0.5397, + "step": 618000 + }, + { + "epoch": 14.235055724417427, + "grad_norm": 3.250147819519043, + "learning_rate": 9.110724102325446e-06, + "loss": 0.5319, + "step": 618200 + }, + { + "epoch": 14.239661048171687, + "grad_norm": 3.770249128341675, + "learning_rate": 9.103488896322042e-06, + "loss": 0.5388, + "step": 618400 + }, + { + "epoch": 14.244266371925946, + "grad_norm": 3.1490142345428467, + "learning_rate": 9.096217332499528e-06, + "loss": 0.5409, + "step": 618600 + }, + { + "epoch": 14.248871695680206, + "grad_norm": 3.5395240783691406, + "learning_rate": 9.088945768677011e-06, + "loss": 0.5395, + "step": 618800 + }, + { + "epoch": 14.253477019434467, + "grad_norm": 3.340034246444702, + "learning_rate": 9.081674204854495e-06, + "loss": 0.5388, + "step": 619000 + }, + { + "epoch": 14.258082343188725, + "grad_norm": 2.908026933670044, + "learning_rate": 9.074402641031981e-06, + "loss": 0.5424, + "step": 619200 + }, + { + "epoch": 14.262687666942986, + "grad_norm": 2.7248940467834473, + "learning_rate": 9.067131077209465e-06, + "loss": 0.5236, + "step": 619400 + }, + { + "epoch": 14.267292990697246, + "grad_norm": 3.4366588592529297, + "learning_rate": 9.05985951338695e-06, + "loss": 0.5526, + "step": 619600 + }, + { + "epoch": 14.271898314451505, + "grad_norm": 3.0255722999572754, + "learning_rate": 9.052587949564434e-06, + "loss": 0.5282, + "step": 619800 + }, + { + "epoch": 14.276503638205766, + "grad_norm": 3.2476325035095215, + "learning_rate": 9.045316385741918e-06, + "loss": 0.5384, + "step": 620000 + }, + { + "epoch": 14.281108961960026, + "grad_norm": 3.1371731758117676, + "learning_rate": 9.038044821919402e-06, + "loss": 0.539, + "step": 620200 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 2.5684289932250977, + "learning_rate": 9.030773258096886e-06, + "loss": 0.5288, + "step": 620400 + }, + { + "epoch": 14.290319609468545, + "grad_norm": 3.3922548294067383, + "learning_rate": 9.023501694274372e-06, + "loss": 0.5344, + "step": 620600 + }, + { + "epoch": 14.294924933222806, + "grad_norm": 2.9498867988586426, + "learning_rate": 9.016230130451855e-06, + "loss": 0.5327, + "step": 620800 + }, + { + "epoch": 14.299530256977066, + "grad_norm": 3.503101110458374, + "learning_rate": 9.008958566629339e-06, + "loss": 0.5408, + "step": 621000 + }, + { + "epoch": 14.304135580731325, + "grad_norm": 2.9323580265045166, + "learning_rate": 9.001687002806825e-06, + "loss": 0.5407, + "step": 621200 + }, + { + "epoch": 14.308740904485585, + "grad_norm": 3.0948574542999268, + "learning_rate": 8.994415438984307e-06, + "loss": 0.543, + "step": 621400 + }, + { + "epoch": 14.313346228239846, + "grad_norm": 2.787778854370117, + "learning_rate": 8.987180232980904e-06, + "loss": 0.5334, + "step": 621600 + }, + { + "epoch": 14.317951551994105, + "grad_norm": 2.2373883724212646, + "learning_rate": 8.979908669158389e-06, + "loss": 0.5355, + "step": 621800 + }, + { + "epoch": 14.322556875748365, + "grad_norm": 3.0317771434783936, + "learning_rate": 8.972637105335874e-06, + "loss": 0.5372, + "step": 622000 + }, + { + "epoch": 14.327162199502625, + "grad_norm": 3.486945152282715, + "learning_rate": 8.965365541513357e-06, + "loss": 0.5294, + "step": 622200 + }, + { + "epoch": 14.331767523256884, + "grad_norm": 3.084419012069702, + "learning_rate": 8.958093977690843e-06, + "loss": 0.545, + "step": 622400 + }, + { + "epoch": 14.336372847011145, + "grad_norm": 3.271340847015381, + "learning_rate": 8.950822413868327e-06, + "loss": 0.5436, + "step": 622600 + }, + { + "epoch": 14.340978170765405, + "grad_norm": 2.878502607345581, + "learning_rate": 8.943550850045811e-06, + "loss": 0.5316, + "step": 622800 + }, + { + "epoch": 14.345583494519666, + "grad_norm": 3.714315414428711, + "learning_rate": 8.936279286223296e-06, + "loss": 0.5538, + "step": 623000 + }, + { + "epoch": 14.350188818273924, + "grad_norm": 3.77945876121521, + "learning_rate": 8.92900772240078e-06, + "loss": 0.5426, + "step": 623200 + }, + { + "epoch": 14.354794142028185, + "grad_norm": 3.177495241165161, + "learning_rate": 8.921736158578264e-06, + "loss": 0.5407, + "step": 623400 + }, + { + "epoch": 14.359399465782445, + "grad_norm": 2.679957151412964, + "learning_rate": 8.914464594755748e-06, + "loss": 0.5439, + "step": 623600 + }, + { + "epoch": 14.364004789536704, + "grad_norm": 3.1106204986572266, + "learning_rate": 8.907193030933232e-06, + "loss": 0.5319, + "step": 623800 + }, + { + "epoch": 14.368610113290964, + "grad_norm": 4.358025550842285, + "learning_rate": 8.899921467110718e-06, + "loss": 0.5272, + "step": 624000 + }, + { + "epoch": 14.373215437045225, + "grad_norm": 3.5645134449005127, + "learning_rate": 8.892649903288201e-06, + "loss": 0.5265, + "step": 624200 + }, + { + "epoch": 14.377820760799484, + "grad_norm": 3.1196436882019043, + "learning_rate": 8.885378339465685e-06, + "loss": 0.5261, + "step": 624400 + }, + { + "epoch": 14.382426084553744, + "grad_norm": 3.206526517868042, + "learning_rate": 8.878106775643171e-06, + "loss": 0.5461, + "step": 624600 + }, + { + "epoch": 14.387031408308005, + "grad_norm": 3.2251393795013428, + "learning_rate": 8.870835211820654e-06, + "loss": 0.5388, + "step": 624800 + }, + { + "epoch": 14.391636732062263, + "grad_norm": 2.812065362930298, + "learning_rate": 8.86360000581725e-06, + "loss": 0.5345, + "step": 625000 + }, + { + "epoch": 14.396242055816524, + "grad_norm": 3.157151460647583, + "learning_rate": 8.856328441994735e-06, + "loss": 0.5436, + "step": 625200 + }, + { + "epoch": 14.400847379570784, + "grad_norm": 3.392270088195801, + "learning_rate": 8.84905687817222e-06, + "loss": 0.527, + "step": 625400 + }, + { + "epoch": 14.405452703325043, + "grad_norm": 2.6218199729919434, + "learning_rate": 8.841785314349703e-06, + "loss": 0.5309, + "step": 625600 + }, + { + "epoch": 14.410058027079303, + "grad_norm": 2.878702402114868, + "learning_rate": 8.834513750527189e-06, + "loss": 0.5339, + "step": 625800 + }, + { + "epoch": 14.414663350833564, + "grad_norm": 3.8115878105163574, + "learning_rate": 8.827242186704673e-06, + "loss": 0.5313, + "step": 626000 + }, + { + "epoch": 14.419268674587823, + "grad_norm": 3.6281392574310303, + "learning_rate": 8.819970622882157e-06, + "loss": 0.5352, + "step": 626200 + }, + { + "epoch": 14.423873998342083, + "grad_norm": 3.010791301727295, + "learning_rate": 8.812699059059642e-06, + "loss": 0.5449, + "step": 626400 + }, + { + "epoch": 14.428479322096344, + "grad_norm": 3.1626334190368652, + "learning_rate": 8.805427495237126e-06, + "loss": 0.5427, + "step": 626600 + }, + { + "epoch": 14.433084645850604, + "grad_norm": 4.617162704467773, + "learning_rate": 8.79815593141461e-06, + "loss": 0.5296, + "step": 626800 + }, + { + "epoch": 14.437689969604863, + "grad_norm": 2.7026705741882324, + "learning_rate": 8.790884367592094e-06, + "loss": 0.5386, + "step": 627000 + }, + { + "epoch": 14.442295293359123, + "grad_norm": 2.913951873779297, + "learning_rate": 8.783612803769579e-06, + "loss": 0.5436, + "step": 627200 + }, + { + "epoch": 14.446900617113384, + "grad_norm": 3.2576162815093994, + "learning_rate": 8.776341239947064e-06, + "loss": 0.5406, + "step": 627400 + }, + { + "epoch": 14.451505940867643, + "grad_norm": 2.7531659603118896, + "learning_rate": 8.769069676124547e-06, + "loss": 0.5552, + "step": 627600 + }, + { + "epoch": 14.456111264621903, + "grad_norm": 3.288334608078003, + "learning_rate": 8.761798112302031e-06, + "loss": 0.5285, + "step": 627800 + }, + { + "epoch": 14.460716588376163, + "grad_norm": 2.9675936698913574, + "learning_rate": 8.754526548479517e-06, + "loss": 0.5303, + "step": 628000 + }, + { + "epoch": 14.465321912130422, + "grad_norm": 3.810732364654541, + "learning_rate": 8.747254984657e-06, + "loss": 0.5309, + "step": 628200 + }, + { + "epoch": 14.469927235884683, + "grad_norm": 3.084245204925537, + "learning_rate": 8.739983420834486e-06, + "loss": 0.53, + "step": 628400 + }, + { + "epoch": 14.474532559638943, + "grad_norm": 3.4375557899475098, + "learning_rate": 8.73274821483108e-06, + "loss": 0.5417, + "step": 628600 + }, + { + "epoch": 14.479137883393202, + "grad_norm": 4.032174587249756, + "learning_rate": 8.725476651008567e-06, + "loss": 0.5392, + "step": 628800 + }, + { + "epoch": 14.483743207147462, + "grad_norm": 3.3805575370788574, + "learning_rate": 8.71820508718605e-06, + "loss": 0.5307, + "step": 629000 + }, + { + "epoch": 14.488348530901723, + "grad_norm": 3.0205650329589844, + "learning_rate": 8.710933523363535e-06, + "loss": 0.5383, + "step": 629200 + }, + { + "epoch": 14.492953854655982, + "grad_norm": 2.994614601135254, + "learning_rate": 8.70366195954102e-06, + "loss": 0.543, + "step": 629400 + }, + { + "epoch": 14.497559178410242, + "grad_norm": 3.1769533157348633, + "learning_rate": 8.696390395718504e-06, + "loss": 0.5316, + "step": 629600 + }, + { + "epoch": 14.502164502164502, + "grad_norm": 2.6771903038024902, + "learning_rate": 8.689118831895988e-06, + "loss": 0.5426, + "step": 629800 + }, + { + "epoch": 14.506769825918763, + "grad_norm": 2.680912494659424, + "learning_rate": 8.681847268073472e-06, + "loss": 0.5232, + "step": 630000 + }, + { + "epoch": 14.511375149673022, + "grad_norm": 2.9476630687713623, + "learning_rate": 8.674575704250958e-06, + "loss": 0.5455, + "step": 630200 + }, + { + "epoch": 14.515980473427282, + "grad_norm": 4.284652233123779, + "learning_rate": 8.66730414042844e-06, + "loss": 0.5425, + "step": 630400 + }, + { + "epoch": 14.520585797181543, + "grad_norm": 3.3897085189819336, + "learning_rate": 8.660032576605925e-06, + "loss": 0.5245, + "step": 630600 + }, + { + "epoch": 14.525191120935801, + "grad_norm": 2.838503122329712, + "learning_rate": 8.65276101278341e-06, + "loss": 0.5334, + "step": 630800 + }, + { + "epoch": 14.529796444690062, + "grad_norm": 2.7507781982421875, + "learning_rate": 8.645489448960893e-06, + "loss": 0.5292, + "step": 631000 + }, + { + "epoch": 14.534401768444322, + "grad_norm": 4.586523532867432, + "learning_rate": 8.638217885138377e-06, + "loss": 0.543, + "step": 631200 + }, + { + "epoch": 14.539007092198581, + "grad_norm": 2.6941652297973633, + "learning_rate": 8.630946321315863e-06, + "loss": 0.5372, + "step": 631400 + }, + { + "epoch": 14.543612415952841, + "grad_norm": 2.9810523986816406, + "learning_rate": 8.62371111531246e-06, + "loss": 0.5382, + "step": 631600 + }, + { + "epoch": 14.548217739707102, + "grad_norm": 2.956925868988037, + "learning_rate": 8.616439551489943e-06, + "loss": 0.5463, + "step": 631800 + }, + { + "epoch": 14.55282306346136, + "grad_norm": 2.96683669090271, + "learning_rate": 8.609167987667429e-06, + "loss": 0.5295, + "step": 632000 + }, + { + "epoch": 14.557428387215621, + "grad_norm": 3.31773042678833, + "learning_rate": 8.601896423844913e-06, + "loss": 0.5294, + "step": 632200 + }, + { + "epoch": 14.562033710969882, + "grad_norm": 3.2458245754241943, + "learning_rate": 8.594624860022395e-06, + "loss": 0.5335, + "step": 632400 + }, + { + "epoch": 14.56663903472414, + "grad_norm": 3.0680081844329834, + "learning_rate": 8.587353296199881e-06, + "loss": 0.547, + "step": 632600 + }, + { + "epoch": 14.5712443584784, + "grad_norm": 3.1158411502838135, + "learning_rate": 8.580081732377365e-06, + "loss": 0.555, + "step": 632800 + }, + { + "epoch": 14.575849682232661, + "grad_norm": 4.015247821807861, + "learning_rate": 8.57281016855485e-06, + "loss": 0.5307, + "step": 633000 + }, + { + "epoch": 14.58045500598692, + "grad_norm": 3.309248924255371, + "learning_rate": 8.565538604732334e-06, + "loss": 0.5387, + "step": 633200 + }, + { + "epoch": 14.58506032974118, + "grad_norm": 3.6629464626312256, + "learning_rate": 8.558267040909818e-06, + "loss": 0.5384, + "step": 633400 + }, + { + "epoch": 14.589665653495441, + "grad_norm": 3.6036899089813232, + "learning_rate": 8.550995477087304e-06, + "loss": 0.5417, + "step": 633600 + }, + { + "epoch": 14.594270977249701, + "grad_norm": 2.716386556625366, + "learning_rate": 8.543723913264787e-06, + "loss": 0.5361, + "step": 633800 + }, + { + "epoch": 14.59887630100396, + "grad_norm": 2.9629690647125244, + "learning_rate": 8.53645234944227e-06, + "loss": 0.5406, + "step": 634000 + }, + { + "epoch": 14.60348162475822, + "grad_norm": 3.4902899265289307, + "learning_rate": 8.529180785619757e-06, + "loss": 0.5288, + "step": 634200 + }, + { + "epoch": 14.608086948512481, + "grad_norm": 3.307722568511963, + "learning_rate": 8.52190922179724e-06, + "loss": 0.54, + "step": 634400 + }, + { + "epoch": 14.61269227226674, + "grad_norm": 2.7474825382232666, + "learning_rate": 8.514637657974723e-06, + "loss": 0.5514, + "step": 634600 + }, + { + "epoch": 14.617297596021, + "grad_norm": 3.100318431854248, + "learning_rate": 8.50736609415221e-06, + "loss": 0.5373, + "step": 634800 + }, + { + "epoch": 14.62190291977526, + "grad_norm": 3.0019752979278564, + "learning_rate": 8.500094530329692e-06, + "loss": 0.5312, + "step": 635000 + }, + { + "epoch": 14.62650824352952, + "grad_norm": 3.607322931289673, + "learning_rate": 8.492822966507178e-06, + "loss": 0.5222, + "step": 635200 + }, + { + "epoch": 14.63111356728378, + "grad_norm": 2.8540308475494385, + "learning_rate": 8.485551402684662e-06, + "loss": 0.5392, + "step": 635400 + }, + { + "epoch": 14.63571889103804, + "grad_norm": 2.7987473011016846, + "learning_rate": 8.478279838862145e-06, + "loss": 0.5433, + "step": 635600 + }, + { + "epoch": 14.6403242147923, + "grad_norm": 3.4120616912841797, + "learning_rate": 8.471044632858741e-06, + "loss": 0.5306, + "step": 635800 + }, + { + "epoch": 14.64492953854656, + "grad_norm": 2.813546657562256, + "learning_rate": 8.463773069036227e-06, + "loss": 0.5319, + "step": 636000 + }, + { + "epoch": 14.64953486230082, + "grad_norm": 3.185399293899536, + "learning_rate": 8.456501505213712e-06, + "loss": 0.5474, + "step": 636200 + }, + { + "epoch": 14.654140186055079, + "grad_norm": 3.0868401527404785, + "learning_rate": 8.449229941391194e-06, + "loss": 0.5404, + "step": 636400 + }, + { + "epoch": 14.65874550980934, + "grad_norm": 3.2732975482940674, + "learning_rate": 8.44195837756868e-06, + "loss": 0.536, + "step": 636600 + }, + { + "epoch": 14.6633508335636, + "grad_norm": 3.1363329887390137, + "learning_rate": 8.434686813746164e-06, + "loss": 0.5395, + "step": 636800 + }, + { + "epoch": 14.66795615731786, + "grad_norm": 3.2439472675323486, + "learning_rate": 8.42741524992365e-06, + "loss": 0.5405, + "step": 637000 + }, + { + "epoch": 14.672561481072119, + "grad_norm": 2.4419994354248047, + "learning_rate": 8.420180043920247e-06, + "loss": 0.5228, + "step": 637200 + }, + { + "epoch": 14.67716680482638, + "grad_norm": 4.0651044845581055, + "learning_rate": 8.41290848009773e-06, + "loss": 0.5441, + "step": 637400 + }, + { + "epoch": 14.68177212858064, + "grad_norm": 3.520622730255127, + "learning_rate": 8.405636916275214e-06, + "loss": 0.5334, + "step": 637600 + }, + { + "epoch": 14.686377452334899, + "grad_norm": 2.7496581077575684, + "learning_rate": 8.3983653524527e-06, + "loss": 0.5408, + "step": 637800 + }, + { + "epoch": 14.69098277608916, + "grad_norm": 2.8176567554473877, + "learning_rate": 8.391093788630182e-06, + "loss": 0.5329, + "step": 638000 + }, + { + "epoch": 14.69558809984342, + "grad_norm": 3.117527484893799, + "learning_rate": 8.383822224807668e-06, + "loss": 0.5327, + "step": 638200 + }, + { + "epoch": 14.700193423597678, + "grad_norm": 3.2691383361816406, + "learning_rate": 8.376550660985152e-06, + "loss": 0.5375, + "step": 638400 + }, + { + "epoch": 14.704798747351939, + "grad_norm": 3.023637533187866, + "learning_rate": 8.369279097162635e-06, + "loss": 0.535, + "step": 638600 + }, + { + "epoch": 14.7094040711062, + "grad_norm": 4.642063140869141, + "learning_rate": 8.36200753334012e-06, + "loss": 0.5324, + "step": 638800 + }, + { + "epoch": 14.714009394860458, + "grad_norm": 3.2362873554229736, + "learning_rate": 8.354735969517605e-06, + "loss": 0.5345, + "step": 639000 + }, + { + "epoch": 14.718614718614718, + "grad_norm": 3.0052409172058105, + "learning_rate": 8.347464405695088e-06, + "loss": 0.5431, + "step": 639200 + }, + { + "epoch": 14.723220042368979, + "grad_norm": 2.905585765838623, + "learning_rate": 8.340229199691684e-06, + "loss": 0.5339, + "step": 639400 + }, + { + "epoch": 14.727825366123238, + "grad_norm": 3.1975319385528564, + "learning_rate": 8.33295763586917e-06, + "loss": 0.5351, + "step": 639600 + }, + { + "epoch": 14.732430689877498, + "grad_norm": 3.070645809173584, + "learning_rate": 8.325686072046655e-06, + "loss": 0.5395, + "step": 639800 + }, + { + "epoch": 14.737036013631759, + "grad_norm": 4.0564351081848145, + "learning_rate": 8.318414508224139e-06, + "loss": 0.542, + "step": 640000 + }, + { + "epoch": 14.741641337386017, + "grad_norm": 3.2501471042633057, + "learning_rate": 8.311142944401623e-06, + "loss": 0.5381, + "step": 640200 + }, + { + "epoch": 14.746246661140278, + "grad_norm": 2.8298182487487793, + "learning_rate": 8.303871380579107e-06, + "loss": 0.5309, + "step": 640400 + }, + { + "epoch": 14.750851984894538, + "grad_norm": 2.915964365005493, + "learning_rate": 8.296599816756593e-06, + "loss": 0.5458, + "step": 640600 + }, + { + "epoch": 14.755457308648799, + "grad_norm": 3.227104425430298, + "learning_rate": 8.289328252934076e-06, + "loss": 0.5375, + "step": 640800 + }, + { + "epoch": 14.760062632403057, + "grad_norm": 3.489304304122925, + "learning_rate": 8.28205668911156e-06, + "loss": 0.5386, + "step": 641000 + }, + { + "epoch": 14.764667956157318, + "grad_norm": 2.84167218208313, + "learning_rate": 8.274785125289046e-06, + "loss": 0.5288, + "step": 641200 + }, + { + "epoch": 14.769273279911578, + "grad_norm": 3.9559543132781982, + "learning_rate": 8.267513561466528e-06, + "loss": 0.5479, + "step": 641400 + }, + { + "epoch": 14.773878603665837, + "grad_norm": 3.0262818336486816, + "learning_rate": 8.260241997644014e-06, + "loss": 0.5233, + "step": 641600 + }, + { + "epoch": 14.778483927420098, + "grad_norm": 2.9113521575927734, + "learning_rate": 8.252970433821499e-06, + "loss": 0.545, + "step": 641800 + }, + { + "epoch": 14.783089251174358, + "grad_norm": 3.577045440673828, + "learning_rate": 8.245698869998981e-06, + "loss": 0.5376, + "step": 642000 + }, + { + "epoch": 14.787694574928617, + "grad_norm": 2.9530036449432373, + "learning_rate": 8.238427306176467e-06, + "loss": 0.5402, + "step": 642200 + }, + { + "epoch": 14.792299898682877, + "grad_norm": 2.98638916015625, + "learning_rate": 8.231155742353951e-06, + "loss": 0.5326, + "step": 642400 + }, + { + "epoch": 14.796905222437138, + "grad_norm": 3.20383620262146, + "learning_rate": 8.223884178531434e-06, + "loss": 0.527, + "step": 642600 + }, + { + "epoch": 14.801510546191397, + "grad_norm": 3.5670669078826904, + "learning_rate": 8.21661261470892e-06, + "loss": 0.54, + "step": 642800 + }, + { + "epoch": 14.806115869945657, + "grad_norm": 3.084299087524414, + "learning_rate": 8.209377408705517e-06, + "loss": 0.5414, + "step": 643000 + }, + { + "epoch": 14.810721193699917, + "grad_norm": 3.610342264175415, + "learning_rate": 8.202105844883e-06, + "loss": 0.547, + "step": 643200 + }, + { + "epoch": 14.815326517454178, + "grad_norm": 3.762600898742676, + "learning_rate": 8.194834281060485e-06, + "loss": 0.5448, + "step": 643400 + }, + { + "epoch": 14.819931841208437, + "grad_norm": 2.416067600250244, + "learning_rate": 8.18756271723797e-06, + "loss": 0.5423, + "step": 643600 + }, + { + "epoch": 14.824537164962697, + "grad_norm": 3.5616204738616943, + "learning_rate": 8.180291153415453e-06, + "loss": 0.5233, + "step": 643800 + }, + { + "epoch": 14.829142488716958, + "grad_norm": 2.5570333003997803, + "learning_rate": 8.17301958959294e-06, + "loss": 0.537, + "step": 644000 + }, + { + "epoch": 14.833747812471216, + "grad_norm": 3.0077576637268066, + "learning_rate": 8.165784383589536e-06, + "loss": 0.546, + "step": 644200 + }, + { + "epoch": 14.838353136225477, + "grad_norm": 2.393003463745117, + "learning_rate": 8.158512819767019e-06, + "loss": 0.5329, + "step": 644400 + }, + { + "epoch": 14.842958459979737, + "grad_norm": 3.575730085372925, + "learning_rate": 8.151241255944503e-06, + "loss": 0.5481, + "step": 644600 + }, + { + "epoch": 14.847563783733996, + "grad_norm": 3.534426689147949, + "learning_rate": 8.143969692121989e-06, + "loss": 0.5373, + "step": 644800 + }, + { + "epoch": 14.852169107488256, + "grad_norm": 2.990290880203247, + "learning_rate": 8.136698128299471e-06, + "loss": 0.5358, + "step": 645000 + }, + { + "epoch": 14.856774431242517, + "grad_norm": 2.6674997806549072, + "learning_rate": 8.129426564476957e-06, + "loss": 0.545, + "step": 645200 + }, + { + "epoch": 14.861379754996776, + "grad_norm": 2.798583745956421, + "learning_rate": 8.122155000654442e-06, + "loss": 0.5366, + "step": 645400 + }, + { + "epoch": 14.865985078751036, + "grad_norm": 2.496399164199829, + "learning_rate": 8.114883436831924e-06, + "loss": 0.5419, + "step": 645600 + }, + { + "epoch": 14.870590402505297, + "grad_norm": 2.99106502532959, + "learning_rate": 8.10761187300941e-06, + "loss": 0.5414, + "step": 645800 + }, + { + "epoch": 14.875195726259555, + "grad_norm": 3.370936393737793, + "learning_rate": 8.100340309186894e-06, + "loss": 0.5246, + "step": 646000 + }, + { + "epoch": 14.879801050013816, + "grad_norm": 2.571732521057129, + "learning_rate": 8.093068745364378e-06, + "loss": 0.5323, + "step": 646200 + }, + { + "epoch": 14.884406373768076, + "grad_norm": 4.192697525024414, + "learning_rate": 8.085797181541863e-06, + "loss": 0.5265, + "step": 646400 + }, + { + "epoch": 14.889011697522335, + "grad_norm": 3.2478342056274414, + "learning_rate": 8.078525617719347e-06, + "loss": 0.5266, + "step": 646600 + }, + { + "epoch": 14.893617021276595, + "grad_norm": 3.7629292011260986, + "learning_rate": 8.071254053896831e-06, + "loss": 0.5455, + "step": 646800 + }, + { + "epoch": 14.898222345030856, + "grad_norm": 3.1777334213256836, + "learning_rate": 8.063982490074315e-06, + "loss": 0.5255, + "step": 647000 + }, + { + "epoch": 14.902827668785115, + "grad_norm": 3.3058884143829346, + "learning_rate": 8.0567109262518e-06, + "loss": 0.5519, + "step": 647200 + }, + { + "epoch": 14.907432992539375, + "grad_norm": 3.7439780235290527, + "learning_rate": 8.049439362429285e-06, + "loss": 0.545, + "step": 647400 + }, + { + "epoch": 14.912038316293636, + "grad_norm": 3.925713062286377, + "learning_rate": 8.042167798606768e-06, + "loss": 0.5236, + "step": 647600 + }, + { + "epoch": 14.916643640047896, + "grad_norm": 3.196169376373291, + "learning_rate": 8.034896234784254e-06, + "loss": 0.541, + "step": 647800 + }, + { + "epoch": 14.921248963802155, + "grad_norm": 2.940185308456421, + "learning_rate": 8.027624670961738e-06, + "loss": 0.5409, + "step": 648000 + }, + { + "epoch": 14.925854287556415, + "grad_norm": 3.2344326972961426, + "learning_rate": 8.02035310713922e-06, + "loss": 0.5228, + "step": 648200 + }, + { + "epoch": 14.930459611310676, + "grad_norm": 2.691751718521118, + "learning_rate": 8.013081543316707e-06, + "loss": 0.5243, + "step": 648400 + }, + { + "epoch": 14.935064935064934, + "grad_norm": 2.5507559776306152, + "learning_rate": 8.00580997949419e-06, + "loss": 0.5326, + "step": 648600 + }, + { + "epoch": 14.939670258819195, + "grad_norm": 3.069321870803833, + "learning_rate": 7.998538415671673e-06, + "loss": 0.5364, + "step": 648800 + }, + { + "epoch": 14.944275582573455, + "grad_norm": 2.938601016998291, + "learning_rate": 7.99126685184916e-06, + "loss": 0.5257, + "step": 649000 + }, + { + "epoch": 14.948880906327714, + "grad_norm": 2.886845588684082, + "learning_rate": 7.983995288026643e-06, + "loss": 0.5415, + "step": 649200 + }, + { + "epoch": 14.953486230081975, + "grad_norm": 2.869224786758423, + "learning_rate": 7.976723724204128e-06, + "loss": 0.5408, + "step": 649400 + }, + { + "epoch": 14.958091553836235, + "grad_norm": 2.6893749237060547, + "learning_rate": 7.969452160381612e-06, + "loss": 0.5284, + "step": 649600 + }, + { + "epoch": 14.962696877590494, + "grad_norm": 3.181957244873047, + "learning_rate": 7.962180596559096e-06, + "loss": 0.5446, + "step": 649800 + }, + { + "epoch": 14.967302201344754, + "grad_norm": 2.8158302307128906, + "learning_rate": 7.95490903273658e-06, + "loss": 0.5428, + "step": 650000 + }, + { + "epoch": 14.971907525099015, + "grad_norm": 3.031609296798706, + "learning_rate": 7.947673826733177e-06, + "loss": 0.5318, + "step": 650200 + }, + { + "epoch": 14.976512848853275, + "grad_norm": 2.578882932662964, + "learning_rate": 7.940402262910661e-06, + "loss": 0.5252, + "step": 650400 + }, + { + "epoch": 14.981118172607534, + "grad_norm": 3.891084671020508, + "learning_rate": 7.933130699088146e-06, + "loss": 0.5412, + "step": 650600 + }, + { + "epoch": 14.985723496361794, + "grad_norm": 4.118905544281006, + "learning_rate": 7.925859135265632e-06, + "loss": 0.5389, + "step": 650800 + }, + { + "epoch": 14.990328820116055, + "grad_norm": 3.9632790088653564, + "learning_rate": 7.918587571443114e-06, + "loss": 0.527, + "step": 651000 + }, + { + "epoch": 14.994934143870314, + "grad_norm": 3.387319564819336, + "learning_rate": 7.9113160076206e-06, + "loss": 0.5273, + "step": 651200 + }, + { + "epoch": 14.999539467624574, + "grad_norm": 2.79263973236084, + "learning_rate": 7.904044443798084e-06, + "loss": 0.539, + "step": 651400 + }, + { + "epoch": 15.0, + "eval_loss": 0.5169693231582642, + "eval_runtime": 168.9237, + "eval_samples_per_second": 167.892, + "eval_steps_per_second": 10.496, + "step": 651420 + }, + { + "epoch": 15.004144791378835, + "grad_norm": 3.222738742828369, + "learning_rate": 7.896772879975567e-06, + "loss": 0.5368, + "step": 651600 + }, + { + "epoch": 15.008750115133093, + "grad_norm": 3.698350191116333, + "learning_rate": 7.889501316153053e-06, + "loss": 0.5365, + "step": 651800 + }, + { + "epoch": 15.013355438887354, + "grad_norm": 3.877946376800537, + "learning_rate": 7.882229752330537e-06, + "loss": 0.5331, + "step": 652000 + }, + { + "epoch": 15.017960762641614, + "grad_norm": 3.619192600250244, + "learning_rate": 7.87495818850802e-06, + "loss": 0.5337, + "step": 652200 + }, + { + "epoch": 15.022566086395873, + "grad_norm": 2.9322800636291504, + "learning_rate": 7.867686624685505e-06, + "loss": 0.5217, + "step": 652400 + }, + { + "epoch": 15.027171410150133, + "grad_norm": 2.9945201873779297, + "learning_rate": 7.86041506086299e-06, + "loss": 0.5297, + "step": 652600 + }, + { + "epoch": 15.031776733904394, + "grad_norm": 2.359719753265381, + "learning_rate": 7.853143497040474e-06, + "loss": 0.5316, + "step": 652800 + }, + { + "epoch": 15.036382057658653, + "grad_norm": 3.135897636413574, + "learning_rate": 7.845871933217958e-06, + "loss": 0.534, + "step": 653000 + }, + { + "epoch": 15.040987381412913, + "grad_norm": 3.2203567028045654, + "learning_rate": 7.838600369395442e-06, + "loss": 0.5371, + "step": 653200 + }, + { + "epoch": 15.045592705167174, + "grad_norm": 3.355985164642334, + "learning_rate": 7.831328805572926e-06, + "loss": 0.5294, + "step": 653400 + }, + { + "epoch": 15.050198028921432, + "grad_norm": 3.7770304679870605, + "learning_rate": 7.82405724175041e-06, + "loss": 0.5409, + "step": 653600 + }, + { + "epoch": 15.054803352675693, + "grad_norm": 9.291865348815918, + "learning_rate": 7.816785677927895e-06, + "loss": 0.5363, + "step": 653800 + }, + { + "epoch": 15.059408676429953, + "grad_norm": 2.6774072647094727, + "learning_rate": 7.80951411410538e-06, + "loss": 0.5297, + "step": 654000 + }, + { + "epoch": 15.064014000184214, + "grad_norm": 3.5901434421539307, + "learning_rate": 7.802242550282863e-06, + "loss": 0.527, + "step": 654200 + }, + { + "epoch": 15.068619323938472, + "grad_norm": 3.092885971069336, + "learning_rate": 7.79497098646035e-06, + "loss": 0.5316, + "step": 654400 + }, + { + "epoch": 15.073224647692733, + "grad_norm": 2.6505744457244873, + "learning_rate": 7.787735780456946e-06, + "loss": 0.523, + "step": 654600 + }, + { + "epoch": 15.077829971446993, + "grad_norm": 2.4430651664733887, + "learning_rate": 7.78046421663443e-06, + "loss": 0.5332, + "step": 654800 + }, + { + "epoch": 15.082435295201252, + "grad_norm": 3.4239096641540527, + "learning_rate": 7.773192652811913e-06, + "loss": 0.5329, + "step": 655000 + }, + { + "epoch": 15.087040618955513, + "grad_norm": 3.358081340789795, + "learning_rate": 7.765921088989399e-06, + "loss": 0.5242, + "step": 655200 + }, + { + "epoch": 15.091645942709773, + "grad_norm": 3.774402141571045, + "learning_rate": 7.758685882985996e-06, + "loss": 0.5316, + "step": 655400 + }, + { + "epoch": 15.096251266464032, + "grad_norm": 3.171030282974243, + "learning_rate": 7.75141431916348e-06, + "loss": 0.545, + "step": 655600 + }, + { + "epoch": 15.100856590218292, + "grad_norm": 2.9989264011383057, + "learning_rate": 7.744142755340964e-06, + "loss": 0.5374, + "step": 655800 + }, + { + "epoch": 15.105461913972553, + "grad_norm": 2.5736443996429443, + "learning_rate": 7.736871191518448e-06, + "loss": 0.5254, + "step": 656000 + }, + { + "epoch": 15.110067237726811, + "grad_norm": 3.9181089401245117, + "learning_rate": 7.729599627695933e-06, + "loss": 0.5276, + "step": 656200 + }, + { + "epoch": 15.114672561481072, + "grad_norm": 2.8905580043792725, + "learning_rate": 7.722328063873417e-06, + "loss": 0.5338, + "step": 656400 + }, + { + "epoch": 15.119277885235332, + "grad_norm": 3.0359580516815186, + "learning_rate": 7.715056500050901e-06, + "loss": 0.5302, + "step": 656600 + }, + { + "epoch": 15.123883208989591, + "grad_norm": 3.0906269550323486, + "learning_rate": 7.707784936228385e-06, + "loss": 0.5233, + "step": 656800 + }, + { + "epoch": 15.128488532743852, + "grad_norm": 2.517125129699707, + "learning_rate": 7.70051337240587e-06, + "loss": 0.5382, + "step": 657000 + }, + { + "epoch": 15.133093856498112, + "grad_norm": 3.945773124694824, + "learning_rate": 7.693241808583354e-06, + "loss": 0.5329, + "step": 657200 + }, + { + "epoch": 15.137699180252373, + "grad_norm": 3.2435407638549805, + "learning_rate": 7.68597024476084e-06, + "loss": 0.5289, + "step": 657400 + }, + { + "epoch": 15.142304504006631, + "grad_norm": 2.7934975624084473, + "learning_rate": 7.678698680938324e-06, + "loss": 0.5373, + "step": 657600 + }, + { + "epoch": 15.146909827760892, + "grad_norm": 2.8237783908843994, + "learning_rate": 7.671427117115806e-06, + "loss": 0.5276, + "step": 657800 + }, + { + "epoch": 15.151515151515152, + "grad_norm": 4.111272811889648, + "learning_rate": 7.664155553293292e-06, + "loss": 0.5264, + "step": 658000 + }, + { + "epoch": 15.156120475269411, + "grad_norm": 3.4282681941986084, + "learning_rate": 7.656883989470776e-06, + "loss": 0.5343, + "step": 658200 + }, + { + "epoch": 15.160725799023671, + "grad_norm": 2.7267274856567383, + "learning_rate": 7.649612425648259e-06, + "loss": 0.5311, + "step": 658400 + }, + { + "epoch": 15.165331122777932, + "grad_norm": 3.3111186027526855, + "learning_rate": 7.642340861825745e-06, + "loss": 0.5329, + "step": 658600 + }, + { + "epoch": 15.16993644653219, + "grad_norm": 2.9832489490509033, + "learning_rate": 7.635069298003229e-06, + "loss": 0.5277, + "step": 658800 + }, + { + "epoch": 15.174541770286451, + "grad_norm": 3.5788393020629883, + "learning_rate": 7.6277977341807125e-06, + "loss": 0.5254, + "step": 659000 + }, + { + "epoch": 15.179147094040712, + "grad_norm": 2.7909393310546875, + "learning_rate": 7.6205261703581975e-06, + "loss": 0.5332, + "step": 659200 + }, + { + "epoch": 15.18375241779497, + "grad_norm": 3.0156285762786865, + "learning_rate": 7.613254606535683e-06, + "loss": 0.5396, + "step": 659400 + }, + { + "epoch": 15.18835774154923, + "grad_norm": 3.3758177757263184, + "learning_rate": 7.605983042713165e-06, + "loss": 0.5257, + "step": 659600 + }, + { + "epoch": 15.192963065303491, + "grad_norm": 3.161837100982666, + "learning_rate": 7.59871147889065e-06, + "loss": 0.5314, + "step": 659800 + }, + { + "epoch": 15.19756838905775, + "grad_norm": 4.073904037475586, + "learning_rate": 7.591476272887247e-06, + "loss": 0.5275, + "step": 660000 + }, + { + "epoch": 15.20217371281201, + "grad_norm": 3.1027348041534424, + "learning_rate": 7.584204709064732e-06, + "loss": 0.5367, + "step": 660200 + }, + { + "epoch": 15.206779036566271, + "grad_norm": 2.5127763748168945, + "learning_rate": 7.5769331452422155e-06, + "loss": 0.5321, + "step": 660400 + }, + { + "epoch": 15.21138436032053, + "grad_norm": 3.317880153656006, + "learning_rate": 7.5696615814197e-06, + "loss": 0.535, + "step": 660600 + }, + { + "epoch": 15.21598968407479, + "grad_norm": 3.18585467338562, + "learning_rate": 7.562390017597185e-06, + "loss": 0.5364, + "step": 660800 + }, + { + "epoch": 15.22059500782905, + "grad_norm": 3.1549441814422607, + "learning_rate": 7.555118453774668e-06, + "loss": 0.5288, + "step": 661000 + }, + { + "epoch": 15.225200331583311, + "grad_norm": 3.0933032035827637, + "learning_rate": 7.547846889952153e-06, + "loss": 0.5315, + "step": 661200 + }, + { + "epoch": 15.22980565533757, + "grad_norm": 3.4483559131622314, + "learning_rate": 7.5405753261296375e-06, + "loss": 0.5372, + "step": 661400 + }, + { + "epoch": 15.23441097909183, + "grad_norm": 3.425278663635254, + "learning_rate": 7.5333037623071225e-06, + "loss": 0.5268, + "step": 661600 + }, + { + "epoch": 15.23901630284609, + "grad_norm": 2.674301862716675, + "learning_rate": 7.526032198484606e-06, + "loss": 0.523, + "step": 661800 + }, + { + "epoch": 15.24362162660035, + "grad_norm": 4.0348358154296875, + "learning_rate": 7.518760634662091e-06, + "loss": 0.5256, + "step": 662000 + }, + { + "epoch": 15.24822695035461, + "grad_norm": 2.689831018447876, + "learning_rate": 7.511489070839575e-06, + "loss": 0.5198, + "step": 662200 + }, + { + "epoch": 15.25283227410887, + "grad_norm": 3.238309860229492, + "learning_rate": 7.5042175070170586e-06, + "loss": 0.5295, + "step": 662400 + }, + { + "epoch": 15.25743759786313, + "grad_norm": 2.907236337661743, + "learning_rate": 7.496945943194544e-06, + "loss": 0.5316, + "step": 662600 + }, + { + "epoch": 15.26204292161739, + "grad_norm": 2.706756114959717, + "learning_rate": 7.4897107371911405e-06, + "loss": 0.522, + "step": 662800 + }, + { + "epoch": 15.26664824537165, + "grad_norm": 3.2358510494232178, + "learning_rate": 7.482439173368625e-06, + "loss": 0.5382, + "step": 663000 + }, + { + "epoch": 15.271253569125909, + "grad_norm": 2.505164861679077, + "learning_rate": 7.475167609546109e-06, + "loss": 0.5304, + "step": 663200 + }, + { + "epoch": 15.27585889288017, + "grad_norm": 2.7011849880218506, + "learning_rate": 7.467896045723593e-06, + "loss": 0.5381, + "step": 663400 + }, + { + "epoch": 15.28046421663443, + "grad_norm": 4.1736159324646, + "learning_rate": 7.460624481901077e-06, + "loss": 0.53, + "step": 663600 + }, + { + "epoch": 15.285069540388688, + "grad_norm": 2.8597121238708496, + "learning_rate": 7.4533529180785625e-06, + "loss": 0.5273, + "step": 663800 + }, + { + "epoch": 15.289674864142949, + "grad_norm": 4.301202297210693, + "learning_rate": 7.446081354256047e-06, + "loss": 0.5306, + "step": 664000 + }, + { + "epoch": 15.29428018789721, + "grad_norm": 3.257692575454712, + "learning_rate": 7.438809790433531e-06, + "loss": 0.5249, + "step": 664200 + }, + { + "epoch": 15.29888551165147, + "grad_norm": 4.112732887268066, + "learning_rate": 7.431538226611015e-06, + "loss": 0.5303, + "step": 664400 + }, + { + "epoch": 15.303490835405729, + "grad_norm": 3.026270627975464, + "learning_rate": 7.424266662788499e-06, + "loss": 0.5309, + "step": 664600 + }, + { + "epoch": 15.30809615915999, + "grad_norm": 3.302058219909668, + "learning_rate": 7.417031456785096e-06, + "loss": 0.5272, + "step": 664800 + }, + { + "epoch": 15.31270148291425, + "grad_norm": 3.1343607902526855, + "learning_rate": 7.409759892962581e-06, + "loss": 0.543, + "step": 665000 + }, + { + "epoch": 15.317306806668508, + "grad_norm": 3.4846322536468506, + "learning_rate": 7.402488329140065e-06, + "loss": 0.5345, + "step": 665200 + }, + { + "epoch": 15.321912130422769, + "grad_norm": 2.883300304412842, + "learning_rate": 7.395216765317549e-06, + "loss": 0.5179, + "step": 665400 + }, + { + "epoch": 15.32651745417703, + "grad_norm": 3.417581081390381, + "learning_rate": 7.387945201495034e-06, + "loss": 0.5259, + "step": 665600 + }, + { + "epoch": 15.331122777931288, + "grad_norm": 2.906959056854248, + "learning_rate": 7.380673637672518e-06, + "loss": 0.5253, + "step": 665800 + }, + { + "epoch": 15.335728101685548, + "grad_norm": 2.635720729827881, + "learning_rate": 7.3734020738500024e-06, + "loss": 0.5321, + "step": 666000 + }, + { + "epoch": 15.340333425439809, + "grad_norm": 2.8805277347564697, + "learning_rate": 7.366130510027487e-06, + "loss": 0.5336, + "step": 666200 + }, + { + "epoch": 15.344938749194068, + "grad_norm": 2.8946115970611572, + "learning_rate": 7.358858946204971e-06, + "loss": 0.5299, + "step": 666400 + }, + { + "epoch": 15.349544072948328, + "grad_norm": 3.4212236404418945, + "learning_rate": 7.351587382382456e-06, + "loss": 0.5307, + "step": 666600 + }, + { + "epoch": 15.354149396702589, + "grad_norm": 3.027736186981201, + "learning_rate": 7.344315818559939e-06, + "loss": 0.5367, + "step": 666800 + }, + { + "epoch": 15.358754720456847, + "grad_norm": 2.96783185005188, + "learning_rate": 7.3370442547374235e-06, + "loss": 0.5288, + "step": 667000 + }, + { + "epoch": 15.363360044211108, + "grad_norm": 3.2669103145599365, + "learning_rate": 7.329772690914909e-06, + "loss": 0.5318, + "step": 667200 + }, + { + "epoch": 15.367965367965368, + "grad_norm": 3.050448417663574, + "learning_rate": 7.322501127092393e-06, + "loss": 0.534, + "step": 667400 + }, + { + "epoch": 15.372570691719627, + "grad_norm": 4.087128162384033, + "learning_rate": 7.315229563269877e-06, + "loss": 0.5263, + "step": 667600 + }, + { + "epoch": 15.377176015473887, + "grad_norm": 3.5883004665374756, + "learning_rate": 7.307957999447361e-06, + "loss": 0.5277, + "step": 667800 + }, + { + "epoch": 15.381781339228148, + "grad_norm": 3.088322162628174, + "learning_rate": 7.3006864356248455e-06, + "loss": 0.5414, + "step": 668000 + }, + { + "epoch": 15.386386662982408, + "grad_norm": 3.5304698944091797, + "learning_rate": 7.2934148718023305e-06, + "loss": 0.5338, + "step": 668200 + }, + { + "epoch": 15.390991986736667, + "grad_norm": 3.154963731765747, + "learning_rate": 7.286143307979814e-06, + "loss": 0.5417, + "step": 668400 + }, + { + "epoch": 15.395597310490928, + "grad_norm": 3.262460231781006, + "learning_rate": 7.278871744157298e-06, + "loss": 0.5163, + "step": 668600 + }, + { + "epoch": 15.400202634245188, + "grad_norm": 3.046994209289551, + "learning_rate": 7.271600180334783e-06, + "loss": 0.5291, + "step": 668800 + }, + { + "epoch": 15.404807957999447, + "grad_norm": 3.0019094944000244, + "learning_rate": 7.264328616512267e-06, + "loss": 0.5308, + "step": 669000 + }, + { + "epoch": 15.409413281753707, + "grad_norm": 2.577397584915161, + "learning_rate": 7.257057052689752e-06, + "loss": 0.5328, + "step": 669200 + }, + { + "epoch": 15.414018605507968, + "grad_norm": 3.689873218536377, + "learning_rate": 7.249785488867236e-06, + "loss": 0.5362, + "step": 669400 + }, + { + "epoch": 15.418623929262226, + "grad_norm": 3.1702044010162354, + "learning_rate": 7.24251392504472e-06, + "loss": 0.5278, + "step": 669600 + }, + { + "epoch": 15.423229253016487, + "grad_norm": 2.942192316055298, + "learning_rate": 7.235278719041317e-06, + "loss": 0.5438, + "step": 669800 + }, + { + "epoch": 15.427834576770747, + "grad_norm": 3.0847814083099365, + "learning_rate": 7.228007155218802e-06, + "loss": 0.5425, + "step": 670000 + }, + { + "epoch": 15.432439900525006, + "grad_norm": 2.7735490798950195, + "learning_rate": 7.220735591396285e-06, + "loss": 0.5362, + "step": 670200 + }, + { + "epoch": 15.437045224279267, + "grad_norm": 3.166234254837036, + "learning_rate": 7.21346402757377e-06, + "loss": 0.528, + "step": 670400 + }, + { + "epoch": 15.441650548033527, + "grad_norm": 4.274187088012695, + "learning_rate": 7.206192463751255e-06, + "loss": 0.5324, + "step": 670600 + }, + { + "epoch": 15.446255871787788, + "grad_norm": 3.542017698287964, + "learning_rate": 7.198920899928739e-06, + "loss": 0.5365, + "step": 670800 + }, + { + "epoch": 15.450861195542046, + "grad_norm": 3.5981078147888184, + "learning_rate": 7.191649336106223e-06, + "loss": 0.5345, + "step": 671000 + }, + { + "epoch": 15.455466519296307, + "grad_norm": 2.7863192558288574, + "learning_rate": 7.184377772283707e-06, + "loss": 0.5415, + "step": 671200 + }, + { + "epoch": 15.460071843050567, + "grad_norm": 3.475003957748413, + "learning_rate": 7.177106208461192e-06, + "loss": 0.5261, + "step": 671400 + }, + { + "epoch": 15.464677166804826, + "grad_norm": 2.9916470050811768, + "learning_rate": 7.169834644638677e-06, + "loss": 0.5382, + "step": 671600 + }, + { + "epoch": 15.469282490559086, + "grad_norm": 3.3957395553588867, + "learning_rate": 7.162563080816161e-06, + "loss": 0.5401, + "step": 671800 + }, + { + "epoch": 15.473887814313347, + "grad_norm": 3.301769495010376, + "learning_rate": 7.155291516993644e-06, + "loss": 0.529, + "step": 672000 + }, + { + "epoch": 15.478493138067606, + "grad_norm": 3.8502883911132812, + "learning_rate": 7.148019953171129e-06, + "loss": 0.5284, + "step": 672200 + }, + { + "epoch": 15.483098461821866, + "grad_norm": 2.8070068359375, + "learning_rate": 7.1407483893486135e-06, + "loss": 0.5286, + "step": 672400 + }, + { + "epoch": 15.487703785576127, + "grad_norm": 3.680961847305298, + "learning_rate": 7.133476825526099e-06, + "loss": 0.5374, + "step": 672600 + }, + { + "epoch": 15.492309109330385, + "grad_norm": 3.303067922592163, + "learning_rate": 7.126205261703582e-06, + "loss": 0.5306, + "step": 672800 + }, + { + "epoch": 15.496914433084646, + "grad_norm": 3.2984859943389893, + "learning_rate": 7.118933697881066e-06, + "loss": 0.5258, + "step": 673000 + }, + { + "epoch": 15.501519756838906, + "grad_norm": 3.2153737545013428, + "learning_rate": 7.111662134058551e-06, + "loss": 0.5297, + "step": 673200 + }, + { + "epoch": 15.506125080593165, + "grad_norm": 2.8928987979888916, + "learning_rate": 7.1043905702360355e-06, + "loss": 0.5369, + "step": 673400 + }, + { + "epoch": 15.510730404347425, + "grad_norm": 3.484666109085083, + "learning_rate": 7.097119006413519e-06, + "loss": 0.5319, + "step": 673600 + }, + { + "epoch": 15.515335728101686, + "grad_norm": 3.3581833839416504, + "learning_rate": 7.089883800410116e-06, + "loss": 0.5299, + "step": 673800 + }, + { + "epoch": 15.519941051855945, + "grad_norm": 3.2200675010681152, + "learning_rate": 7.082612236587601e-06, + "loss": 0.5264, + "step": 674000 + }, + { + "epoch": 15.524546375610205, + "grad_norm": 3.4437928199768066, + "learning_rate": 7.075340672765085e-06, + "loss": 0.5304, + "step": 674200 + }, + { + "epoch": 15.529151699364466, + "grad_norm": 2.679928779602051, + "learning_rate": 7.06806910894257e-06, + "loss": 0.5364, + "step": 674400 + }, + { + "epoch": 15.533757023118724, + "grad_norm": 3.410003662109375, + "learning_rate": 7.0607975451200535e-06, + "loss": 0.5329, + "step": 674600 + }, + { + "epoch": 15.538362346872985, + "grad_norm": 2.5245554447174072, + "learning_rate": 7.053525981297538e-06, + "loss": 0.5294, + "step": 674800 + }, + { + "epoch": 15.542967670627245, + "grad_norm": 3.400965452194214, + "learning_rate": 7.046290775294135e-06, + "loss": 0.5289, + "step": 675000 + }, + { + "epoch": 15.547572994381506, + "grad_norm": 3.5481555461883545, + "learning_rate": 7.03901921147162e-06, + "loss": 0.534, + "step": 675200 + }, + { + "epoch": 15.552178318135764, + "grad_norm": 3.425252914428711, + "learning_rate": 7.031747647649104e-06, + "loss": 0.5249, + "step": 675400 + }, + { + "epoch": 15.556783641890025, + "grad_norm": 3.6418352127075195, + "learning_rate": 7.024476083826587e-06, + "loss": 0.5178, + "step": 675600 + }, + { + "epoch": 15.561388965644285, + "grad_norm": 2.9544789791107178, + "learning_rate": 7.017204520004072e-06, + "loss": 0.5337, + "step": 675800 + }, + { + "epoch": 15.565994289398544, + "grad_norm": 2.8568692207336426, + "learning_rate": 7.0099329561815565e-06, + "loss": 0.5304, + "step": 676000 + }, + { + "epoch": 15.570599613152805, + "grad_norm": 3.2933669090270996, + "learning_rate": 7.002661392359042e-06, + "loss": 0.5202, + "step": 676200 + }, + { + "epoch": 15.575204936907065, + "grad_norm": 2.9382436275482178, + "learning_rate": 6.995389828536525e-06, + "loss": 0.5256, + "step": 676400 + }, + { + "epoch": 15.579810260661324, + "grad_norm": 2.4080753326416016, + "learning_rate": 6.988118264714009e-06, + "loss": 0.5245, + "step": 676600 + }, + { + "epoch": 15.584415584415584, + "grad_norm": 2.816089630126953, + "learning_rate": 6.980846700891494e-06, + "loss": 0.5256, + "step": 676800 + }, + { + "epoch": 15.589020908169845, + "grad_norm": 2.963587999343872, + "learning_rate": 6.9735751370689785e-06, + "loss": 0.5329, + "step": 677000 + }, + { + "epoch": 15.593626231924103, + "grad_norm": 3.300342559814453, + "learning_rate": 6.966303573246462e-06, + "loss": 0.5376, + "step": 677200 + }, + { + "epoch": 15.598231555678364, + "grad_norm": 3.2802958488464355, + "learning_rate": 6.959032009423947e-06, + "loss": 0.528, + "step": 677400 + }, + { + "epoch": 15.602836879432624, + "grad_norm": 2.6798970699310303, + "learning_rate": 6.951760445601431e-06, + "loss": 0.5237, + "step": 677600 + }, + { + "epoch": 15.607442203186885, + "grad_norm": 3.517439603805542, + "learning_rate": 6.944488881778916e-06, + "loss": 0.5408, + "step": 677800 + }, + { + "epoch": 15.612047526941144, + "grad_norm": 3.87048602104187, + "learning_rate": 6.9372173179564e-06, + "loss": 0.534, + "step": 678000 + }, + { + "epoch": 15.616652850695404, + "grad_norm": 2.775428295135498, + "learning_rate": 6.929945754133884e-06, + "loss": 0.5248, + "step": 678200 + }, + { + "epoch": 15.621258174449665, + "grad_norm": 2.7201571464538574, + "learning_rate": 6.922710548130481e-06, + "loss": 0.5338, + "step": 678400 + }, + { + "epoch": 15.625863498203923, + "grad_norm": 2.8548715114593506, + "learning_rate": 6.915438984307966e-06, + "loss": 0.5275, + "step": 678600 + }, + { + "epoch": 15.630468821958184, + "grad_norm": 2.6835644245147705, + "learning_rate": 6.90816742048545e-06, + "loss": 0.532, + "step": 678800 + }, + { + "epoch": 15.635074145712444, + "grad_norm": 2.5075621604919434, + "learning_rate": 6.900895856662933e-06, + "loss": 0.5155, + "step": 679000 + }, + { + "epoch": 15.639679469466703, + "grad_norm": 3.476208448410034, + "learning_rate": 6.89366065065953e-06, + "loss": 0.5243, + "step": 679200 + }, + { + "epoch": 15.644284793220963, + "grad_norm": 2.8655552864074707, + "learning_rate": 6.886389086837015e-06, + "loss": 0.5279, + "step": 679400 + }, + { + "epoch": 15.648890116975224, + "grad_norm": 2.7669894695281982, + "learning_rate": 6.8791175230144996e-06, + "loss": 0.5381, + "step": 679600 + }, + { + "epoch": 15.653495440729483, + "grad_norm": 4.442414283752441, + "learning_rate": 6.871845959191985e-06, + "loss": 0.5195, + "step": 679800 + }, + { + "epoch": 15.658100764483743, + "grad_norm": 2.7395546436309814, + "learning_rate": 6.864574395369468e-06, + "loss": 0.5274, + "step": 680000 + }, + { + "epoch": 15.662706088238004, + "grad_norm": 3.1513760089874268, + "learning_rate": 6.857302831546952e-06, + "loss": 0.5258, + "step": 680200 + }, + { + "epoch": 15.667311411992262, + "grad_norm": 3.7221457958221436, + "learning_rate": 6.850031267724437e-06, + "loss": 0.5325, + "step": 680400 + }, + { + "epoch": 15.671916735746523, + "grad_norm": 2.821197271347046, + "learning_rate": 6.8427597039019215e-06, + "loss": 0.5167, + "step": 680600 + }, + { + "epoch": 15.676522059500783, + "grad_norm": 3.9420833587646484, + "learning_rate": 6.835488140079406e-06, + "loss": 0.5301, + "step": 680800 + }, + { + "epoch": 15.681127383255042, + "grad_norm": 3.045525312423706, + "learning_rate": 6.82821657625689e-06, + "loss": 0.5269, + "step": 681000 + }, + { + "epoch": 15.685732707009302, + "grad_norm": 3.1435394287109375, + "learning_rate": 6.820945012434374e-06, + "loss": 0.5343, + "step": 681200 + }, + { + "epoch": 15.690338030763563, + "grad_norm": 3.5429775714874268, + "learning_rate": 6.813673448611859e-06, + "loss": 0.5335, + "step": 681400 + }, + { + "epoch": 15.694943354517823, + "grad_norm": 3.320526361465454, + "learning_rate": 6.8064018847893435e-06, + "loss": 0.5305, + "step": 681600 + }, + { + "epoch": 15.699548678272082, + "grad_norm": 3.505725860595703, + "learning_rate": 6.799130320966827e-06, + "loss": 0.5378, + "step": 681800 + }, + { + "epoch": 15.704154002026343, + "grad_norm": 3.6867668628692627, + "learning_rate": 6.791858757144312e-06, + "loss": 0.5328, + "step": 682000 + }, + { + "epoch": 15.708759325780603, + "grad_norm": 3.264688491821289, + "learning_rate": 6.784587193321796e-06, + "loss": 0.5392, + "step": 682200 + }, + { + "epoch": 15.713364649534862, + "grad_norm": 2.858060121536255, + "learning_rate": 6.77731562949928e-06, + "loss": 0.5387, + "step": 682400 + }, + { + "epoch": 15.717969973289122, + "grad_norm": 3.1449389457702637, + "learning_rate": 6.7700440656767646e-06, + "loss": 0.5224, + "step": 682600 + }, + { + "epoch": 15.722575297043383, + "grad_norm": 4.474485397338867, + "learning_rate": 6.762772501854249e-06, + "loss": 0.5496, + "step": 682800 + }, + { + "epoch": 15.727180620797641, + "grad_norm": 3.6581742763519287, + "learning_rate": 6.755500938031734e-06, + "loss": 0.5347, + "step": 683000 + }, + { + "epoch": 15.731785944551902, + "grad_norm": 3.4204602241516113, + "learning_rate": 6.748229374209218e-06, + "loss": 0.5319, + "step": 683200 + }, + { + "epoch": 15.736391268306162, + "grad_norm": 3.710782289505005, + "learning_rate": 6.7409578103867014e-06, + "loss": 0.5362, + "step": 683400 + }, + { + "epoch": 15.740996592060421, + "grad_norm": 3.196833610534668, + "learning_rate": 6.7336862465641865e-06, + "loss": 0.5305, + "step": 683600 + }, + { + "epoch": 15.745601915814682, + "grad_norm": 3.187162160873413, + "learning_rate": 6.726414682741671e-06, + "loss": 0.5248, + "step": 683800 + }, + { + "epoch": 15.750207239568942, + "grad_norm": 3.471301794052124, + "learning_rate": 6.719143118919155e-06, + "loss": 0.5289, + "step": 684000 + }, + { + "epoch": 15.7548125633232, + "grad_norm": 3.092719316482544, + "learning_rate": 6.711871555096639e-06, + "loss": 0.5483, + "step": 684200 + }, + { + "epoch": 15.759417887077461, + "grad_norm": 2.3404221534729004, + "learning_rate": 6.704599991274123e-06, + "loss": 0.5327, + "step": 684400 + }, + { + "epoch": 15.764023210831722, + "grad_norm": 3.6259121894836426, + "learning_rate": 6.69736478527072e-06, + "loss": 0.53, + "step": 684600 + }, + { + "epoch": 15.768628534585982, + "grad_norm": 3.0392374992370605, + "learning_rate": 6.690129579267317e-06, + "loss": 0.5328, + "step": 684800 + }, + { + "epoch": 15.773233858340241, + "grad_norm": 3.2242770195007324, + "learning_rate": 6.682858015444802e-06, + "loss": 0.5337, + "step": 685000 + }, + { + "epoch": 15.777839182094501, + "grad_norm": 2.7651524543762207, + "learning_rate": 6.6755864516222865e-06, + "loss": 0.5286, + "step": 685200 + }, + { + "epoch": 15.782444505848762, + "grad_norm": 2.7037301063537598, + "learning_rate": 6.66831488779977e-06, + "loss": 0.5247, + "step": 685400 + }, + { + "epoch": 15.78704982960302, + "grad_norm": 3.071185350418091, + "learning_rate": 6.661043323977255e-06, + "loss": 0.5305, + "step": 685600 + }, + { + "epoch": 15.791655153357281, + "grad_norm": 3.1269609928131104, + "learning_rate": 6.653771760154739e-06, + "loss": 0.523, + "step": 685800 + }, + { + "epoch": 15.796260477111542, + "grad_norm": 2.9973862171173096, + "learning_rate": 6.646500196332223e-06, + "loss": 0.5328, + "step": 686000 + }, + { + "epoch": 15.8008658008658, + "grad_norm": 2.5962016582489014, + "learning_rate": 6.6392286325097076e-06, + "loss": 0.5325, + "step": 686200 + }, + { + "epoch": 15.80547112462006, + "grad_norm": 3.33072566986084, + "learning_rate": 6.631957068687192e-06, + "loss": 0.5284, + "step": 686400 + }, + { + "epoch": 15.810076448374321, + "grad_norm": 2.7956624031066895, + "learning_rate": 6.624685504864677e-06, + "loss": 0.5206, + "step": 686600 + }, + { + "epoch": 15.81468177212858, + "grad_norm": 3.8837435245513916, + "learning_rate": 6.617413941042161e-06, + "loss": 0.5238, + "step": 686800 + }, + { + "epoch": 15.81928709588284, + "grad_norm": 3.072110652923584, + "learning_rate": 6.6101423772196445e-06, + "loss": 0.5346, + "step": 687000 + }, + { + "epoch": 15.823892419637101, + "grad_norm": 2.895686388015747, + "learning_rate": 6.6028708133971295e-06, + "loss": 0.5327, + "step": 687200 + }, + { + "epoch": 15.82849774339136, + "grad_norm": 3.0027430057525635, + "learning_rate": 6.595599249574614e-06, + "loss": 0.5186, + "step": 687400 + }, + { + "epoch": 15.83310306714562, + "grad_norm": 2.7608888149261475, + "learning_rate": 6.588327685752098e-06, + "loss": 0.5347, + "step": 687600 + }, + { + "epoch": 15.83770839089988, + "grad_norm": 3.104914903640747, + "learning_rate": 6.581056121929582e-06, + "loss": 0.5289, + "step": 687800 + }, + { + "epoch": 15.84231371465414, + "grad_norm": 4.210239410400391, + "learning_rate": 6.573784558107066e-06, + "loss": 0.542, + "step": 688000 + }, + { + "epoch": 15.8469190384084, + "grad_norm": 2.743356704711914, + "learning_rate": 6.566512994284551e-06, + "loss": 0.5315, + "step": 688200 + }, + { + "epoch": 15.85152436216266, + "grad_norm": 3.4717984199523926, + "learning_rate": 6.559241430462036e-06, + "loss": 0.5337, + "step": 688400 + }, + { + "epoch": 15.85612968591692, + "grad_norm": 3.4075236320495605, + "learning_rate": 6.551969866639519e-06, + "loss": 0.5363, + "step": 688600 + }, + { + "epoch": 15.86073500967118, + "grad_norm": 4.167804718017578, + "learning_rate": 6.544698302817004e-06, + "loss": 0.5274, + "step": 688800 + }, + { + "epoch": 15.86534033342544, + "grad_norm": 3.3863329887390137, + "learning_rate": 6.537426738994488e-06, + "loss": 0.5284, + "step": 689000 + }, + { + "epoch": 15.8699456571797, + "grad_norm": 3.17704439163208, + "learning_rate": 6.5301551751719726e-06, + "loss": 0.5336, + "step": 689200 + }, + { + "epoch": 15.874550980933959, + "grad_norm": 2.546211004257202, + "learning_rate": 6.522883611349458e-06, + "loss": 0.5344, + "step": 689400 + }, + { + "epoch": 15.87915630468822, + "grad_norm": 3.9471852779388428, + "learning_rate": 6.515612047526941e-06, + "loss": 0.5266, + "step": 689600 + }, + { + "epoch": 15.88376162844248, + "grad_norm": 2.803940773010254, + "learning_rate": 6.508340483704425e-06, + "loss": 0.5422, + "step": 689800 + }, + { + "epoch": 15.888366952196739, + "grad_norm": 3.0068013668060303, + "learning_rate": 6.50106891988191e-06, + "loss": 0.5318, + "step": 690000 + }, + { + "epoch": 15.892972275951, + "grad_norm": 3.8956100940704346, + "learning_rate": 6.4937973560593945e-06, + "loss": 0.5329, + "step": 690200 + }, + { + "epoch": 15.89757759970526, + "grad_norm": 2.6682162284851074, + "learning_rate": 6.486562150055991e-06, + "loss": 0.5214, + "step": 690400 + }, + { + "epoch": 15.902182923459518, + "grad_norm": 3.1031453609466553, + "learning_rate": 6.479290586233476e-06, + "loss": 0.5215, + "step": 690600 + }, + { + "epoch": 15.906788247213779, + "grad_norm": 3.070067882537842, + "learning_rate": 6.47201902241096e-06, + "loss": 0.5341, + "step": 690800 + }, + { + "epoch": 15.91139357096804, + "grad_norm": 3.4854178428649902, + "learning_rate": 6.464747458588444e-06, + "loss": 0.5161, + "step": 691000 + }, + { + "epoch": 15.9159988947223, + "grad_norm": 2.887986421585083, + "learning_rate": 6.457475894765929e-06, + "loss": 0.5282, + "step": 691200 + }, + { + "epoch": 15.920604218476559, + "grad_norm": 3.0503897666931152, + "learning_rate": 6.4502043309434125e-06, + "loss": 0.5358, + "step": 691400 + }, + { + "epoch": 15.925209542230819, + "grad_norm": 2.886718988418579, + "learning_rate": 6.442932767120897e-06, + "loss": 0.5237, + "step": 691600 + }, + { + "epoch": 15.92981486598508, + "grad_norm": 3.217623710632324, + "learning_rate": 6.435661203298382e-06, + "loss": 0.5236, + "step": 691800 + }, + { + "epoch": 15.934420189739338, + "grad_norm": 3.3502800464630127, + "learning_rate": 6.428389639475866e-06, + "loss": 0.531, + "step": 692000 + }, + { + "epoch": 15.939025513493599, + "grad_norm": 2.8795714378356934, + "learning_rate": 6.42111807565335e-06, + "loss": 0.5261, + "step": 692200 + }, + { + "epoch": 15.94363083724786, + "grad_norm": 3.2952957153320312, + "learning_rate": 6.4138465118308345e-06, + "loss": 0.5382, + "step": 692400 + }, + { + "epoch": 15.948236161002118, + "grad_norm": 3.00901460647583, + "learning_rate": 6.406574948008319e-06, + "loss": 0.5271, + "step": 692600 + }, + { + "epoch": 15.952841484756378, + "grad_norm": 2.900832414627075, + "learning_rate": 6.399303384185804e-06, + "loss": 0.5227, + "step": 692800 + }, + { + "epoch": 15.957446808510639, + "grad_norm": 3.579869031906128, + "learning_rate": 6.392031820363287e-06, + "loss": 0.5219, + "step": 693000 + }, + { + "epoch": 15.962052132264898, + "grad_norm": 2.8938448429107666, + "learning_rate": 6.384760256540771e-06, + "loss": 0.5369, + "step": 693200 + }, + { + "epoch": 15.966657456019158, + "grad_norm": 3.0627634525299072, + "learning_rate": 6.377488692718256e-06, + "loss": 0.5327, + "step": 693400 + }, + { + "epoch": 15.971262779773419, + "grad_norm": 3.0290048122406006, + "learning_rate": 6.370217128895741e-06, + "loss": 0.5283, + "step": 693600 + }, + { + "epoch": 15.975868103527677, + "grad_norm": 2.843632698059082, + "learning_rate": 6.362945565073225e-06, + "loss": 0.5388, + "step": 693800 + }, + { + "epoch": 15.980473427281938, + "grad_norm": 3.2812376022338867, + "learning_rate": 6.355674001250709e-06, + "loss": 0.5199, + "step": 694000 + }, + { + "epoch": 15.985078751036198, + "grad_norm": 3.616701126098633, + "learning_rate": 6.348438795247306e-06, + "loss": 0.5359, + "step": 694200 + }, + { + "epoch": 15.989684074790457, + "grad_norm": 3.0894737243652344, + "learning_rate": 6.34116723142479e-06, + "loss": 0.5268, + "step": 694400 + }, + { + "epoch": 15.994289398544717, + "grad_norm": 3.021001100540161, + "learning_rate": 6.333895667602275e-06, + "loss": 0.5346, + "step": 694600 + }, + { + "epoch": 15.998894722298978, + "grad_norm": 3.486922264099121, + "learning_rate": 6.326624103779759e-06, + "loss": 0.5314, + "step": 694800 + }, + { + "epoch": 16.0, + "eval_loss": 0.5152611136436462, + "eval_runtime": 161.1159, + "eval_samples_per_second": 176.029, + "eval_steps_per_second": 11.005, + "step": 694848 + }, + { + "epoch": 16.003500046053237, + "grad_norm": 3.235459566116333, + "learning_rate": 6.319352539957243e-06, + "loss": 0.5261, + "step": 695000 + }, + { + "epoch": 16.0081053698075, + "grad_norm": 2.7268731594085693, + "learning_rate": 6.312080976134728e-06, + "loss": 0.5294, + "step": 695200 + }, + { + "epoch": 16.012710693561758, + "grad_norm": 2.6637632846832275, + "learning_rate": 6.304809412312212e-06, + "loss": 0.5276, + "step": 695400 + }, + { + "epoch": 16.017316017316016, + "grad_norm": 2.3737680912017822, + "learning_rate": 6.297537848489696e-06, + "loss": 0.5162, + "step": 695600 + }, + { + "epoch": 16.02192134107028, + "grad_norm": 2.3051598072052, + "learning_rate": 6.2902662846671806e-06, + "loss": 0.5141, + "step": 695800 + }, + { + "epoch": 16.026526664824537, + "grad_norm": 3.3139994144439697, + "learning_rate": 6.282994720844665e-06, + "loss": 0.5175, + "step": 696000 + }, + { + "epoch": 16.031131988578796, + "grad_norm": 2.9257097244262695, + "learning_rate": 6.27572315702215e-06, + "loss": 0.5121, + "step": 696200 + }, + { + "epoch": 16.035737312333058, + "grad_norm": 3.4968149662017822, + "learning_rate": 6.268451593199633e-06, + "loss": 0.5392, + "step": 696400 + }, + { + "epoch": 16.040342636087317, + "grad_norm": 2.8969199657440186, + "learning_rate": 6.2611800293771174e-06, + "loss": 0.5296, + "step": 696600 + }, + { + "epoch": 16.044947959841576, + "grad_norm": 2.9978652000427246, + "learning_rate": 6.253944823373714e-06, + "loss": 0.5304, + "step": 696800 + }, + { + "epoch": 16.049553283595838, + "grad_norm": 3.812542676925659, + "learning_rate": 6.246673259551199e-06, + "loss": 0.526, + "step": 697000 + }, + { + "epoch": 16.054158607350097, + "grad_norm": 3.203511953353882, + "learning_rate": 6.239401695728684e-06, + "loss": 0.5188, + "step": 697200 + }, + { + "epoch": 16.058763931104355, + "grad_norm": 3.406111478805542, + "learning_rate": 6.232130131906168e-06, + "loss": 0.5199, + "step": 697400 + }, + { + "epoch": 16.063369254858618, + "grad_norm": 3.4208409786224365, + "learning_rate": 6.224858568083652e-06, + "loss": 0.5317, + "step": 697600 + }, + { + "epoch": 16.067974578612876, + "grad_norm": 3.2451865673065186, + "learning_rate": 6.217587004261136e-06, + "loss": 0.5201, + "step": 697800 + }, + { + "epoch": 16.072579902367135, + "grad_norm": 2.8705670833587646, + "learning_rate": 6.210315440438621e-06, + "loss": 0.53, + "step": 698000 + }, + { + "epoch": 16.077185226121397, + "grad_norm": 3.8681085109710693, + "learning_rate": 6.203043876616106e-06, + "loss": 0.5249, + "step": 698200 + }, + { + "epoch": 16.081790549875656, + "grad_norm": 4.016938209533691, + "learning_rate": 6.195772312793589e-06, + "loss": 0.5138, + "step": 698400 + }, + { + "epoch": 16.086395873629915, + "grad_norm": 2.935976266860962, + "learning_rate": 6.188500748971074e-06, + "loss": 0.5238, + "step": 698600 + }, + { + "epoch": 16.091001197384177, + "grad_norm": 3.2641563415527344, + "learning_rate": 6.181229185148558e-06, + "loss": 0.5174, + "step": 698800 + }, + { + "epoch": 16.095606521138436, + "grad_norm": 3.6100635528564453, + "learning_rate": 6.173993979145155e-06, + "loss": 0.5299, + "step": 699000 + }, + { + "epoch": 16.100211844892694, + "grad_norm": 3.5034003257751465, + "learning_rate": 6.16672241532264e-06, + "loss": 0.5293, + "step": 699200 + }, + { + "epoch": 16.104817168646957, + "grad_norm": 2.800861358642578, + "learning_rate": 6.159450851500124e-06, + "loss": 0.5214, + "step": 699400 + }, + { + "epoch": 16.109422492401215, + "grad_norm": 3.0961642265319824, + "learning_rate": 6.152179287677608e-06, + "loss": 0.5254, + "step": 699600 + }, + { + "epoch": 16.114027816155474, + "grad_norm": 2.8042900562286377, + "learning_rate": 6.144907723855093e-06, + "loss": 0.5155, + "step": 699800 + }, + { + "epoch": 16.118633139909736, + "grad_norm": 3.172004461288452, + "learning_rate": 6.137636160032577e-06, + "loss": 0.5195, + "step": 700000 + }, + { + "epoch": 16.123238463663995, + "grad_norm": 3.4041786193847656, + "learning_rate": 6.1303645962100605e-06, + "loss": 0.5156, + "step": 700200 + }, + { + "epoch": 16.127843787418257, + "grad_norm": 3.219374895095825, + "learning_rate": 6.1230930323875455e-06, + "loss": 0.5299, + "step": 700400 + }, + { + "epoch": 16.132449111172516, + "grad_norm": 2.8847038745880127, + "learning_rate": 6.11582146856503e-06, + "loss": 0.5128, + "step": 700600 + }, + { + "epoch": 16.137054434926775, + "grad_norm": 2.9731578826904297, + "learning_rate": 6.108622620380739e-06, + "loss": 0.5165, + "step": 700800 + }, + { + "epoch": 16.141659758681037, + "grad_norm": 3.0595650672912598, + "learning_rate": 6.1013510565582236e-06, + "loss": 0.5284, + "step": 701000 + }, + { + "epoch": 16.146265082435296, + "grad_norm": 2.8716259002685547, + "learning_rate": 6.094079492735709e-06, + "loss": 0.519, + "step": 701200 + }, + { + "epoch": 16.150870406189554, + "grad_norm": 3.1690585613250732, + "learning_rate": 6.086807928913192e-06, + "loss": 0.529, + "step": 701400 + }, + { + "epoch": 16.155475729943817, + "grad_norm": 2.8554840087890625, + "learning_rate": 6.079536365090676e-06, + "loss": 0.5153, + "step": 701600 + }, + { + "epoch": 16.160081053698075, + "grad_norm": 2.701791524887085, + "learning_rate": 6.072264801268161e-06, + "loss": 0.5385, + "step": 701800 + }, + { + "epoch": 16.164686377452334, + "grad_norm": 2.9476139545440674, + "learning_rate": 6.0649932374456455e-06, + "loss": 0.5344, + "step": 702000 + }, + { + "epoch": 16.169291701206596, + "grad_norm": 3.677595853805542, + "learning_rate": 6.057721673623129e-06, + "loss": 0.5302, + "step": 702200 + }, + { + "epoch": 16.173897024960855, + "grad_norm": 3.322723388671875, + "learning_rate": 6.050450109800614e-06, + "loss": 0.5301, + "step": 702400 + }, + { + "epoch": 16.178502348715114, + "grad_norm": 3.177597761154175, + "learning_rate": 6.043178545978098e-06, + "loss": 0.5293, + "step": 702600 + }, + { + "epoch": 16.183107672469376, + "grad_norm": 2.9423975944519043, + "learning_rate": 6.035906982155583e-06, + "loss": 0.5263, + "step": 702800 + }, + { + "epoch": 16.187712996223635, + "grad_norm": 3.1583240032196045, + "learning_rate": 6.028635418333067e-06, + "loss": 0.5101, + "step": 703000 + }, + { + "epoch": 16.192318319977893, + "grad_norm": 2.8584578037261963, + "learning_rate": 6.021363854510551e-06, + "loss": 0.523, + "step": 703200 + }, + { + "epoch": 16.196923643732156, + "grad_norm": 2.663959503173828, + "learning_rate": 6.014092290688036e-06, + "loss": 0.5341, + "step": 703400 + }, + { + "epoch": 16.201528967486414, + "grad_norm": 3.1101975440979004, + "learning_rate": 6.00682072686552e-06, + "loss": 0.5324, + "step": 703600 + }, + { + "epoch": 16.206134291240673, + "grad_norm": 3.0388479232788086, + "learning_rate": 5.9995491630430035e-06, + "loss": 0.5287, + "step": 703800 + }, + { + "epoch": 16.210739614994935, + "grad_norm": 2.731867790222168, + "learning_rate": 5.9922775992204886e-06, + "loss": 0.5305, + "step": 704000 + }, + { + "epoch": 16.215344938749194, + "grad_norm": 3.142996311187744, + "learning_rate": 5.985006035397973e-06, + "loss": 0.5323, + "step": 704200 + }, + { + "epoch": 16.219950262503453, + "grad_norm": 3.147834062576294, + "learning_rate": 5.977734471575458e-06, + "loss": 0.5253, + "step": 704400 + }, + { + "epoch": 16.224555586257715, + "grad_norm": 2.3879802227020264, + "learning_rate": 5.970462907752941e-06, + "loss": 0.5245, + "step": 704600 + }, + { + "epoch": 16.229160910011974, + "grad_norm": 2.7794997692108154, + "learning_rate": 5.9631913439304254e-06, + "loss": 0.5124, + "step": 704800 + }, + { + "epoch": 16.233766233766232, + "grad_norm": 3.1171607971191406, + "learning_rate": 5.9559197801079105e-06, + "loss": 0.5293, + "step": 705000 + }, + { + "epoch": 16.238371557520495, + "grad_norm": 3.1548678874969482, + "learning_rate": 5.948648216285395e-06, + "loss": 0.523, + "step": 705200 + }, + { + "epoch": 16.242976881274753, + "grad_norm": 3.2496373653411865, + "learning_rate": 5.941376652462878e-06, + "loss": 0.5198, + "step": 705400 + }, + { + "epoch": 16.247582205029012, + "grad_norm": 3.0726122856140137, + "learning_rate": 5.934105088640363e-06, + "loss": 0.5216, + "step": 705600 + }, + { + "epoch": 16.252187528783274, + "grad_norm": 3.4089269638061523, + "learning_rate": 5.92686988263696e-06, + "loss": 0.5391, + "step": 705800 + }, + { + "epoch": 16.256792852537533, + "grad_norm": 2.813822031021118, + "learning_rate": 5.919598318814444e-06, + "loss": 0.5194, + "step": 706000 + }, + { + "epoch": 16.26139817629179, + "grad_norm": 3.2995693683624268, + "learning_rate": 5.912326754991929e-06, + "loss": 0.5242, + "step": 706200 + }, + { + "epoch": 16.266003500046054, + "grad_norm": 3.632739543914795, + "learning_rate": 5.905055191169413e-06, + "loss": 0.5327, + "step": 706400 + }, + { + "epoch": 16.270608823800313, + "grad_norm": 2.6692042350769043, + "learning_rate": 5.897783627346897e-06, + "loss": 0.5227, + "step": 706600 + }, + { + "epoch": 16.27521414755457, + "grad_norm": 3.4040117263793945, + "learning_rate": 5.890512063524382e-06, + "loss": 0.524, + "step": 706800 + }, + { + "epoch": 16.279819471308834, + "grad_norm": 2.4502665996551514, + "learning_rate": 5.883240499701866e-06, + "loss": 0.5203, + "step": 707000 + }, + { + "epoch": 16.284424795063092, + "grad_norm": 3.1462149620056152, + "learning_rate": 5.8759689358793504e-06, + "loss": 0.5262, + "step": 707200 + }, + { + "epoch": 16.289030118817355, + "grad_norm": 3.5388402938842773, + "learning_rate": 5.868697372056835e-06, + "loss": 0.5278, + "step": 707400 + }, + { + "epoch": 16.293635442571613, + "grad_norm": 2.5811402797698975, + "learning_rate": 5.861425808234319e-06, + "loss": 0.5255, + "step": 707600 + }, + { + "epoch": 16.298240766325872, + "grad_norm": 2.9123117923736572, + "learning_rate": 5.854154244411804e-06, + "loss": 0.5326, + "step": 707800 + }, + { + "epoch": 16.302846090080134, + "grad_norm": 3.3094441890716553, + "learning_rate": 5.846919038408401e-06, + "loss": 0.5216, + "step": 708000 + }, + { + "epoch": 16.307451413834393, + "grad_norm": 3.1968655586242676, + "learning_rate": 5.839647474585885e-06, + "loss": 0.5238, + "step": 708200 + }, + { + "epoch": 16.31205673758865, + "grad_norm": 3.054832935333252, + "learning_rate": 5.8323759107633685e-06, + "loss": 0.5256, + "step": 708400 + }, + { + "epoch": 16.316662061342914, + "grad_norm": 2.9398386478424072, + "learning_rate": 5.8251043469408535e-06, + "loss": 0.5221, + "step": 708600 + }, + { + "epoch": 16.321267385097173, + "grad_norm": 2.4585120677948, + "learning_rate": 5.817832783118338e-06, + "loss": 0.5256, + "step": 708800 + }, + { + "epoch": 16.32587270885143, + "grad_norm": 3.5368127822875977, + "learning_rate": 5.810561219295822e-06, + "loss": 0.5297, + "step": 709000 + }, + { + "epoch": 16.330478032605694, + "grad_norm": 3.043287515640259, + "learning_rate": 5.803289655473306e-06, + "loss": 0.5319, + "step": 709200 + }, + { + "epoch": 16.335083356359952, + "grad_norm": 3.1237127780914307, + "learning_rate": 5.79601809165079e-06, + "loss": 0.5316, + "step": 709400 + }, + { + "epoch": 16.33968868011421, + "grad_norm": 3.545804500579834, + "learning_rate": 5.7887465278282755e-06, + "loss": 0.5328, + "step": 709600 + }, + { + "epoch": 16.344294003868473, + "grad_norm": 3.1963136196136475, + "learning_rate": 5.78147496400576e-06, + "loss": 0.5225, + "step": 709800 + }, + { + "epoch": 16.348899327622732, + "grad_norm": 3.5646724700927734, + "learning_rate": 5.774203400183243e-06, + "loss": 0.5219, + "step": 710000 + }, + { + "epoch": 16.35350465137699, + "grad_norm": 3.244006395339966, + "learning_rate": 5.766931836360728e-06, + "loss": 0.5275, + "step": 710200 + }, + { + "epoch": 16.358109975131253, + "grad_norm": 3.042506456375122, + "learning_rate": 5.759660272538212e-06, + "loss": 0.5267, + "step": 710400 + }, + { + "epoch": 16.36271529888551, + "grad_norm": 3.204932689666748, + "learning_rate": 5.752425066534809e-06, + "loss": 0.5237, + "step": 710600 + }, + { + "epoch": 16.36732062263977, + "grad_norm": 4.271747589111328, + "learning_rate": 5.7451535027122935e-06, + "loss": 0.5332, + "step": 710800 + }, + { + "epoch": 16.371925946394033, + "grad_norm": 2.5158495903015137, + "learning_rate": 5.737881938889778e-06, + "loss": 0.5368, + "step": 711000 + }, + { + "epoch": 16.37653127014829, + "grad_norm": 3.7071948051452637, + "learning_rate": 5.730610375067262e-06, + "loss": 0.5206, + "step": 711200 + }, + { + "epoch": 16.38113659390255, + "grad_norm": 3.480548858642578, + "learning_rate": 5.723338811244747e-06, + "loss": 0.5189, + "step": 711400 + }, + { + "epoch": 16.385741917656812, + "grad_norm": 3.0035040378570557, + "learning_rate": 5.716067247422231e-06, + "loss": 0.527, + "step": 711600 + }, + { + "epoch": 16.39034724141107, + "grad_norm": 3.1353304386138916, + "learning_rate": 5.7087956835997146e-06, + "loss": 0.5278, + "step": 711800 + }, + { + "epoch": 16.39495256516533, + "grad_norm": 3.6932437419891357, + "learning_rate": 5.7015241197772e-06, + "loss": 0.5242, + "step": 712000 + }, + { + "epoch": 16.399557888919592, + "grad_norm": 3.175496816635132, + "learning_rate": 5.694252555954684e-06, + "loss": 0.5197, + "step": 712200 + }, + { + "epoch": 16.40416321267385, + "grad_norm": 3.2287960052490234, + "learning_rate": 5.686980992132168e-06, + "loss": 0.5237, + "step": 712400 + }, + { + "epoch": 16.40876853642811, + "grad_norm": 3.57425594329834, + "learning_rate": 5.679709428309652e-06, + "loss": 0.5292, + "step": 712600 + }, + { + "epoch": 16.41337386018237, + "grad_norm": 3.088691234588623, + "learning_rate": 5.6724378644871365e-06, + "loss": 0.5241, + "step": 712800 + }, + { + "epoch": 16.41797918393663, + "grad_norm": 2.952932119369507, + "learning_rate": 5.665202658483733e-06, + "loss": 0.5209, + "step": 713000 + }, + { + "epoch": 16.42258450769089, + "grad_norm": 3.005553960800171, + "learning_rate": 5.6579310946612185e-06, + "loss": 0.5168, + "step": 713200 + }, + { + "epoch": 16.42718983144515, + "grad_norm": 3.226658821105957, + "learning_rate": 5.650659530838703e-06, + "loss": 0.525, + "step": 713400 + }, + { + "epoch": 16.43179515519941, + "grad_norm": 2.5592401027679443, + "learning_rate": 5.643387967016186e-06, + "loss": 0.5251, + "step": 713600 + }, + { + "epoch": 16.436400478953672, + "grad_norm": 3.2084832191467285, + "learning_rate": 5.636116403193671e-06, + "loss": 0.5257, + "step": 713800 + }, + { + "epoch": 16.44100580270793, + "grad_norm": 3.55430006980896, + "learning_rate": 5.628844839371155e-06, + "loss": 0.5308, + "step": 714000 + }, + { + "epoch": 16.44561112646219, + "grad_norm": 3.086090564727783, + "learning_rate": 5.62157327554864e-06, + "loss": 0.5316, + "step": 714200 + }, + { + "epoch": 16.450216450216452, + "grad_norm": 3.0008442401885986, + "learning_rate": 5.614301711726124e-06, + "loss": 0.5259, + "step": 714400 + }, + { + "epoch": 16.45482177397071, + "grad_norm": 2.9424831867218018, + "learning_rate": 5.607030147903608e-06, + "loss": 0.5265, + "step": 714600 + }, + { + "epoch": 16.45942709772497, + "grad_norm": 3.0486032962799072, + "learning_rate": 5.599758584081093e-06, + "loss": 0.5277, + "step": 714800 + }, + { + "epoch": 16.46403242147923, + "grad_norm": 2.7830045223236084, + "learning_rate": 5.592487020258577e-06, + "loss": 0.5324, + "step": 715000 + }, + { + "epoch": 16.46863774523349, + "grad_norm": 3.4023308753967285, + "learning_rate": 5.585215456436061e-06, + "loss": 0.5357, + "step": 715200 + }, + { + "epoch": 16.47324306898775, + "grad_norm": 3.0721235275268555, + "learning_rate": 5.577943892613546e-06, + "loss": 0.5222, + "step": 715400 + }, + { + "epoch": 16.47784839274201, + "grad_norm": 3.580204486846924, + "learning_rate": 5.57067232879103e-06, + "loss": 0.5206, + "step": 715600 + }, + { + "epoch": 16.48245371649627, + "grad_norm": 3.269157648086548, + "learning_rate": 5.563400764968514e-06, + "loss": 0.5242, + "step": 715800 + }, + { + "epoch": 16.48705904025053, + "grad_norm": 4.157374858856201, + "learning_rate": 5.556129201145998e-06, + "loss": 0.5103, + "step": 716000 + }, + { + "epoch": 16.49166436400479, + "grad_norm": 3.5531797409057617, + "learning_rate": 5.548857637323483e-06, + "loss": 0.5272, + "step": 716200 + }, + { + "epoch": 16.49626968775905, + "grad_norm": 3.210127353668213, + "learning_rate": 5.541586073500967e-06, + "loss": 0.5267, + "step": 716400 + }, + { + "epoch": 16.50087501151331, + "grad_norm": 3.227973461151123, + "learning_rate": 5.534314509678452e-06, + "loss": 0.5219, + "step": 716600 + }, + { + "epoch": 16.50548033526757, + "grad_norm": 3.346400499343872, + "learning_rate": 5.527042945855936e-06, + "loss": 0.532, + "step": 716800 + }, + { + "epoch": 16.51008565902183, + "grad_norm": 3.1097564697265625, + "learning_rate": 5.51977138203342e-06, + "loss": 0.5206, + "step": 717000 + }, + { + "epoch": 16.514690982776088, + "grad_norm": 3.079420328140259, + "learning_rate": 5.5124998182109046e-06, + "loss": 0.5197, + "step": 717200 + }, + { + "epoch": 16.51929630653035, + "grad_norm": 2.728698968887329, + "learning_rate": 5.505228254388389e-06, + "loss": 0.5102, + "step": 717400 + }, + { + "epoch": 16.52390163028461, + "grad_norm": 2.969097137451172, + "learning_rate": 5.497993048384986e-06, + "loss": 0.517, + "step": 717600 + }, + { + "epoch": 16.528506954038868, + "grad_norm": 4.207714557647705, + "learning_rate": 5.490721484562471e-06, + "loss": 0.5329, + "step": 717800 + }, + { + "epoch": 16.53311227779313, + "grad_norm": 3.510308027267456, + "learning_rate": 5.483449920739954e-06, + "loss": 0.5141, + "step": 718000 + }, + { + "epoch": 16.53771760154739, + "grad_norm": 3.0636706352233887, + "learning_rate": 5.476178356917438e-06, + "loss": 0.5215, + "step": 718200 + }, + { + "epoch": 16.542322925301647, + "grad_norm": 3.4108800888061523, + "learning_rate": 5.468943150914035e-06, + "loss": 0.5338, + "step": 718400 + }, + { + "epoch": 16.54692824905591, + "grad_norm": 3.3463311195373535, + "learning_rate": 5.46167158709152e-06, + "loss": 0.5261, + "step": 718600 + }, + { + "epoch": 16.55153357281017, + "grad_norm": 3.2347726821899414, + "learning_rate": 5.4544000232690045e-06, + "loss": 0.5086, + "step": 718800 + }, + { + "epoch": 16.556138896564427, + "grad_norm": 3.8457565307617188, + "learning_rate": 5.447128459446489e-06, + "loss": 0.5202, + "step": 719000 + }, + { + "epoch": 16.56074422031869, + "grad_norm": 2.905299425125122, + "learning_rate": 5.439856895623973e-06, + "loss": 0.5187, + "step": 719200 + }, + { + "epoch": 16.565349544072948, + "grad_norm": 3.7381389141082764, + "learning_rate": 5.432585331801457e-06, + "loss": 0.5302, + "step": 719400 + }, + { + "epoch": 16.569954867827207, + "grad_norm": 2.6707417964935303, + "learning_rate": 5.425313767978942e-06, + "loss": 0.534, + "step": 719600 + }, + { + "epoch": 16.57456019158147, + "grad_norm": 2.757823944091797, + "learning_rate": 5.418042204156426e-06, + "loss": 0.5162, + "step": 719800 + }, + { + "epoch": 16.579165515335728, + "grad_norm": 2.5773825645446777, + "learning_rate": 5.41077064033391e-06, + "loss": 0.527, + "step": 720000 + }, + { + "epoch": 16.583770839089986, + "grad_norm": 3.3267204761505127, + "learning_rate": 5.403499076511395e-06, + "loss": 0.5241, + "step": 720200 + }, + { + "epoch": 16.58837616284425, + "grad_norm": 2.8863368034362793, + "learning_rate": 5.396227512688879e-06, + "loss": 0.5274, + "step": 720400 + }, + { + "epoch": 16.592981486598507, + "grad_norm": 3.51645827293396, + "learning_rate": 5.388955948866363e-06, + "loss": 0.5319, + "step": 720600 + }, + { + "epoch": 16.59758681035277, + "grad_norm": 3.1848063468933105, + "learning_rate": 5.381684385043848e-06, + "loss": 0.5271, + "step": 720800 + }, + { + "epoch": 16.602192134107028, + "grad_norm": 3.438476085662842, + "learning_rate": 5.374412821221332e-06, + "loss": 0.5299, + "step": 721000 + }, + { + "epoch": 16.606797457861287, + "grad_norm": 3.2890145778656006, + "learning_rate": 5.367141257398817e-06, + "loss": 0.5313, + "step": 721200 + }, + { + "epoch": 16.61140278161555, + "grad_norm": 3.603616952896118, + "learning_rate": 5.3598696935763e-06, + "loss": 0.5249, + "step": 721400 + }, + { + "epoch": 16.616008105369808, + "grad_norm": 3.5745153427124023, + "learning_rate": 5.3525981297537845e-06, + "loss": 0.5234, + "step": 721600 + }, + { + "epoch": 16.620613429124067, + "grad_norm": 2.906506061553955, + "learning_rate": 5.3453265659312695e-06, + "loss": 0.5255, + "step": 721800 + }, + { + "epoch": 16.62521875287833, + "grad_norm": 3.5672848224639893, + "learning_rate": 5.338055002108754e-06, + "loss": 0.5171, + "step": 722000 + }, + { + "epoch": 16.629824076632588, + "grad_norm": 2.794722080230713, + "learning_rate": 5.330783438286238e-06, + "loss": 0.528, + "step": 722200 + }, + { + "epoch": 16.634429400386846, + "grad_norm": 2.808533191680908, + "learning_rate": 5.323511874463722e-06, + "loss": 0.5162, + "step": 722400 + }, + { + "epoch": 16.63903472414111, + "grad_norm": 2.948071241378784, + "learning_rate": 5.316240310641206e-06, + "loss": 0.522, + "step": 722600 + }, + { + "epoch": 16.643640047895367, + "grad_norm": 3.144495964050293, + "learning_rate": 5.3089687468186915e-06, + "loss": 0.5295, + "step": 722800 + }, + { + "epoch": 16.648245371649626, + "grad_norm": 3.648850440979004, + "learning_rate": 5.301697182996175e-06, + "loss": 0.5313, + "step": 723000 + }, + { + "epoch": 16.652850695403888, + "grad_norm": 3.740377902984619, + "learning_rate": 5.294425619173659e-06, + "loss": 0.5188, + "step": 723200 + }, + { + "epoch": 16.657456019158147, + "grad_norm": 3.226428270339966, + "learning_rate": 5.287154055351144e-06, + "loss": 0.5244, + "step": 723400 + }, + { + "epoch": 16.662061342912406, + "grad_norm": 3.131943941116333, + "learning_rate": 5.279882491528628e-06, + "loss": 0.5309, + "step": 723600 + }, + { + "epoch": 16.666666666666668, + "grad_norm": 3.345491409301758, + "learning_rate": 5.2726109277061126e-06, + "loss": 0.5319, + "step": 723800 + }, + { + "epoch": 16.671271990420927, + "grad_norm": 2.5959975719451904, + "learning_rate": 5.265339363883597e-06, + "loss": 0.5249, + "step": 724000 + }, + { + "epoch": 16.675877314175185, + "grad_norm": 2.993213415145874, + "learning_rate": 5.258067800061081e-06, + "loss": 0.5219, + "step": 724200 + }, + { + "epoch": 16.680482637929448, + "grad_norm": 3.5764873027801514, + "learning_rate": 5.250796236238566e-06, + "loss": 0.5264, + "step": 724400 + }, + { + "epoch": 16.685087961683706, + "grad_norm": 3.0356667041778564, + "learning_rate": 5.24352467241605e-06, + "loss": 0.5139, + "step": 724600 + }, + { + "epoch": 16.689693285437965, + "grad_norm": 2.6434290409088135, + "learning_rate": 5.236253108593534e-06, + "loss": 0.5276, + "step": 724800 + }, + { + "epoch": 16.694298609192227, + "grad_norm": 3.0571916103363037, + "learning_rate": 5.228981544771019e-06, + "loss": 0.5153, + "step": 725000 + }, + { + "epoch": 16.698903932946486, + "grad_norm": 3.077500104904175, + "learning_rate": 5.221709980948503e-06, + "loss": 0.5254, + "step": 725200 + }, + { + "epoch": 16.703509256700745, + "grad_norm": 3.5419986248016357, + "learning_rate": 5.214438417125988e-06, + "loss": 0.5232, + "step": 725400 + }, + { + "epoch": 16.708114580455007, + "grad_norm": 2.9213297367095947, + "learning_rate": 5.207166853303471e-06, + "loss": 0.5284, + "step": 725600 + }, + { + "epoch": 16.712719904209266, + "grad_norm": 3.5840189456939697, + "learning_rate": 5.199895289480956e-06, + "loss": 0.532, + "step": 725800 + }, + { + "epoch": 16.717325227963524, + "grad_norm": 3.3302419185638428, + "learning_rate": 5.1926600834775525e-06, + "loss": 0.5257, + "step": 726000 + }, + { + "epoch": 16.721930551717787, + "grad_norm": 3.81793212890625, + "learning_rate": 5.1854248774741494e-06, + "loss": 0.5225, + "step": 726200 + }, + { + "epoch": 16.726535875472045, + "grad_norm": 3.0137243270874023, + "learning_rate": 5.1781533136516345e-06, + "loss": 0.5172, + "step": 726400 + }, + { + "epoch": 16.731141199226304, + "grad_norm": 2.965055465698242, + "learning_rate": 5.170881749829119e-06, + "loss": 0.5248, + "step": 726600 + }, + { + "epoch": 16.735746522980566, + "grad_norm": 2.7242608070373535, + "learning_rate": 5.163610186006602e-06, + "loss": 0.5156, + "step": 726800 + }, + { + "epoch": 16.740351846734825, + "grad_norm": 2.787821054458618, + "learning_rate": 5.156338622184087e-06, + "loss": 0.5251, + "step": 727000 + }, + { + "epoch": 16.744957170489087, + "grad_norm": 3.265836238861084, + "learning_rate": 5.149067058361571e-06, + "loss": 0.5232, + "step": 727200 + }, + { + "epoch": 16.749562494243346, + "grad_norm": 2.891186475753784, + "learning_rate": 5.1417954945390564e-06, + "loss": 0.5294, + "step": 727400 + }, + { + "epoch": 16.754167817997605, + "grad_norm": 4.678770065307617, + "learning_rate": 5.13452393071654e-06, + "loss": 0.5194, + "step": 727600 + }, + { + "epoch": 16.758773141751867, + "grad_norm": 3.1750643253326416, + "learning_rate": 5.127252366894024e-06, + "loss": 0.5256, + "step": 727800 + }, + { + "epoch": 16.763378465506126, + "grad_norm": 4.042088508605957, + "learning_rate": 5.119980803071509e-06, + "loss": 0.5231, + "step": 728000 + }, + { + "epoch": 16.767983789260384, + "grad_norm": 3.6129207611083984, + "learning_rate": 5.112709239248993e-06, + "loss": 0.5343, + "step": 728200 + }, + { + "epoch": 16.772589113014647, + "grad_norm": 2.856450319290161, + "learning_rate": 5.105437675426477e-06, + "loss": 0.5233, + "step": 728400 + }, + { + "epoch": 16.777194436768905, + "grad_norm": 3.1779849529266357, + "learning_rate": 5.098166111603962e-06, + "loss": 0.5195, + "step": 728600 + }, + { + "epoch": 16.781799760523164, + "grad_norm": 3.1687657833099365, + "learning_rate": 5.090894547781446e-06, + "loss": 0.5315, + "step": 728800 + }, + { + "epoch": 16.786405084277426, + "grad_norm": 2.3541252613067627, + "learning_rate": 5.083622983958931e-06, + "loss": 0.5206, + "step": 729000 + }, + { + "epoch": 16.791010408031685, + "grad_norm": 3.6288070678710938, + "learning_rate": 5.076351420136414e-06, + "loss": 0.5189, + "step": 729200 + }, + { + "epoch": 16.795615731785944, + "grad_norm": 3.1729352474212646, + "learning_rate": 5.069079856313899e-06, + "loss": 0.5242, + "step": 729400 + }, + { + "epoch": 16.800221055540206, + "grad_norm": 3.039484739303589, + "learning_rate": 5.061808292491384e-06, + "loss": 0.5298, + "step": 729600 + }, + { + "epoch": 16.804826379294465, + "grad_norm": 3.6854867935180664, + "learning_rate": 5.0546094443070924e-06, + "loss": 0.5296, + "step": 729800 + }, + { + "epoch": 16.809431703048723, + "grad_norm": 3.062591791152954, + "learning_rate": 5.0473378804845775e-06, + "loss": 0.5335, + "step": 730000 + }, + { + "epoch": 16.814037026802986, + "grad_norm": 3.3723227977752686, + "learning_rate": 5.040066316662062e-06, + "loss": 0.5266, + "step": 730200 + }, + { + "epoch": 16.818642350557244, + "grad_norm": 2.7272493839263916, + "learning_rate": 5.032794752839545e-06, + "loss": 0.5222, + "step": 730400 + }, + { + "epoch": 16.823247674311503, + "grad_norm": 3.0595741271972656, + "learning_rate": 5.02552318901703e-06, + "loss": 0.5252, + "step": 730600 + }, + { + "epoch": 16.827852998065765, + "grad_norm": 3.748103380203247, + "learning_rate": 5.018251625194514e-06, + "loss": 0.5266, + "step": 730800 + }, + { + "epoch": 16.832458321820024, + "grad_norm": 2.6202964782714844, + "learning_rate": 5.0109800613719995e-06, + "loss": 0.5449, + "step": 731000 + }, + { + "epoch": 16.837063645574283, + "grad_norm": 2.5717170238494873, + "learning_rate": 5.003708497549483e-06, + "loss": 0.5277, + "step": 731200 + }, + { + "epoch": 16.841668969328545, + "grad_norm": 2.716874361038208, + "learning_rate": 4.996436933726967e-06, + "loss": 0.5235, + "step": 731400 + }, + { + "epoch": 16.846274293082804, + "grad_norm": 3.9380040168762207, + "learning_rate": 4.989165369904452e-06, + "loss": 0.5278, + "step": 731600 + }, + { + "epoch": 16.850879616837062, + "grad_norm": 3.0047407150268555, + "learning_rate": 4.981893806081936e-06, + "loss": 0.5331, + "step": 731800 + }, + { + "epoch": 16.855484940591325, + "grad_norm": 3.2942941188812256, + "learning_rate": 4.97462224225942e-06, + "loss": 0.5394, + "step": 732000 + }, + { + "epoch": 16.860090264345583, + "grad_norm": 2.960007905960083, + "learning_rate": 4.967350678436905e-06, + "loss": 0.5284, + "step": 732200 + }, + { + "epoch": 16.864695588099842, + "grad_norm": 4.325229167938232, + "learning_rate": 4.960079114614389e-06, + "loss": 0.5142, + "step": 732400 + }, + { + "epoch": 16.869300911854104, + "grad_norm": 3.9659183025360107, + "learning_rate": 4.952807550791874e-06, + "loss": 0.5277, + "step": 732600 + }, + { + "epoch": 16.873906235608363, + "grad_norm": 4.010354042053223, + "learning_rate": 4.9455359869693574e-06, + "loss": 0.545, + "step": 732800 + }, + { + "epoch": 16.87851155936262, + "grad_norm": 2.861638307571411, + "learning_rate": 4.938264423146842e-06, + "loss": 0.5232, + "step": 733000 + }, + { + "epoch": 16.883116883116884, + "grad_norm": 3.198781967163086, + "learning_rate": 4.930992859324327e-06, + "loss": 0.5184, + "step": 733200 + }, + { + "epoch": 16.887722206871143, + "grad_norm": 3.7329859733581543, + "learning_rate": 4.923721295501811e-06, + "loss": 0.5179, + "step": 733400 + }, + { + "epoch": 16.8923275306254, + "grad_norm": 3.9860944747924805, + "learning_rate": 4.916449731679295e-06, + "loss": 0.5298, + "step": 733600 + }, + { + "epoch": 16.896932854379664, + "grad_norm": 3.4595282077789307, + "learning_rate": 4.909178167856779e-06, + "loss": 0.5261, + "step": 733800 + }, + { + "epoch": 16.901538178133922, + "grad_norm": 3.319841146469116, + "learning_rate": 4.901906604034264e-06, + "loss": 0.5225, + "step": 734000 + }, + { + "epoch": 16.90614350188818, + "grad_norm": 3.025320529937744, + "learning_rate": 4.894635040211749e-06, + "loss": 0.5423, + "step": 734200 + }, + { + "epoch": 16.910748825642443, + "grad_norm": 3.0882341861724854, + "learning_rate": 4.887363476389233e-06, + "loss": 0.5239, + "step": 734400 + }, + { + "epoch": 16.915354149396702, + "grad_norm": 3.2497994899749756, + "learning_rate": 4.880091912566716e-06, + "loss": 0.5168, + "step": 734600 + }, + { + "epoch": 16.919959473150964, + "grad_norm": 3.428805351257324, + "learning_rate": 4.872820348744201e-06, + "loss": 0.5209, + "step": 734800 + }, + { + "epoch": 16.924564796905223, + "grad_norm": 3.8149008750915527, + "learning_rate": 4.8655487849216855e-06, + "loss": 0.5226, + "step": 735000 + }, + { + "epoch": 16.92917012065948, + "grad_norm": 2.430985450744629, + "learning_rate": 4.85827722109917e-06, + "loss": 0.5272, + "step": 735200 + }, + { + "epoch": 16.933775444413744, + "grad_norm": 3.142275333404541, + "learning_rate": 4.851005657276654e-06, + "loss": 0.5294, + "step": 735400 + }, + { + "epoch": 16.938380768168003, + "grad_norm": 3.180042028427124, + "learning_rate": 4.843734093454138e-06, + "loss": 0.5203, + "step": 735600 + }, + { + "epoch": 16.94298609192226, + "grad_norm": 3.004331350326538, + "learning_rate": 4.836462529631623e-06, + "loss": 0.5244, + "step": 735800 + }, + { + "epoch": 16.947591415676523, + "grad_norm": 2.585542678833008, + "learning_rate": 4.8291909658091075e-06, + "loss": 0.5289, + "step": 736000 + }, + { + "epoch": 16.952196739430782, + "grad_norm": 3.3415260314941406, + "learning_rate": 4.821919401986591e-06, + "loss": 0.5253, + "step": 736200 + }, + { + "epoch": 16.95680206318504, + "grad_norm": 3.3344316482543945, + "learning_rate": 4.814647838164076e-06, + "loss": 0.5234, + "step": 736400 + }, + { + "epoch": 16.961407386939303, + "grad_norm": 2.819533348083496, + "learning_rate": 4.807412632160673e-06, + "loss": 0.5313, + "step": 736600 + }, + { + "epoch": 16.966012710693562, + "grad_norm": 3.417186737060547, + "learning_rate": 4.80017742615727e-06, + "loss": 0.5333, + "step": 736800 + }, + { + "epoch": 16.97061803444782, + "grad_norm": 3.4736814498901367, + "learning_rate": 4.792905862334754e-06, + "loss": 0.5133, + "step": 737000 + }, + { + "epoch": 16.975223358202083, + "grad_norm": 2.8336431980133057, + "learning_rate": 4.785634298512238e-06, + "loss": 0.5268, + "step": 737200 + }, + { + "epoch": 16.97982868195634, + "grad_norm": 3.896120071411133, + "learning_rate": 4.778362734689722e-06, + "loss": 0.5304, + "step": 737400 + }, + { + "epoch": 16.9844340057106, + "grad_norm": 3.212686777114868, + "learning_rate": 4.771127528686319e-06, + "loss": 0.5216, + "step": 737600 + }, + { + "epoch": 16.989039329464863, + "grad_norm": 2.8021464347839355, + "learning_rate": 4.7638559648638035e-06, + "loss": 0.5258, + "step": 737800 + }, + { + "epoch": 16.99364465321912, + "grad_norm": 2.3054471015930176, + "learning_rate": 4.756584401041289e-06, + "loss": 0.5119, + "step": 738000 + }, + { + "epoch": 16.99824997697338, + "grad_norm": 3.4670674800872803, + "learning_rate": 4.749312837218773e-06, + "loss": 0.5228, + "step": 738200 + }, + { + "epoch": 17.0, + "eval_loss": 0.513902485370636, + "eval_runtime": 169.6655, + "eval_samples_per_second": 167.158, + "eval_steps_per_second": 10.45, + "step": 738276 + }, + { + "epoch": 17.002855300727642, + "grad_norm": 4.795275688171387, + "learning_rate": 4.74207763121537e-06, + "loss": 0.5208, + "step": 738400 + }, + { + "epoch": 17.0074606244819, + "grad_norm": 2.918687582015991, + "learning_rate": 4.734806067392853e-06, + "loss": 0.5252, + "step": 738600 + }, + { + "epoch": 17.01206594823616, + "grad_norm": 2.214686632156372, + "learning_rate": 4.727534503570338e-06, + "loss": 0.5238, + "step": 738800 + }, + { + "epoch": 17.016671271990422, + "grad_norm": 3.4804065227508545, + "learning_rate": 4.720262939747822e-06, + "loss": 0.5244, + "step": 739000 + }, + { + "epoch": 17.02127659574468, + "grad_norm": 2.5966029167175293, + "learning_rate": 4.712991375925307e-06, + "loss": 0.5301, + "step": 739200 + }, + { + "epoch": 17.02588191949894, + "grad_norm": 2.7424144744873047, + "learning_rate": 4.705719812102791e-06, + "loss": 0.5181, + "step": 739400 + }, + { + "epoch": 17.0304872432532, + "grad_norm": 2.9380509853363037, + "learning_rate": 4.698448248280275e-06, + "loss": 0.5324, + "step": 739600 + }, + { + "epoch": 17.03509256700746, + "grad_norm": 2.6877858638763428, + "learning_rate": 4.69117668445776e-06, + "loss": 0.5186, + "step": 739800 + }, + { + "epoch": 17.03969789076172, + "grad_norm": 2.7260663509368896, + "learning_rate": 4.683905120635244e-06, + "loss": 0.5212, + "step": 740000 + }, + { + "epoch": 17.04430321451598, + "grad_norm": 3.156268358230591, + "learning_rate": 4.676633556812728e-06, + "loss": 0.5262, + "step": 740200 + }, + { + "epoch": 17.04890853827024, + "grad_norm": 4.205811500549316, + "learning_rate": 4.669361992990213e-06, + "loss": 0.5204, + "step": 740400 + }, + { + "epoch": 17.0535138620245, + "grad_norm": 3.210843563079834, + "learning_rate": 4.662090429167697e-06, + "loss": 0.5324, + "step": 740600 + }, + { + "epoch": 17.05811918577876, + "grad_norm": 2.8761239051818848, + "learning_rate": 4.654818865345181e-06, + "loss": 0.5184, + "step": 740800 + }, + { + "epoch": 17.06272450953302, + "grad_norm": 2.9451096057891846, + "learning_rate": 4.647547301522665e-06, + "loss": 0.5132, + "step": 741000 + }, + { + "epoch": 17.067329833287282, + "grad_norm": 2.4379804134368896, + "learning_rate": 4.64027573770015e-06, + "loss": 0.5156, + "step": 741200 + }, + { + "epoch": 17.07193515704154, + "grad_norm": 2.7910115718841553, + "learning_rate": 4.633004173877635e-06, + "loss": 0.5234, + "step": 741400 + }, + { + "epoch": 17.0765404807958, + "grad_norm": 3.084688663482666, + "learning_rate": 4.625732610055119e-06, + "loss": 0.5324, + "step": 741600 + }, + { + "epoch": 17.08114580455006, + "grad_norm": 2.9508092403411865, + "learning_rate": 4.618461046232602e-06, + "loss": 0.512, + "step": 741800 + }, + { + "epoch": 17.08575112830432, + "grad_norm": 4.045099258422852, + "learning_rate": 4.611225840229199e-06, + "loss": 0.5115, + "step": 742000 + }, + { + "epoch": 17.09035645205858, + "grad_norm": 3.059542655944824, + "learning_rate": 4.603954276406684e-06, + "loss": 0.5239, + "step": 742200 + }, + { + "epoch": 17.09496177581284, + "grad_norm": 3.1609458923339844, + "learning_rate": 4.5966827125841685e-06, + "loss": 0.5215, + "step": 742400 + }, + { + "epoch": 17.0995670995671, + "grad_norm": 3.208566427230835, + "learning_rate": 4.589411148761653e-06, + "loss": 0.5125, + "step": 742600 + }, + { + "epoch": 17.10417242332136, + "grad_norm": 3.126858949661255, + "learning_rate": 4.582139584939137e-06, + "loss": 0.5145, + "step": 742800 + }, + { + "epoch": 17.10877774707562, + "grad_norm": 2.690805673599243, + "learning_rate": 4.574868021116621e-06, + "loss": 0.533, + "step": 743000 + }, + { + "epoch": 17.11338307082988, + "grad_norm": 3.323164939880371, + "learning_rate": 4.567596457294106e-06, + "loss": 0.5194, + "step": 743200 + }, + { + "epoch": 17.11798839458414, + "grad_norm": 2.800487995147705, + "learning_rate": 4.5603248934715904e-06, + "loss": 0.5217, + "step": 743400 + }, + { + "epoch": 17.1225937183384, + "grad_norm": 3.2171924114227295, + "learning_rate": 4.553053329649074e-06, + "loss": 0.5251, + "step": 743600 + }, + { + "epoch": 17.12719904209266, + "grad_norm": 2.8016812801361084, + "learning_rate": 4.545781765826559e-06, + "loss": 0.516, + "step": 743800 + }, + { + "epoch": 17.131804365846918, + "grad_norm": 2.8154873847961426, + "learning_rate": 4.538510202004043e-06, + "loss": 0.5167, + "step": 744000 + }, + { + "epoch": 17.13640968960118, + "grad_norm": 3.5362391471862793, + "learning_rate": 4.531238638181527e-06, + "loss": 0.5388, + "step": 744200 + }, + { + "epoch": 17.14101501335544, + "grad_norm": 3.001676321029663, + "learning_rate": 4.523967074359012e-06, + "loss": 0.5284, + "step": 744400 + }, + { + "epoch": 17.145620337109698, + "grad_norm": 2.9126548767089844, + "learning_rate": 4.516695510536496e-06, + "loss": 0.5217, + "step": 744600 + }, + { + "epoch": 17.15022566086396, + "grad_norm": 2.922006368637085, + "learning_rate": 4.509423946713981e-06, + "loss": 0.5176, + "step": 744800 + }, + { + "epoch": 17.15483098461822, + "grad_norm": 3.5556321144104004, + "learning_rate": 4.502152382891465e-06, + "loss": 0.5301, + "step": 745000 + }, + { + "epoch": 17.159436308372477, + "grad_norm": 2.726541519165039, + "learning_rate": 4.494880819068949e-06, + "loss": 0.5158, + "step": 745200 + }, + { + "epoch": 17.16404163212674, + "grad_norm": 2.8062782287597656, + "learning_rate": 4.4876092552464335e-06, + "loss": 0.5243, + "step": 745400 + }, + { + "epoch": 17.168646955880998, + "grad_norm": 2.6974618434906006, + "learning_rate": 4.480337691423918e-06, + "loss": 0.5189, + "step": 745600 + }, + { + "epoch": 17.173252279635257, + "grad_norm": 4.060593128204346, + "learning_rate": 4.473066127601402e-06, + "loss": 0.514, + "step": 745800 + }, + { + "epoch": 17.17785760338952, + "grad_norm": 2.953789472579956, + "learning_rate": 4.465794563778887e-06, + "loss": 0.5274, + "step": 746000 + }, + { + "epoch": 17.182462927143778, + "grad_norm": 2.9467928409576416, + "learning_rate": 4.45852299995637e-06, + "loss": 0.5094, + "step": 746200 + }, + { + "epoch": 17.187068250898037, + "grad_norm": 3.19561505317688, + "learning_rate": 4.4512514361338546e-06, + "loss": 0.5341, + "step": 746400 + }, + { + "epoch": 17.1916735746523, + "grad_norm": 2.521677017211914, + "learning_rate": 4.44397987231134e-06, + "loss": 0.525, + "step": 746600 + }, + { + "epoch": 17.196278898406558, + "grad_norm": 3.030155897140503, + "learning_rate": 4.436708308488824e-06, + "loss": 0.5327, + "step": 746800 + }, + { + "epoch": 17.200884222160816, + "grad_norm": 3.0446267127990723, + "learning_rate": 4.429436744666308e-06, + "loss": 0.5185, + "step": 747000 + }, + { + "epoch": 17.20548954591508, + "grad_norm": 3.0444605350494385, + "learning_rate": 4.422165180843792e-06, + "loss": 0.5256, + "step": 747200 + }, + { + "epoch": 17.210094869669337, + "grad_norm": 3.9111011028289795, + "learning_rate": 4.4148936170212765e-06, + "loss": 0.5247, + "step": 747400 + }, + { + "epoch": 17.214700193423596, + "grad_norm": 3.1988956928253174, + "learning_rate": 4.407622053198762e-06, + "loss": 0.5177, + "step": 747600 + }, + { + "epoch": 17.219305517177858, + "grad_norm": 3.0281171798706055, + "learning_rate": 4.400350489376245e-06, + "loss": 0.5212, + "step": 747800 + }, + { + "epoch": 17.223910840932117, + "grad_norm": 3.074202060699463, + "learning_rate": 4.393078925553729e-06, + "loss": 0.5171, + "step": 748000 + }, + { + "epoch": 17.22851616468638, + "grad_norm": 3.3265304565429688, + "learning_rate": 4.385807361731214e-06, + "loss": 0.5188, + "step": 748200 + }, + { + "epoch": 17.233121488440638, + "grad_norm": 3.696972370147705, + "learning_rate": 4.378572155727811e-06, + "loss": 0.5163, + "step": 748400 + }, + { + "epoch": 17.237726812194897, + "grad_norm": 2.553654193878174, + "learning_rate": 4.371300591905295e-06, + "loss": 0.5148, + "step": 748600 + }, + { + "epoch": 17.24233213594916, + "grad_norm": 3.0239148139953613, + "learning_rate": 4.36402902808278e-06, + "loss": 0.5086, + "step": 748800 + }, + { + "epoch": 17.246937459703418, + "grad_norm": 3.5661909580230713, + "learning_rate": 4.356757464260264e-06, + "loss": 0.5203, + "step": 749000 + }, + { + "epoch": 17.251542783457676, + "grad_norm": 3.1638379096984863, + "learning_rate": 4.349485900437748e-06, + "loss": 0.5089, + "step": 749200 + }, + { + "epoch": 17.25614810721194, + "grad_norm": 3.8323419094085693, + "learning_rate": 4.342214336615233e-06, + "loss": 0.527, + "step": 749400 + }, + { + "epoch": 17.260753430966197, + "grad_norm": 2.8561997413635254, + "learning_rate": 4.3349427727927165e-06, + "loss": 0.5227, + "step": 749600 + }, + { + "epoch": 17.265358754720456, + "grad_norm": 2.703752040863037, + "learning_rate": 4.327671208970201e-06, + "loss": 0.5293, + "step": 749800 + }, + { + "epoch": 17.269964078474718, + "grad_norm": 2.9282515048980713, + "learning_rate": 4.320399645147686e-06, + "loss": 0.5195, + "step": 750000 + }, + { + "epoch": 17.274569402228977, + "grad_norm": 3.2129111289978027, + "learning_rate": 4.31312808132517e-06, + "loss": 0.5199, + "step": 750200 + }, + { + "epoch": 17.279174725983236, + "grad_norm": 3.368345260620117, + "learning_rate": 4.305856517502654e-06, + "loss": 0.5246, + "step": 750400 + }, + { + "epoch": 17.283780049737498, + "grad_norm": 3.3532567024230957, + "learning_rate": 4.298621311499251e-06, + "loss": 0.5149, + "step": 750600 + }, + { + "epoch": 17.288385373491757, + "grad_norm": 3.7765021324157715, + "learning_rate": 4.291349747676735e-06, + "loss": 0.5282, + "step": 750800 + }, + { + "epoch": 17.292990697246015, + "grad_norm": 2.6797635555267334, + "learning_rate": 4.2840781838542195e-06, + "loss": 0.5261, + "step": 751000 + }, + { + "epoch": 17.297596021000277, + "grad_norm": 2.490145683288574, + "learning_rate": 4.276806620031705e-06, + "loss": 0.5254, + "step": 751200 + }, + { + "epoch": 17.302201344754536, + "grad_norm": 2.785992383956909, + "learning_rate": 4.269535056209188e-06, + "loss": 0.5196, + "step": 751400 + }, + { + "epoch": 17.306806668508795, + "grad_norm": 3.1607396602630615, + "learning_rate": 4.262263492386672e-06, + "loss": 0.5217, + "step": 751600 + }, + { + "epoch": 17.311411992263057, + "grad_norm": 3.664815902709961, + "learning_rate": 4.254991928564157e-06, + "loss": 0.5251, + "step": 751800 + }, + { + "epoch": 17.316017316017316, + "grad_norm": 3.4818427562713623, + "learning_rate": 4.247756722560754e-06, + "loss": 0.5282, + "step": 752000 + }, + { + "epoch": 17.320622639771575, + "grad_norm": 3.277512311935425, + "learning_rate": 4.240521516557351e-06, + "loss": 0.5343, + "step": 752200 + }, + { + "epoch": 17.325227963525837, + "grad_norm": 3.601278305053711, + "learning_rate": 4.233249952734835e-06, + "loss": 0.527, + "step": 752400 + }, + { + "epoch": 17.329833287280096, + "grad_norm": 3.438324451446533, + "learning_rate": 4.2259783889123195e-06, + "loss": 0.5108, + "step": 752600 + }, + { + "epoch": 17.334438611034354, + "grad_norm": 3.622999429702759, + "learning_rate": 4.218706825089804e-06, + "loss": 0.522, + "step": 752800 + }, + { + "epoch": 17.339043934788616, + "grad_norm": 2.718357563018799, + "learning_rate": 4.211435261267288e-06, + "loss": 0.5287, + "step": 753000 + }, + { + "epoch": 17.343649258542875, + "grad_norm": 3.1246633529663086, + "learning_rate": 4.204163697444773e-06, + "loss": 0.5293, + "step": 753200 + }, + { + "epoch": 17.348254582297134, + "grad_norm": 2.551449775695801, + "learning_rate": 4.196892133622257e-06, + "loss": 0.5222, + "step": 753400 + }, + { + "epoch": 17.352859906051396, + "grad_norm": 2.530979633331299, + "learning_rate": 4.189620569799741e-06, + "loss": 0.5185, + "step": 753600 + }, + { + "epoch": 17.357465229805655, + "grad_norm": 3.4675188064575195, + "learning_rate": 4.182349005977226e-06, + "loss": 0.5193, + "step": 753800 + }, + { + "epoch": 17.362070553559914, + "grad_norm": 3.597015619277954, + "learning_rate": 4.17507744215471e-06, + "loss": 0.5124, + "step": 754000 + }, + { + "epoch": 17.366675877314176, + "grad_norm": 3.48524808883667, + "learning_rate": 4.167805878332195e-06, + "loss": 0.5167, + "step": 754200 + }, + { + "epoch": 17.371281201068435, + "grad_norm": 2.2207281589508057, + "learning_rate": 4.160534314509678e-06, + "loss": 0.5343, + "step": 754400 + }, + { + "epoch": 17.375886524822697, + "grad_norm": 3.222555160522461, + "learning_rate": 4.1532627506871626e-06, + "loss": 0.5166, + "step": 754600 + }, + { + "epoch": 17.380491848576956, + "grad_norm": 3.4512622356414795, + "learning_rate": 4.145991186864648e-06, + "loss": 0.5148, + "step": 754800 + }, + { + "epoch": 17.385097172331214, + "grad_norm": 2.870035171508789, + "learning_rate": 4.138719623042132e-06, + "loss": 0.5195, + "step": 755000 + }, + { + "epoch": 17.389702496085476, + "grad_norm": 3.166290521621704, + "learning_rate": 4.131448059219615e-06, + "loss": 0.5147, + "step": 755200 + }, + { + "epoch": 17.394307819839735, + "grad_norm": 3.1299068927764893, + "learning_rate": 4.1241764953971e-06, + "loss": 0.5193, + "step": 755400 + }, + { + "epoch": 17.398913143593994, + "grad_norm": 2.6893653869628906, + "learning_rate": 4.1169049315745845e-06, + "loss": 0.5246, + "step": 755600 + }, + { + "epoch": 17.403518467348256, + "grad_norm": 2.988776206970215, + "learning_rate": 4.1096333677520696e-06, + "loss": 0.5223, + "step": 755800 + }, + { + "epoch": 17.408123791102515, + "grad_norm": 3.235541820526123, + "learning_rate": 4.102361803929553e-06, + "loss": 0.5116, + "step": 756000 + }, + { + "epoch": 17.412729114856774, + "grad_norm": 3.031001091003418, + "learning_rate": 4.09512659792615e-06, + "loss": 0.5167, + "step": 756200 + }, + { + "epoch": 17.417334438611036, + "grad_norm": 2.9991252422332764, + "learning_rate": 4.087855034103634e-06, + "loss": 0.5125, + "step": 756400 + }, + { + "epoch": 17.421939762365295, + "grad_norm": 3.0221939086914062, + "learning_rate": 4.080583470281119e-06, + "loss": 0.5213, + "step": 756600 + }, + { + "epoch": 17.426545086119553, + "grad_norm": 2.6054844856262207, + "learning_rate": 4.073311906458603e-06, + "loss": 0.5255, + "step": 756800 + }, + { + "epoch": 17.431150409873815, + "grad_norm": 2.908795118331909, + "learning_rate": 4.066040342636087e-06, + "loss": 0.5223, + "step": 757000 + }, + { + "epoch": 17.435755733628074, + "grad_norm": 3.4502451419830322, + "learning_rate": 4.058768778813572e-06, + "loss": 0.5242, + "step": 757200 + }, + { + "epoch": 17.440361057382333, + "grad_norm": 3.335181474685669, + "learning_rate": 4.051497214991056e-06, + "loss": 0.5198, + "step": 757400 + }, + { + "epoch": 17.444966381136595, + "grad_norm": 3.5651845932006836, + "learning_rate": 4.044225651168541e-06, + "loss": 0.5176, + "step": 757600 + }, + { + "epoch": 17.449571704890854, + "grad_norm": 2.8494350910186768, + "learning_rate": 4.0369540873460245e-06, + "loss": 0.52, + "step": 757800 + }, + { + "epoch": 17.454177028645113, + "grad_norm": 2.9451777935028076, + "learning_rate": 4.029682523523509e-06, + "loss": 0.5285, + "step": 758000 + }, + { + "epoch": 17.458782352399375, + "grad_norm": 3.2299864292144775, + "learning_rate": 4.022410959700994e-06, + "loss": 0.5242, + "step": 758200 + }, + { + "epoch": 17.463387676153634, + "grad_norm": 2.959676742553711, + "learning_rate": 4.015139395878478e-06, + "loss": 0.5261, + "step": 758400 + }, + { + "epoch": 17.467992999907892, + "grad_norm": 3.600370407104492, + "learning_rate": 4.007867832055961e-06, + "loss": 0.5173, + "step": 758600 + }, + { + "epoch": 17.472598323662154, + "grad_norm": 2.899125814437866, + "learning_rate": 4.000596268233446e-06, + "loss": 0.5236, + "step": 758800 + }, + { + "epoch": 17.477203647416413, + "grad_norm": 3.0333738327026367, + "learning_rate": 3.993324704410931e-06, + "loss": 0.5209, + "step": 759000 + }, + { + "epoch": 17.481808971170672, + "grad_norm": 2.875760078430176, + "learning_rate": 3.986053140588416e-06, + "loss": 0.5118, + "step": 759200 + }, + { + "epoch": 17.486414294924934, + "grad_norm": 2.935258150100708, + "learning_rate": 3.978781576765899e-06, + "loss": 0.5157, + "step": 759400 + }, + { + "epoch": 17.491019618679193, + "grad_norm": 3.5687882900238037, + "learning_rate": 3.971510012943383e-06, + "loss": 0.5255, + "step": 759600 + }, + { + "epoch": 17.49562494243345, + "grad_norm": 3.5541269779205322, + "learning_rate": 3.96427480693998e-06, + "loss": 0.5209, + "step": 759800 + }, + { + "epoch": 17.500230266187714, + "grad_norm": 2.5105998516082764, + "learning_rate": 3.957003243117465e-06, + "loss": 0.5237, + "step": 760000 + }, + { + "epoch": 17.504835589941973, + "grad_norm": 3.4563724994659424, + "learning_rate": 3.9497316792949495e-06, + "loss": 0.5289, + "step": 760200 + }, + { + "epoch": 17.50944091369623, + "grad_norm": 4.190462112426758, + "learning_rate": 3.942460115472433e-06, + "loss": 0.5144, + "step": 760400 + }, + { + "epoch": 17.514046237450493, + "grad_norm": 3.6691291332244873, + "learning_rate": 3.935188551649918e-06, + "loss": 0.5129, + "step": 760600 + }, + { + "epoch": 17.518651561204752, + "grad_norm": 3.044640064239502, + "learning_rate": 3.927916987827402e-06, + "loss": 0.5197, + "step": 760800 + }, + { + "epoch": 17.52325688495901, + "grad_norm": 3.2396230697631836, + "learning_rate": 3.920645424004887e-06, + "loss": 0.5099, + "step": 761000 + }, + { + "epoch": 17.527862208713273, + "grad_norm": 3.3681046962738037, + "learning_rate": 3.9133738601823706e-06, + "loss": 0.5258, + "step": 761200 + }, + { + "epoch": 17.532467532467532, + "grad_norm": 3.473139524459839, + "learning_rate": 3.906102296359855e-06, + "loss": 0.5112, + "step": 761400 + }, + { + "epoch": 17.53707285622179, + "grad_norm": 3.2254271507263184, + "learning_rate": 3.89883073253734e-06, + "loss": 0.5194, + "step": 761600 + }, + { + "epoch": 17.541678179976053, + "grad_norm": 3.007357120513916, + "learning_rate": 3.891559168714824e-06, + "loss": 0.5226, + "step": 761800 + }, + { + "epoch": 17.54628350373031, + "grad_norm": 3.1616954803466797, + "learning_rate": 3.884287604892308e-06, + "loss": 0.5195, + "step": 762000 + }, + { + "epoch": 17.550888827484574, + "grad_norm": 3.4159657955169678, + "learning_rate": 3.8770160410697925e-06, + "loss": 0.5196, + "step": 762200 + }, + { + "epoch": 17.555494151238833, + "grad_norm": 2.448462724685669, + "learning_rate": 3.869744477247277e-06, + "loss": 0.5221, + "step": 762400 + }, + { + "epoch": 17.56009947499309, + "grad_norm": 3.6879563331604004, + "learning_rate": 3.862472913424762e-06, + "loss": 0.5182, + "step": 762600 + }, + { + "epoch": 17.564704798747353, + "grad_norm": 3.100130796432495, + "learning_rate": 3.855201349602246e-06, + "loss": 0.5108, + "step": 762800 + }, + { + "epoch": 17.569310122501612, + "grad_norm": 3.8761160373687744, + "learning_rate": 3.847929785779729e-06, + "loss": 0.529, + "step": 763000 + }, + { + "epoch": 17.57391544625587, + "grad_norm": 3.630197525024414, + "learning_rate": 3.8406582219572145e-06, + "loss": 0.5229, + "step": 763200 + }, + { + "epoch": 17.578520770010133, + "grad_norm": 3.126621961593628, + "learning_rate": 3.833386658134699e-06, + "loss": 0.5199, + "step": 763400 + }, + { + "epoch": 17.583126093764392, + "grad_norm": 2.9933745861053467, + "learning_rate": 3.826115094312183e-06, + "loss": 0.5093, + "step": 763600 + }, + { + "epoch": 17.58773141751865, + "grad_norm": 3.0438787937164307, + "learning_rate": 3.818843530489667e-06, + "loss": 0.5211, + "step": 763800 + }, + { + "epoch": 17.592336741272913, + "grad_norm": 3.802643299102783, + "learning_rate": 3.8115719666671513e-06, + "loss": 0.5248, + "step": 764000 + }, + { + "epoch": 17.59694206502717, + "grad_norm": 4.053820610046387, + "learning_rate": 3.8043004028446364e-06, + "loss": 0.5208, + "step": 764200 + }, + { + "epoch": 17.60154738878143, + "grad_norm": 2.8927087783813477, + "learning_rate": 3.7970651968412333e-06, + "loss": 0.5216, + "step": 764400 + }, + { + "epoch": 17.606152712535692, + "grad_norm": 3.9510834217071533, + "learning_rate": 3.789793633018717e-06, + "loss": 0.5174, + "step": 764600 + }, + { + "epoch": 17.61075803628995, + "grad_norm": 2.498469114303589, + "learning_rate": 3.7825220691962013e-06, + "loss": 0.5162, + "step": 764800 + }, + { + "epoch": 17.61536336004421, + "grad_norm": 3.019011974334717, + "learning_rate": 3.775250505373686e-06, + "loss": 0.525, + "step": 765000 + }, + { + "epoch": 17.619968683798472, + "grad_norm": 3.5621178150177, + "learning_rate": 3.76797894155117e-06, + "loss": 0.5176, + "step": 765200 + }, + { + "epoch": 17.62457400755273, + "grad_norm": 3.7816877365112305, + "learning_rate": 3.760707377728654e-06, + "loss": 0.5132, + "step": 765400 + }, + { + "epoch": 17.62917933130699, + "grad_norm": 2.997817277908325, + "learning_rate": 3.753435813906139e-06, + "loss": 0.5202, + "step": 765600 + }, + { + "epoch": 17.633784655061252, + "grad_norm": 3.401996374130249, + "learning_rate": 3.746164250083623e-06, + "loss": 0.513, + "step": 765800 + }, + { + "epoch": 17.63838997881551, + "grad_norm": 3.3747670650482178, + "learning_rate": 3.7388926862611075e-06, + "loss": 0.5162, + "step": 766000 + }, + { + "epoch": 17.64299530256977, + "grad_norm": 3.419926404953003, + "learning_rate": 3.7316211224385917e-06, + "loss": 0.5122, + "step": 766200 + }, + { + "epoch": 17.64760062632403, + "grad_norm": 3.317556381225586, + "learning_rate": 3.7243859164351886e-06, + "loss": 0.5277, + "step": 766400 + }, + { + "epoch": 17.65220595007829, + "grad_norm": 3.193814516067505, + "learning_rate": 3.7171143526126733e-06, + "loss": 0.5075, + "step": 766600 + }, + { + "epoch": 17.65681127383255, + "grad_norm": 2.69360089302063, + "learning_rate": 3.709842788790157e-06, + "loss": 0.519, + "step": 766800 + }, + { + "epoch": 17.66141659758681, + "grad_norm": 2.8568925857543945, + "learning_rate": 3.7025712249676417e-06, + "loss": 0.5266, + "step": 767000 + }, + { + "epoch": 17.66602192134107, + "grad_norm": 3.275151491165161, + "learning_rate": 3.695299661145126e-06, + "loss": 0.522, + "step": 767200 + }, + { + "epoch": 17.67062724509533, + "grad_norm": 3.3356289863586426, + "learning_rate": 3.6880280973226106e-06, + "loss": 0.5257, + "step": 767400 + }, + { + "epoch": 17.67523256884959, + "grad_norm": 3.011615514755249, + "learning_rate": 3.6807565335000943e-06, + "loss": 0.5226, + "step": 767600 + }, + { + "epoch": 17.67983789260385, + "grad_norm": 2.5866901874542236, + "learning_rate": 3.673484969677579e-06, + "loss": 0.5196, + "step": 767800 + }, + { + "epoch": 17.684443216358112, + "grad_norm": 3.7336740493774414, + "learning_rate": 3.666213405855063e-06, + "loss": 0.5119, + "step": 768000 + }, + { + "epoch": 17.68904854011237, + "grad_norm": 2.9361438751220703, + "learning_rate": 3.658941842032548e-06, + "loss": 0.5093, + "step": 768200 + }, + { + "epoch": 17.69365386386663, + "grad_norm": 3.3662490844726562, + "learning_rate": 3.6516702782100317e-06, + "loss": 0.5179, + "step": 768400 + }, + { + "epoch": 17.69825918762089, + "grad_norm": 3.797208547592163, + "learning_rate": 3.6444350722066286e-06, + "loss": 0.5157, + "step": 768600 + }, + { + "epoch": 17.70286451137515, + "grad_norm": 3.147529363632202, + "learning_rate": 3.637163508384113e-06, + "loss": 0.5324, + "step": 768800 + }, + { + "epoch": 17.70746983512941, + "grad_norm": 3.4828877449035645, + "learning_rate": 3.62992830238071e-06, + "loss": 0.5217, + "step": 769000 + }, + { + "epoch": 17.71207515888367, + "grad_norm": 3.616196393966675, + "learning_rate": 3.6226567385581943e-06, + "loss": 0.5199, + "step": 769200 + }, + { + "epoch": 17.71668048263793, + "grad_norm": 2.796513319015503, + "learning_rate": 3.615385174735679e-06, + "loss": 0.5121, + "step": 769400 + }, + { + "epoch": 17.72128580639219, + "grad_norm": 2.718034029006958, + "learning_rate": 3.6081136109131628e-06, + "loss": 0.5194, + "step": 769600 + }, + { + "epoch": 17.72589113014645, + "grad_norm": 3.4165828227996826, + "learning_rate": 3.6008784049097597e-06, + "loss": 0.5348, + "step": 769800 + }, + { + "epoch": 17.73049645390071, + "grad_norm": 3.0737781524658203, + "learning_rate": 3.5936068410872443e-06, + "loss": 0.517, + "step": 770000 + }, + { + "epoch": 17.735101777654968, + "grad_norm": 3.283372163772583, + "learning_rate": 3.5863352772647285e-06, + "loss": 0.5262, + "step": 770200 + }, + { + "epoch": 17.73970710140923, + "grad_norm": 2.618170738220215, + "learning_rate": 3.579063713442213e-06, + "loss": 0.5252, + "step": 770400 + }, + { + "epoch": 17.74431242516349, + "grad_norm": 3.1445319652557373, + "learning_rate": 3.571792149619697e-06, + "loss": 0.5196, + "step": 770600 + }, + { + "epoch": 17.748917748917748, + "grad_norm": 3.0439717769622803, + "learning_rate": 3.5645205857971816e-06, + "loss": 0.5282, + "step": 770800 + }, + { + "epoch": 17.75352307267201, + "grad_norm": 2.830927610397339, + "learning_rate": 3.557249021974666e-06, + "loss": 0.517, + "step": 771000 + }, + { + "epoch": 17.75812839642627, + "grad_norm": 2.9426324367523193, + "learning_rate": 3.5499774581521505e-06, + "loss": 0.5185, + "step": 771200 + }, + { + "epoch": 17.762733720180528, + "grad_norm": 3.1209681034088135, + "learning_rate": 3.5427058943296343e-06, + "loss": 0.5221, + "step": 771400 + }, + { + "epoch": 17.76733904393479, + "grad_norm": 2.9420995712280273, + "learning_rate": 3.535434330507119e-06, + "loss": 0.5119, + "step": 771600 + }, + { + "epoch": 17.77194436768905, + "grad_norm": 2.995070219039917, + "learning_rate": 3.5281627666846036e-06, + "loss": 0.5254, + "step": 771800 + }, + { + "epoch": 17.776549691443307, + "grad_norm": 3.6873717308044434, + "learning_rate": 3.5208912028620878e-06, + "loss": 0.527, + "step": 772000 + }, + { + "epoch": 17.78115501519757, + "grad_norm": 2.82792592048645, + "learning_rate": 3.513619639039572e-06, + "loss": 0.5208, + "step": 772200 + }, + { + "epoch": 17.785760338951828, + "grad_norm": 2.409423351287842, + "learning_rate": 3.5063480752170562e-06, + "loss": 0.526, + "step": 772400 + }, + { + "epoch": 17.790365662706087, + "grad_norm": 2.7182860374450684, + "learning_rate": 3.499076511394541e-06, + "loss": 0.5197, + "step": 772600 + }, + { + "epoch": 17.79497098646035, + "grad_norm": 4.183868885040283, + "learning_rate": 3.491804947572025e-06, + "loss": 0.5077, + "step": 772800 + }, + { + "epoch": 17.799576310214608, + "grad_norm": 3.585371971130371, + "learning_rate": 3.4845333837495093e-06, + "loss": 0.5102, + "step": 773000 + }, + { + "epoch": 17.804181633968867, + "grad_norm": 3.362762451171875, + "learning_rate": 3.4772618199269935e-06, + "loss": 0.5208, + "step": 773200 + }, + { + "epoch": 17.80878695772313, + "grad_norm": 3.437537431716919, + "learning_rate": 3.469990256104478e-06, + "loss": 0.531, + "step": 773400 + }, + { + "epoch": 17.813392281477388, + "grad_norm": 3.5331273078918457, + "learning_rate": 3.4627186922819624e-06, + "loss": 0.504, + "step": 773600 + }, + { + "epoch": 17.817997605231646, + "grad_norm": 2.810429811477661, + "learning_rate": 3.4554471284594466e-06, + "loss": 0.5231, + "step": 773800 + }, + { + "epoch": 17.82260292898591, + "grad_norm": 2.8518621921539307, + "learning_rate": 3.448175564636931e-06, + "loss": 0.5173, + "step": 774000 + }, + { + "epoch": 17.827208252740167, + "grad_norm": 4.182199478149414, + "learning_rate": 3.4409040008144155e-06, + "loss": 0.5206, + "step": 774200 + }, + { + "epoch": 17.831813576494426, + "grad_norm": 2.786973237991333, + "learning_rate": 3.4336324369918997e-06, + "loss": 0.5256, + "step": 774400 + }, + { + "epoch": 17.836418900248688, + "grad_norm": 4.052456855773926, + "learning_rate": 3.426360873169384e-06, + "loss": 0.5245, + "step": 774600 + }, + { + "epoch": 17.841024224002947, + "grad_norm": 3.5962717533111572, + "learning_rate": 3.419089309346868e-06, + "loss": 0.5331, + "step": 774800 + }, + { + "epoch": 17.845629547757206, + "grad_norm": 3.3177649974823, + "learning_rate": 3.4118177455243528e-06, + "loss": 0.5154, + "step": 775000 + }, + { + "epoch": 17.850234871511468, + "grad_norm": 2.6768603324890137, + "learning_rate": 3.4045461817018366e-06, + "loss": 0.5156, + "step": 775200 + }, + { + "epoch": 17.854840195265727, + "grad_norm": 3.424499034881592, + "learning_rate": 3.397274617879321e-06, + "loss": 0.5172, + "step": 775400 + }, + { + "epoch": 17.85944551901999, + "grad_norm": 2.6786439418792725, + "learning_rate": 3.3900030540568054e-06, + "loss": 0.5164, + "step": 775600 + }, + { + "epoch": 17.864050842774247, + "grad_norm": 4.423825740814209, + "learning_rate": 3.38273149023429e-06, + "loss": 0.5096, + "step": 775800 + }, + { + "epoch": 17.868656166528506, + "grad_norm": 3.1935222148895264, + "learning_rate": 3.375459926411774e-06, + "loss": 0.5238, + "step": 776000 + }, + { + "epoch": 17.87326149028277, + "grad_norm": 3.05393648147583, + "learning_rate": 3.3681883625892585e-06, + "loss": 0.5086, + "step": 776200 + }, + { + "epoch": 17.877866814037027, + "grad_norm": 2.8868277072906494, + "learning_rate": 3.3609167987667427e-06, + "loss": 0.5192, + "step": 776400 + }, + { + "epoch": 17.882472137791286, + "grad_norm": 3.3218915462493896, + "learning_rate": 3.3536815927633396e-06, + "loss": 0.5108, + "step": 776600 + }, + { + "epoch": 17.887077461545548, + "grad_norm": 3.428025007247925, + "learning_rate": 3.3464100289408243e-06, + "loss": 0.5158, + "step": 776800 + }, + { + "epoch": 17.891682785299807, + "grad_norm": 2.7553398609161377, + "learning_rate": 3.339138465118308e-06, + "loss": 0.512, + "step": 777000 + }, + { + "epoch": 17.896288109054066, + "grad_norm": 3.779360294342041, + "learning_rate": 3.3318669012957927e-06, + "loss": 0.5345, + "step": 777200 + }, + { + "epoch": 17.900893432808328, + "grad_norm": 4.041235446929932, + "learning_rate": 3.324595337473277e-06, + "loss": 0.5183, + "step": 777400 + }, + { + "epoch": 17.905498756562586, + "grad_norm": 3.0583138465881348, + "learning_rate": 3.3173237736507616e-06, + "loss": 0.5256, + "step": 777600 + }, + { + "epoch": 17.910104080316845, + "grad_norm": 2.8636226654052734, + "learning_rate": 3.3100522098282454e-06, + "loss": 0.5244, + "step": 777800 + }, + { + "epoch": 17.914709404071107, + "grad_norm": 3.2224411964416504, + "learning_rate": 3.30278064600573e-06, + "loss": 0.5183, + "step": 778000 + }, + { + "epoch": 17.919314727825366, + "grad_norm": 3.5364627838134766, + "learning_rate": 3.2955090821832142e-06, + "loss": 0.5119, + "step": 778200 + }, + { + "epoch": 17.923920051579625, + "grad_norm": 3.234290361404419, + "learning_rate": 3.288237518360699e-06, + "loss": 0.5245, + "step": 778400 + }, + { + "epoch": 17.928525375333887, + "grad_norm": 2.7013673782348633, + "learning_rate": 3.2809659545381827e-06, + "loss": 0.5182, + "step": 778600 + }, + { + "epoch": 17.933130699088146, + "grad_norm": 3.9952573776245117, + "learning_rate": 3.2736943907156673e-06, + "loss": 0.5145, + "step": 778800 + }, + { + "epoch": 17.937736022842405, + "grad_norm": 3.0978848934173584, + "learning_rate": 3.2664228268931515e-06, + "loss": 0.5202, + "step": 779000 + }, + { + "epoch": 17.942341346596667, + "grad_norm": 3.4453442096710205, + "learning_rate": 3.2591876208897484e-06, + "loss": 0.5161, + "step": 779200 + }, + { + "epoch": 17.946946670350926, + "grad_norm": 2.99537992477417, + "learning_rate": 3.251916057067233e-06, + "loss": 0.5138, + "step": 779400 + }, + { + "epoch": 17.951551994105184, + "grad_norm": 2.7019853591918945, + "learning_rate": 3.244644493244717e-06, + "loss": 0.5056, + "step": 779600 + }, + { + "epoch": 17.956157317859446, + "grad_norm": 3.723837375640869, + "learning_rate": 3.2373729294222015e-06, + "loss": 0.5151, + "step": 779800 + }, + { + "epoch": 17.960762641613705, + "grad_norm": 2.7109344005584717, + "learning_rate": 3.230101365599686e-06, + "loss": 0.5082, + "step": 780000 + }, + { + "epoch": 17.965367965367964, + "grad_norm": 3.6867780685424805, + "learning_rate": 3.2228298017771704e-06, + "loss": 0.5128, + "step": 780200 + }, + { + "epoch": 17.969973289122226, + "grad_norm": 3.721771478652954, + "learning_rate": 3.2155945957737673e-06, + "loss": 0.5192, + "step": 780400 + }, + { + "epoch": 17.974578612876485, + "grad_norm": 2.7441351413726807, + "learning_rate": 3.2083230319512515e-06, + "loss": 0.5246, + "step": 780600 + }, + { + "epoch": 17.979183936630744, + "grad_norm": 3.119499921798706, + "learning_rate": 3.2010514681287357e-06, + "loss": 0.519, + "step": 780800 + }, + { + "epoch": 17.983789260385006, + "grad_norm": 5.068978786468506, + "learning_rate": 3.1937799043062204e-06, + "loss": 0.5247, + "step": 781000 + }, + { + "epoch": 17.988394584139265, + "grad_norm": 2.891235828399658, + "learning_rate": 3.1865083404837046e-06, + "loss": 0.5135, + "step": 781200 + }, + { + "epoch": 17.992999907893523, + "grad_norm": 3.192763566970825, + "learning_rate": 3.179236776661189e-06, + "loss": 0.5187, + "step": 781400 + }, + { + "epoch": 17.997605231647785, + "grad_norm": 3.4980311393737793, + "learning_rate": 3.171965212838673e-06, + "loss": 0.5234, + "step": 781600 + }, + { + "epoch": 18.0, + "eval_loss": 0.5081140398979187, + "eval_runtime": 170.1048, + "eval_samples_per_second": 166.727, + "eval_steps_per_second": 10.423, + "step": 781704 + }, + { + "epoch": 18.002210555402044, + "grad_norm": 2.9518814086914062, + "learning_rate": 3.1646936490161577e-06, + "loss": 0.5216, + "step": 781800 + }, + { + "epoch": 18.006815879156306, + "grad_norm": 2.917123794555664, + "learning_rate": 3.157422085193642e-06, + "loss": 0.5168, + "step": 782000 + }, + { + "epoch": 18.011421202910565, + "grad_norm": 3.122274875640869, + "learning_rate": 3.150150521371126e-06, + "loss": 0.511, + "step": 782200 + }, + { + "epoch": 18.016026526664824, + "grad_norm": 3.571920394897461, + "learning_rate": 3.1428789575486103e-06, + "loss": 0.5183, + "step": 782400 + }, + { + "epoch": 18.020631850419086, + "grad_norm": 3.9012715816497803, + "learning_rate": 3.135607393726095e-06, + "loss": 0.5222, + "step": 782600 + }, + { + "epoch": 18.025237174173345, + "grad_norm": 3.0531198978424072, + "learning_rate": 3.128335829903579e-06, + "loss": 0.5064, + "step": 782800 + }, + { + "epoch": 18.029842497927604, + "grad_norm": 3.265467405319214, + "learning_rate": 3.121100623900176e-06, + "loss": 0.5119, + "step": 783000 + }, + { + "epoch": 18.034447821681866, + "grad_norm": 2.407761335372925, + "learning_rate": 3.1138290600776603e-06, + "loss": 0.5109, + "step": 783200 + }, + { + "epoch": 18.039053145436124, + "grad_norm": 3.1489205360412598, + "learning_rate": 3.1065574962551445e-06, + "loss": 0.509, + "step": 783400 + }, + { + "epoch": 18.043658469190383, + "grad_norm": 3.7624034881591797, + "learning_rate": 3.099285932432629e-06, + "loss": 0.5208, + "step": 783600 + }, + { + "epoch": 18.048263792944645, + "grad_norm": 3.1477630138397217, + "learning_rate": 3.0920143686101134e-06, + "loss": 0.5228, + "step": 783800 + }, + { + "epoch": 18.052869116698904, + "grad_norm": 3.3497445583343506, + "learning_rate": 3.0847428047875976e-06, + "loss": 0.5191, + "step": 784000 + }, + { + "epoch": 18.057474440453163, + "grad_norm": 2.5725488662719727, + "learning_rate": 3.077471240965082e-06, + "loss": 0.5346, + "step": 784200 + }, + { + "epoch": 18.062079764207425, + "grad_norm": 3.2691822052001953, + "learning_rate": 3.0701996771425665e-06, + "loss": 0.5029, + "step": 784400 + }, + { + "epoch": 18.066685087961684, + "grad_norm": 3.542738914489746, + "learning_rate": 3.0629281133200507e-06, + "loss": 0.5097, + "step": 784600 + }, + { + "epoch": 18.071290411715943, + "grad_norm": 3.2178702354431152, + "learning_rate": 3.055656549497535e-06, + "loss": 0.5153, + "step": 784800 + }, + { + "epoch": 18.075895735470205, + "grad_norm": 3.4788858890533447, + "learning_rate": 3.048384985675019e-06, + "loss": 0.5136, + "step": 785000 + }, + { + "epoch": 18.080501059224463, + "grad_norm": 2.703796625137329, + "learning_rate": 3.041113421852504e-06, + "loss": 0.525, + "step": 785200 + }, + { + "epoch": 18.085106382978722, + "grad_norm": 3.2644095420837402, + "learning_rate": 3.033841858029988e-06, + "loss": 0.5144, + "step": 785400 + }, + { + "epoch": 18.089711706732984, + "grad_norm": 3.0157806873321533, + "learning_rate": 3.0265702942074722e-06, + "loss": 0.5259, + "step": 785600 + }, + { + "epoch": 18.094317030487243, + "grad_norm": 3.0737905502319336, + "learning_rate": 3.0192987303849565e-06, + "loss": 0.5122, + "step": 785800 + }, + { + "epoch": 18.098922354241502, + "grad_norm": 2.9035747051239014, + "learning_rate": 3.012027166562441e-06, + "loss": 0.5151, + "step": 786000 + }, + { + "epoch": 18.103527677995764, + "grad_norm": 2.6687934398651123, + "learning_rate": 3.0047556027399253e-06, + "loss": 0.516, + "step": 786200 + }, + { + "epoch": 18.108133001750023, + "grad_norm": 3.1634881496429443, + "learning_rate": 2.9974840389174095e-06, + "loss": 0.5203, + "step": 786400 + }, + { + "epoch": 18.11273832550428, + "grad_norm": 2.5920820236206055, + "learning_rate": 2.9902124750948938e-06, + "loss": 0.5052, + "step": 786600 + }, + { + "epoch": 18.117343649258544, + "grad_norm": 3.145585536956787, + "learning_rate": 2.9829409112723784e-06, + "loss": 0.509, + "step": 786800 + }, + { + "epoch": 18.121948973012802, + "grad_norm": 3.541355609893799, + "learning_rate": 2.9756693474498626e-06, + "loss": 0.5237, + "step": 787000 + }, + { + "epoch": 18.12655429676706, + "grad_norm": 2.4166955947875977, + "learning_rate": 2.968397783627347e-06, + "loss": 0.508, + "step": 787200 + }, + { + "epoch": 18.131159620521323, + "grad_norm": 3.0432989597320557, + "learning_rate": 2.961126219804831e-06, + "loss": 0.5131, + "step": 787400 + }, + { + "epoch": 18.135764944275582, + "grad_norm": 2.8021647930145264, + "learning_rate": 2.9538546559823157e-06, + "loss": 0.5154, + "step": 787600 + }, + { + "epoch": 18.14037026802984, + "grad_norm": 4.511200428009033, + "learning_rate": 2.9465830921598e-06, + "loss": 0.5155, + "step": 787800 + }, + { + "epoch": 18.144975591784103, + "grad_norm": 3.441237449645996, + "learning_rate": 2.939311528337284e-06, + "loss": 0.5131, + "step": 788000 + }, + { + "epoch": 18.149580915538362, + "grad_norm": 3.7599685192108154, + "learning_rate": 2.9320399645147688e-06, + "loss": 0.5169, + "step": 788200 + }, + { + "epoch": 18.15418623929262, + "grad_norm": 3.4300057888031006, + "learning_rate": 2.9248047585113653e-06, + "loss": 0.5213, + "step": 788400 + }, + { + "epoch": 18.158791563046883, + "grad_norm": 3.6189639568328857, + "learning_rate": 2.91753319468885e-06, + "loss": 0.5173, + "step": 788600 + }, + { + "epoch": 18.16339688680114, + "grad_norm": 2.8627123832702637, + "learning_rate": 2.9102616308663346e-06, + "loss": 0.5261, + "step": 788800 + }, + { + "epoch": 18.168002210555404, + "grad_norm": 3.8576009273529053, + "learning_rate": 2.9029900670438183e-06, + "loss": 0.52, + "step": 789000 + }, + { + "epoch": 18.172607534309662, + "grad_norm": 3.4420907497406006, + "learning_rate": 2.895718503221303e-06, + "loss": 0.5262, + "step": 789200 + }, + { + "epoch": 18.17721285806392, + "grad_norm": 3.3978118896484375, + "learning_rate": 2.888446939398787e-06, + "loss": 0.5168, + "step": 789400 + }, + { + "epoch": 18.181818181818183, + "grad_norm": 2.9462685585021973, + "learning_rate": 2.881175375576272e-06, + "loss": 0.5105, + "step": 789600 + }, + { + "epoch": 18.186423505572442, + "grad_norm": 2.9186863899230957, + "learning_rate": 2.8739038117537556e-06, + "loss": 0.5146, + "step": 789800 + }, + { + "epoch": 18.1910288293267, + "grad_norm": 2.7235682010650635, + "learning_rate": 2.8666322479312403e-06, + "loss": 0.5206, + "step": 790000 + }, + { + "epoch": 18.195634153080963, + "grad_norm": 3.430112361907959, + "learning_rate": 2.8593606841087245e-06, + "loss": 0.5105, + "step": 790200 + }, + { + "epoch": 18.200239476835222, + "grad_norm": 3.3343279361724854, + "learning_rate": 2.852089120286209e-06, + "loss": 0.5144, + "step": 790400 + }, + { + "epoch": 18.20484480058948, + "grad_norm": 3.0267772674560547, + "learning_rate": 2.844817556463693e-06, + "loss": 0.5233, + "step": 790600 + }, + { + "epoch": 18.209450124343743, + "grad_norm": 3.3748552799224854, + "learning_rate": 2.8375459926411776e-06, + "loss": 0.5131, + "step": 790800 + }, + { + "epoch": 18.214055448098, + "grad_norm": 2.5174360275268555, + "learning_rate": 2.830274428818662e-06, + "loss": 0.5182, + "step": 791000 + }, + { + "epoch": 18.21866077185226, + "grad_norm": 3.363555669784546, + "learning_rate": 2.8230028649961465e-06, + "loss": 0.531, + "step": 791200 + }, + { + "epoch": 18.223266095606522, + "grad_norm": 2.635622024536133, + "learning_rate": 2.8157313011736302e-06, + "loss": 0.5156, + "step": 791400 + }, + { + "epoch": 18.22787141936078, + "grad_norm": 3.1038053035736084, + "learning_rate": 2.808459737351115e-06, + "loss": 0.52, + "step": 791600 + }, + { + "epoch": 18.23247674311504, + "grad_norm": 3.0769765377044678, + "learning_rate": 2.801188173528599e-06, + "loss": 0.5156, + "step": 791800 + }, + { + "epoch": 18.237082066869302, + "grad_norm": 2.5016119480133057, + "learning_rate": 2.7939166097060838e-06, + "loss": 0.5227, + "step": 792000 + }, + { + "epoch": 18.24168739062356, + "grad_norm": 3.1567609310150146, + "learning_rate": 2.7866450458835676e-06, + "loss": 0.5215, + "step": 792200 + }, + { + "epoch": 18.24629271437782, + "grad_norm": 3.602930784225464, + "learning_rate": 2.779373482061052e-06, + "loss": 0.5226, + "step": 792400 + }, + { + "epoch": 18.250898038132082, + "grad_norm": 2.699720859527588, + "learning_rate": 2.7721019182385364e-06, + "loss": 0.5195, + "step": 792600 + }, + { + "epoch": 18.25550336188634, + "grad_norm": 2.6667487621307373, + "learning_rate": 2.764830354416021e-06, + "loss": 0.5111, + "step": 792800 + }, + { + "epoch": 18.2601086856406, + "grad_norm": 3.1403419971466064, + "learning_rate": 2.757558790593505e-06, + "loss": 0.5139, + "step": 793000 + }, + { + "epoch": 18.26471400939486, + "grad_norm": 3.1445913314819336, + "learning_rate": 2.7502872267709895e-06, + "loss": 0.5169, + "step": 793200 + }, + { + "epoch": 18.26931933314912, + "grad_norm": 3.3420560359954834, + "learning_rate": 2.7430156629484737e-06, + "loss": 0.5188, + "step": 793400 + }, + { + "epoch": 18.27392465690338, + "grad_norm": 3.481611490249634, + "learning_rate": 2.7357804569450706e-06, + "loss": 0.51, + "step": 793600 + }, + { + "epoch": 18.27852998065764, + "grad_norm": 3.8973641395568848, + "learning_rate": 2.7285088931225553e-06, + "loss": 0.5136, + "step": 793800 + }, + { + "epoch": 18.2831353044119, + "grad_norm": 2.5481739044189453, + "learning_rate": 2.721237329300039e-06, + "loss": 0.5212, + "step": 794000 + }, + { + "epoch": 18.28774062816616, + "grad_norm": 2.917550802230835, + "learning_rate": 2.7139657654775237e-06, + "loss": 0.5216, + "step": 794200 + }, + { + "epoch": 18.29234595192042, + "grad_norm": 3.176706314086914, + "learning_rate": 2.706694201655008e-06, + "loss": 0.5028, + "step": 794400 + }, + { + "epoch": 18.29695127567468, + "grad_norm": 3.2188303470611572, + "learning_rate": 2.699458995651605e-06, + "loss": 0.52, + "step": 794600 + }, + { + "epoch": 18.301556599428938, + "grad_norm": 3.91021466255188, + "learning_rate": 2.6921874318290895e-06, + "loss": 0.5127, + "step": 794800 + }, + { + "epoch": 18.3061619231832, + "grad_norm": 3.8334686756134033, + "learning_rate": 2.6849158680065733e-06, + "loss": 0.5062, + "step": 795000 + }, + { + "epoch": 18.31076724693746, + "grad_norm": 3.018327236175537, + "learning_rate": 2.677644304184058e-06, + "loss": 0.5228, + "step": 795200 + }, + { + "epoch": 18.31537257069172, + "grad_norm": 4.303511619567871, + "learning_rate": 2.670372740361542e-06, + "loss": 0.5138, + "step": 795400 + }, + { + "epoch": 18.31997789444598, + "grad_norm": 3.589290142059326, + "learning_rate": 2.6631011765390268e-06, + "loss": 0.5281, + "step": 795600 + }, + { + "epoch": 18.32458321820024, + "grad_norm": 2.7188146114349365, + "learning_rate": 2.6558296127165106e-06, + "loss": 0.5093, + "step": 795800 + }, + { + "epoch": 18.3291885419545, + "grad_norm": 3.7704238891601562, + "learning_rate": 2.6485580488939952e-06, + "loss": 0.5106, + "step": 796000 + }, + { + "epoch": 18.33379386570876, + "grad_norm": 2.9038262367248535, + "learning_rate": 2.6412864850714794e-06, + "loss": 0.5047, + "step": 796200 + }, + { + "epoch": 18.33839918946302, + "grad_norm": 3.1424877643585205, + "learning_rate": 2.634014921248964e-06, + "loss": 0.5227, + "step": 796400 + }, + { + "epoch": 18.34300451321728, + "grad_norm": 3.6267952919006348, + "learning_rate": 2.626743357426448e-06, + "loss": 0.5094, + "step": 796600 + }, + { + "epoch": 18.34760983697154, + "grad_norm": 3.1225740909576416, + "learning_rate": 2.6194717936039325e-06, + "loss": 0.5222, + "step": 796800 + }, + { + "epoch": 18.352215160725798, + "grad_norm": 3.5148890018463135, + "learning_rate": 2.612200229781417e-06, + "loss": 0.5134, + "step": 797000 + }, + { + "epoch": 18.35682048448006, + "grad_norm": 3.0749411582946777, + "learning_rate": 2.6049286659589014e-06, + "loss": 0.5055, + "step": 797200 + }, + { + "epoch": 18.36142580823432, + "grad_norm": 3.1104702949523926, + "learning_rate": 2.5976571021363856e-06, + "loss": 0.5224, + "step": 797400 + }, + { + "epoch": 18.366031131988578, + "grad_norm": 4.1002020835876465, + "learning_rate": 2.59038553831387e-06, + "loss": 0.5125, + "step": 797600 + }, + { + "epoch": 18.37063645574284, + "grad_norm": 3.385798692703247, + "learning_rate": 2.5831139744913545e-06, + "loss": 0.5238, + "step": 797800 + }, + { + "epoch": 18.3752417794971, + "grad_norm": 2.667778253555298, + "learning_rate": 2.5758424106688383e-06, + "loss": 0.519, + "step": 798000 + }, + { + "epoch": 18.379847103251358, + "grad_norm": 3.486220121383667, + "learning_rate": 2.568570846846323e-06, + "loss": 0.5123, + "step": 798200 + }, + { + "epoch": 18.38445242700562, + "grad_norm": 2.9387404918670654, + "learning_rate": 2.56133564084292e-06, + "loss": 0.5075, + "step": 798400 + }, + { + "epoch": 18.38905775075988, + "grad_norm": 3.13200044631958, + "learning_rate": 2.554064077020404e-06, + "loss": 0.5107, + "step": 798600 + }, + { + "epoch": 18.393663074514137, + "grad_norm": 2.374018669128418, + "learning_rate": 2.5467925131978887e-06, + "loss": 0.5245, + "step": 798800 + }, + { + "epoch": 18.3982683982684, + "grad_norm": 3.3281686305999756, + "learning_rate": 2.539520949375373e-06, + "loss": 0.5111, + "step": 799000 + }, + { + "epoch": 18.402873722022658, + "grad_norm": 2.587111711502075, + "learning_rate": 2.532249385552857e-06, + "loss": 0.5144, + "step": 799200 + }, + { + "epoch": 18.407479045776917, + "grad_norm": 2.7880938053131104, + "learning_rate": 2.5249778217303413e-06, + "loss": 0.5129, + "step": 799400 + }, + { + "epoch": 18.41208436953118, + "grad_norm": 2.612704038619995, + "learning_rate": 2.517706257907826e-06, + "loss": 0.5169, + "step": 799600 + }, + { + "epoch": 18.416689693285438, + "grad_norm": 3.7261786460876465, + "learning_rate": 2.5104346940853098e-06, + "loss": 0.5128, + "step": 799800 + }, + { + "epoch": 18.421295017039697, + "grad_norm": 3.7250654697418213, + "learning_rate": 2.5031631302627944e-06, + "loss": 0.5108, + "step": 800000 + }, + { + "epoch": 18.42590034079396, + "grad_norm": 3.5736753940582275, + "learning_rate": 2.4958915664402786e-06, + "loss": 0.5307, + "step": 800200 + }, + { + "epoch": 18.430505664548217, + "grad_norm": 2.80611515045166, + "learning_rate": 2.4886200026177633e-06, + "loss": 0.5141, + "step": 800400 + }, + { + "epoch": 18.435110988302476, + "grad_norm": 3.6763579845428467, + "learning_rate": 2.481348438795247e-06, + "loss": 0.5067, + "step": 800600 + }, + { + "epoch": 18.43971631205674, + "grad_norm": 2.915642023086548, + "learning_rate": 2.4740768749727317e-06, + "loss": 0.5219, + "step": 800800 + }, + { + "epoch": 18.444321635810997, + "grad_norm": 4.065909385681152, + "learning_rate": 2.466805311150216e-06, + "loss": 0.5083, + "step": 801000 + }, + { + "epoch": 18.448926959565256, + "grad_norm": 3.1720340251922607, + "learning_rate": 2.4595337473277006e-06, + "loss": 0.5222, + "step": 801200 + }, + { + "epoch": 18.453532283319518, + "grad_norm": 2.6975619792938232, + "learning_rate": 2.4522621835051844e-06, + "loss": 0.5067, + "step": 801400 + }, + { + "epoch": 18.458137607073777, + "grad_norm": 2.7786128520965576, + "learning_rate": 2.444990619682669e-06, + "loss": 0.5181, + "step": 801600 + }, + { + "epoch": 18.462742930828036, + "grad_norm": 2.5183842182159424, + "learning_rate": 2.4377190558601532e-06, + "loss": 0.5166, + "step": 801800 + }, + { + "epoch": 18.467348254582298, + "grad_norm": 2.8333635330200195, + "learning_rate": 2.430447492037638e-06, + "loss": 0.5163, + "step": 802000 + }, + { + "epoch": 18.471953578336556, + "grad_norm": 3.31788969039917, + "learning_rate": 2.4231759282151217e-06, + "loss": 0.518, + "step": 802200 + }, + { + "epoch": 18.476558902090815, + "grad_norm": 3.1232898235321045, + "learning_rate": 2.4159043643926063e-06, + "loss": 0.5137, + "step": 802400 + }, + { + "epoch": 18.481164225845077, + "grad_norm": 3.5303895473480225, + "learning_rate": 2.4086691583892032e-06, + "loss": 0.5241, + "step": 802600 + }, + { + "epoch": 18.485769549599336, + "grad_norm": 2.972893714904785, + "learning_rate": 2.4013975945666874e-06, + "loss": 0.5078, + "step": 802800 + }, + { + "epoch": 18.4903748733536, + "grad_norm": 2.6240293979644775, + "learning_rate": 2.394126030744172e-06, + "loss": 0.5166, + "step": 803000 + }, + { + "epoch": 18.494980197107857, + "grad_norm": 2.6254336833953857, + "learning_rate": 2.386890824740769e-06, + "loss": 0.5195, + "step": 803200 + }, + { + "epoch": 18.499585520862116, + "grad_norm": 3.5183451175689697, + "learning_rate": 2.3796192609182528e-06, + "loss": 0.5125, + "step": 803400 + }, + { + "epoch": 18.504190844616378, + "grad_norm": 2.951514720916748, + "learning_rate": 2.3723476970957374e-06, + "loss": 0.515, + "step": 803600 + }, + { + "epoch": 18.508796168370637, + "grad_norm": 3.723506450653076, + "learning_rate": 2.3650761332732216e-06, + "loss": 0.5108, + "step": 803800 + }, + { + "epoch": 18.513401492124895, + "grad_norm": 2.8375089168548584, + "learning_rate": 2.3578045694507063e-06, + "loss": 0.5099, + "step": 804000 + }, + { + "epoch": 18.518006815879158, + "grad_norm": 2.5292952060699463, + "learning_rate": 2.35053300562819e-06, + "loss": 0.5176, + "step": 804200 + }, + { + "epoch": 18.522612139633416, + "grad_norm": 2.64688777923584, + "learning_rate": 2.3432614418056747e-06, + "loss": 0.5067, + "step": 804400 + }, + { + "epoch": 18.527217463387675, + "grad_norm": 3.6783273220062256, + "learning_rate": 2.335989877983159e-06, + "loss": 0.5204, + "step": 804600 + }, + { + "epoch": 18.531822787141937, + "grad_norm": 3.5552361011505127, + "learning_rate": 2.3287183141606436e-06, + "loss": 0.5043, + "step": 804800 + }, + { + "epoch": 18.536428110896196, + "grad_norm": 3.2236578464508057, + "learning_rate": 2.3214467503381274e-06, + "loss": 0.5089, + "step": 805000 + }, + { + "epoch": 18.541033434650455, + "grad_norm": 3.2282896041870117, + "learning_rate": 2.314175186515612e-06, + "loss": 0.5058, + "step": 805200 + }, + { + "epoch": 18.545638758404717, + "grad_norm": 2.5554604530334473, + "learning_rate": 2.3069036226930963e-06, + "loss": 0.5093, + "step": 805400 + }, + { + "epoch": 18.550244082158976, + "grad_norm": 3.2304649353027344, + "learning_rate": 2.299632058870581e-06, + "loss": 0.5146, + "step": 805600 + }, + { + "epoch": 18.554849405913235, + "grad_norm": 4.473756790161133, + "learning_rate": 2.292360495048065e-06, + "loss": 0.505, + "step": 805800 + }, + { + "epoch": 18.559454729667497, + "grad_norm": 3.0843703746795654, + "learning_rate": 2.2850889312255493e-06, + "loss": 0.5167, + "step": 806000 + }, + { + "epoch": 18.564060053421755, + "grad_norm": 3.773874044418335, + "learning_rate": 2.277817367403034e-06, + "loss": 0.5102, + "step": 806200 + }, + { + "epoch": 18.568665377176014, + "grad_norm": 3.845970630645752, + "learning_rate": 2.270545803580518e-06, + "loss": 0.5235, + "step": 806400 + }, + { + "epoch": 18.573270700930276, + "grad_norm": 3.6182453632354736, + "learning_rate": 2.2632742397580024e-06, + "loss": 0.5057, + "step": 806600 + }, + { + "epoch": 18.577876024684535, + "grad_norm": 3.738835573196411, + "learning_rate": 2.2560026759354866e-06, + "loss": 0.5207, + "step": 806800 + }, + { + "epoch": 18.582481348438794, + "grad_norm": 3.5557377338409424, + "learning_rate": 2.2487311121129713e-06, + "loss": 0.529, + "step": 807000 + }, + { + "epoch": 18.587086672193056, + "grad_norm": 3.059429168701172, + "learning_rate": 2.2414595482904555e-06, + "loss": 0.5075, + "step": 807200 + }, + { + "epoch": 18.591691995947315, + "grad_norm": 3.12056303024292, + "learning_rate": 2.2342243422870524e-06, + "loss": 0.5114, + "step": 807400 + }, + { + "epoch": 18.596297319701574, + "grad_norm": 3.1706645488739014, + "learning_rate": 2.2269527784645366e-06, + "loss": 0.5217, + "step": 807600 + }, + { + "epoch": 18.600902643455836, + "grad_norm": 3.850172281265259, + "learning_rate": 2.219681214642021e-06, + "loss": 0.5137, + "step": 807800 + }, + { + "epoch": 18.605507967210094, + "grad_norm": 3.080021858215332, + "learning_rate": 2.2124096508195055e-06, + "loss": 0.5139, + "step": 808000 + }, + { + "epoch": 18.610113290964353, + "grad_norm": 2.29182505607605, + "learning_rate": 2.2051380869969897e-06, + "loss": 0.5026, + "step": 808200 + }, + { + "epoch": 18.614718614718615, + "grad_norm": 3.1314103603363037, + "learning_rate": 2.197866523174474e-06, + "loss": 0.5123, + "step": 808400 + }, + { + "epoch": 18.619323938472874, + "grad_norm": 3.334752082824707, + "learning_rate": 2.190594959351958e-06, + "loss": 0.5101, + "step": 808600 + }, + { + "epoch": 18.623929262227133, + "grad_norm": 3.3216257095336914, + "learning_rate": 2.1833233955294428e-06, + "loss": 0.5094, + "step": 808800 + }, + { + "epoch": 18.628534585981395, + "grad_norm": 2.6570966243743896, + "learning_rate": 2.176051831706927e-06, + "loss": 0.5118, + "step": 809000 + }, + { + "epoch": 18.633139909735654, + "grad_norm": 3.783985137939453, + "learning_rate": 2.1687802678844112e-06, + "loss": 0.5226, + "step": 809200 + }, + { + "epoch": 18.637745233489916, + "grad_norm": 3.1615185737609863, + "learning_rate": 2.1615087040618954e-06, + "loss": 0.5194, + "step": 809400 + }, + { + "epoch": 18.642350557244175, + "grad_norm": 3.571136951446533, + "learning_rate": 2.15423714023938e-06, + "loss": 0.5248, + "step": 809600 + }, + { + "epoch": 18.646955880998433, + "grad_norm": 2.769298553466797, + "learning_rate": 2.147001934235977e-06, + "loss": 0.5104, + "step": 809800 + }, + { + "epoch": 18.651561204752696, + "grad_norm": 2.678037166595459, + "learning_rate": 2.1397303704134612e-06, + "loss": 0.5246, + "step": 810000 + }, + { + "epoch": 18.656166528506954, + "grad_norm": 3.228646993637085, + "learning_rate": 2.1324588065909454e-06, + "loss": 0.502, + "step": 810200 + }, + { + "epoch": 18.660771852261213, + "grad_norm": 2.839290142059326, + "learning_rate": 2.1252236005875423e-06, + "loss": 0.5132, + "step": 810400 + }, + { + "epoch": 18.665377176015475, + "grad_norm": 2.865377187728882, + "learning_rate": 2.1179520367650266e-06, + "loss": 0.5063, + "step": 810600 + }, + { + "epoch": 18.669982499769734, + "grad_norm": 4.583608150482178, + "learning_rate": 2.110680472942511e-06, + "loss": 0.5155, + "step": 810800 + }, + { + "epoch": 18.674587823523993, + "grad_norm": 2.7023823261260986, + "learning_rate": 2.1034089091199954e-06, + "loss": 0.5033, + "step": 811000 + }, + { + "epoch": 18.679193147278255, + "grad_norm": 2.9620158672332764, + "learning_rate": 2.0961373452974796e-06, + "loss": 0.5184, + "step": 811200 + }, + { + "epoch": 18.683798471032514, + "grad_norm": 2.519291639328003, + "learning_rate": 2.088865781474964e-06, + "loss": 0.5126, + "step": 811400 + }, + { + "epoch": 18.688403794786772, + "grad_norm": 2.743424415588379, + "learning_rate": 2.0815942176524485e-06, + "loss": 0.509, + "step": 811600 + }, + { + "epoch": 18.693009118541035, + "grad_norm": 3.654670000076294, + "learning_rate": 2.0743226538299327e-06, + "loss": 0.5155, + "step": 811800 + }, + { + "epoch": 18.697614442295293, + "grad_norm": 3.285017251968384, + "learning_rate": 2.067051090007417e-06, + "loss": 0.5126, + "step": 812000 + }, + { + "epoch": 18.702219766049552, + "grad_norm": 2.268007755279541, + "learning_rate": 2.059779526184901e-06, + "loss": 0.5104, + "step": 812200 + }, + { + "epoch": 18.706825089803814, + "grad_norm": 3.560014247894287, + "learning_rate": 2.052507962362386e-06, + "loss": 0.5294, + "step": 812400 + }, + { + "epoch": 18.711430413558073, + "grad_norm": 2.835283041000366, + "learning_rate": 2.04523639853987e-06, + "loss": 0.5132, + "step": 812600 + }, + { + "epoch": 18.716035737312332, + "grad_norm": 2.768789291381836, + "learning_rate": 2.0379648347173542e-06, + "loss": 0.5094, + "step": 812800 + }, + { + "epoch": 18.720641061066594, + "grad_norm": 2.965252637863159, + "learning_rate": 2.0306932708948385e-06, + "loss": 0.5031, + "step": 813000 + }, + { + "epoch": 18.725246384820853, + "grad_norm": 3.548333168029785, + "learning_rate": 2.023421707072323e-06, + "loss": 0.5152, + "step": 813200 + }, + { + "epoch": 18.72985170857511, + "grad_norm": 2.307636260986328, + "learning_rate": 2.0161501432498073e-06, + "loss": 0.5115, + "step": 813400 + }, + { + "epoch": 18.734457032329374, + "grad_norm": 3.0482430458068848, + "learning_rate": 2.0088785794272915e-06, + "loss": 0.5234, + "step": 813600 + }, + { + "epoch": 18.739062356083632, + "grad_norm": 2.7281389236450195, + "learning_rate": 2.0016070156047758e-06, + "loss": 0.5183, + "step": 813800 + }, + { + "epoch": 18.74366767983789, + "grad_norm": 3.274071216583252, + "learning_rate": 1.9943354517822604e-06, + "loss": 0.5144, + "step": 814000 + }, + { + "epoch": 18.748273003592153, + "grad_norm": 4.017879486083984, + "learning_rate": 1.9870638879597446e-06, + "loss": 0.5162, + "step": 814200 + }, + { + "epoch": 18.752878327346412, + "grad_norm": 3.111361265182495, + "learning_rate": 1.9798286819563415e-06, + "loss": 0.5183, + "step": 814400 + }, + { + "epoch": 18.75748365110067, + "grad_norm": 2.9756317138671875, + "learning_rate": 1.9725571181338258e-06, + "loss": 0.516, + "step": 814600 + }, + { + "epoch": 18.762088974854933, + "grad_norm": 2.925050973892212, + "learning_rate": 1.96528555431131e-06, + "loss": 0.5216, + "step": 814800 + }, + { + "epoch": 18.766694298609192, + "grad_norm": 3.605431079864502, + "learning_rate": 1.9580139904887946e-06, + "loss": 0.5169, + "step": 815000 + }, + { + "epoch": 18.77129962236345, + "grad_norm": 4.39565372467041, + "learning_rate": 1.9507424266662793e-06, + "loss": 0.5192, + "step": 815200 + }, + { + "epoch": 18.775904946117713, + "grad_norm": 2.9125242233276367, + "learning_rate": 1.943470862843763e-06, + "loss": 0.5228, + "step": 815400 + }, + { + "epoch": 18.78051026987197, + "grad_norm": 2.8098058700561523, + "learning_rate": 1.9361992990212477e-06, + "loss": 0.5089, + "step": 815600 + }, + { + "epoch": 18.78511559362623, + "grad_norm": 3.1895477771759033, + "learning_rate": 1.928927735198732e-06, + "loss": 0.5062, + "step": 815800 + }, + { + "epoch": 18.789720917380492, + "grad_norm": 3.4902753829956055, + "learning_rate": 1.9216561713762166e-06, + "loss": 0.5108, + "step": 816000 + }, + { + "epoch": 18.79432624113475, + "grad_norm": 2.913203239440918, + "learning_rate": 1.9143846075537004e-06, + "loss": 0.5183, + "step": 816200 + }, + { + "epoch": 18.798931564889013, + "grad_norm": 3.108332872390747, + "learning_rate": 1.9071130437311848e-06, + "loss": 0.514, + "step": 816400 + }, + { + "epoch": 18.803536888643272, + "grad_norm": 3.0031442642211914, + "learning_rate": 1.8998778377277817e-06, + "loss": 0.5184, + "step": 816600 + }, + { + "epoch": 18.80814221239753, + "grad_norm": 3.7675793170928955, + "learning_rate": 1.8926062739052661e-06, + "loss": 0.5141, + "step": 816800 + }, + { + "epoch": 18.812747536151793, + "grad_norm": 3.3338944911956787, + "learning_rate": 1.8853347100827506e-06, + "loss": 0.5124, + "step": 817000 + }, + { + "epoch": 18.81735285990605, + "grad_norm": 4.226090908050537, + "learning_rate": 1.8780631462602346e-06, + "loss": 0.5115, + "step": 817200 + }, + { + "epoch": 18.82195818366031, + "grad_norm": 3.7023580074310303, + "learning_rate": 1.870791582437719e-06, + "loss": 0.5124, + "step": 817400 + }, + { + "epoch": 18.826563507414573, + "grad_norm": 2.838956832885742, + "learning_rate": 1.8635200186152034e-06, + "loss": 0.524, + "step": 817600 + }, + { + "epoch": 18.83116883116883, + "grad_norm": 3.4643635749816895, + "learning_rate": 1.8562484547926876e-06, + "loss": 0.5137, + "step": 817800 + }, + { + "epoch": 18.83577415492309, + "grad_norm": 3.38789963722229, + "learning_rate": 1.848976890970172e-06, + "loss": 0.5007, + "step": 818000 + }, + { + "epoch": 18.840379478677352, + "grad_norm": 3.213732957839966, + "learning_rate": 1.8417053271476563e-06, + "loss": 0.5106, + "step": 818200 + }, + { + "epoch": 18.84498480243161, + "grad_norm": 3.5286900997161865, + "learning_rate": 1.8344337633251407e-06, + "loss": 0.5138, + "step": 818400 + }, + { + "epoch": 18.84959012618587, + "grad_norm": 3.5109329223632812, + "learning_rate": 1.827162199502625e-06, + "loss": 0.5178, + "step": 818600 + }, + { + "epoch": 18.854195449940132, + "grad_norm": 2.604468822479248, + "learning_rate": 1.8198906356801094e-06, + "loss": 0.5086, + "step": 818800 + }, + { + "epoch": 18.85880077369439, + "grad_norm": 3.2741763591766357, + "learning_rate": 1.8126190718575936e-06, + "loss": 0.5152, + "step": 819000 + }, + { + "epoch": 18.86340609744865, + "grad_norm": 3.35343337059021, + "learning_rate": 1.8053475080350782e-06, + "loss": 0.5163, + "step": 819200 + }, + { + "epoch": 18.86801142120291, + "grad_norm": 3.6704483032226562, + "learning_rate": 1.7980759442125625e-06, + "loss": 0.5255, + "step": 819400 + }, + { + "epoch": 18.87261674495717, + "grad_norm": 3.315521240234375, + "learning_rate": 1.790804380390047e-06, + "loss": 0.5222, + "step": 819600 + }, + { + "epoch": 18.87722206871143, + "grad_norm": 2.896103858947754, + "learning_rate": 1.7835328165675311e-06, + "loss": 0.5166, + "step": 819800 + }, + { + "epoch": 18.88182739246569, + "grad_norm": 2.6083381175994873, + "learning_rate": 1.7762612527450155e-06, + "loss": 0.5098, + "step": 820000 + }, + { + "epoch": 18.88643271621995, + "grad_norm": 3.2741894721984863, + "learning_rate": 1.7689896889224998e-06, + "loss": 0.5153, + "step": 820200 + }, + { + "epoch": 18.89103803997421, + "grad_norm": 2.9880030155181885, + "learning_rate": 1.7617181250999842e-06, + "loss": 0.5124, + "step": 820400 + }, + { + "epoch": 18.89564336372847, + "grad_norm": 3.405291795730591, + "learning_rate": 1.7544465612774684e-06, + "loss": 0.518, + "step": 820600 + }, + { + "epoch": 18.90024868748273, + "grad_norm": 2.607799768447876, + "learning_rate": 1.7472113552740653e-06, + "loss": 0.5123, + "step": 820800 + }, + { + "epoch": 18.90485401123699, + "grad_norm": 3.2214035987854004, + "learning_rate": 1.7399397914515498e-06, + "loss": 0.5064, + "step": 821000 + }, + { + "epoch": 18.90945933499125, + "grad_norm": 3.4306139945983887, + "learning_rate": 1.7327045854481467e-06, + "loss": 0.5062, + "step": 821200 + }, + { + "epoch": 18.91406465874551, + "grad_norm": 2.69059681892395, + "learning_rate": 1.7254330216256309e-06, + "loss": 0.5194, + "step": 821400 + }, + { + "epoch": 18.918669982499768, + "grad_norm": 3.644313097000122, + "learning_rate": 1.7181614578031153e-06, + "loss": 0.5043, + "step": 821600 + }, + { + "epoch": 18.92327530625403, + "grad_norm": 3.1063473224639893, + "learning_rate": 1.7108898939805995e-06, + "loss": 0.5124, + "step": 821800 + }, + { + "epoch": 18.92788063000829, + "grad_norm": 3.852067708969116, + "learning_rate": 1.703618330158084e-06, + "loss": 0.5106, + "step": 822000 + }, + { + "epoch": 18.932485953762548, + "grad_norm": 2.8518900871276855, + "learning_rate": 1.6963467663355682e-06, + "loss": 0.5175, + "step": 822200 + }, + { + "epoch": 18.93709127751681, + "grad_norm": 2.894487142562866, + "learning_rate": 1.6890752025130526e-06, + "loss": 0.5137, + "step": 822400 + }, + { + "epoch": 18.94169660127107, + "grad_norm": 3.1942451000213623, + "learning_rate": 1.6818036386905368e-06, + "loss": 0.5127, + "step": 822600 + }, + { + "epoch": 18.94630192502533, + "grad_norm": 3.005305051803589, + "learning_rate": 1.6745320748680213e-06, + "loss": 0.5046, + "step": 822800 + }, + { + "epoch": 18.95090724877959, + "grad_norm": 4.5873284339904785, + "learning_rate": 1.6672605110455055e-06, + "loss": 0.5146, + "step": 823000 + }, + { + "epoch": 18.95551257253385, + "grad_norm": 2.7985775470733643, + "learning_rate": 1.65998894722299e-06, + "loss": 0.5285, + "step": 823200 + }, + { + "epoch": 18.96011789628811, + "grad_norm": 3.6909046173095703, + "learning_rate": 1.6527173834004741e-06, + "loss": 0.522, + "step": 823400 + }, + { + "epoch": 18.96472322004237, + "grad_norm": 2.9598708152770996, + "learning_rate": 1.6454458195779586e-06, + "loss": 0.5164, + "step": 823600 + }, + { + "epoch": 18.969328543796628, + "grad_norm": 3.4591517448425293, + "learning_rate": 1.6381742557554428e-06, + "loss": 0.5118, + "step": 823800 + }, + { + "epoch": 18.97393386755089, + "grad_norm": 3.0720736980438232, + "learning_rate": 1.6309026919329272e-06, + "loss": 0.527, + "step": 824000 + }, + { + "epoch": 18.97853919130515, + "grad_norm": 4.1211018562316895, + "learning_rate": 1.6236311281104114e-06, + "loss": 0.5107, + "step": 824200 + }, + { + "epoch": 18.983144515059408, + "grad_norm": 2.694967746734619, + "learning_rate": 1.6163595642878959e-06, + "loss": 0.5063, + "step": 824400 + }, + { + "epoch": 18.98774983881367, + "grad_norm": 2.379441022872925, + "learning_rate": 1.60908800046538e-06, + "loss": 0.5177, + "step": 824600 + }, + { + "epoch": 18.99235516256793, + "grad_norm": 2.991995096206665, + "learning_rate": 1.6018164366428645e-06, + "loss": 0.5018, + "step": 824800 + }, + { + "epoch": 18.996960486322187, + "grad_norm": 3.4043898582458496, + "learning_rate": 1.5945448728203487e-06, + "loss": 0.5125, + "step": 825000 + }, + { + "epoch": 19.0, + "eval_loss": 0.5035088062286377, + "eval_runtime": 162.1896, + "eval_samples_per_second": 174.863, + "eval_steps_per_second": 10.932, + "step": 825132 + }, + { + "epoch": 19.00156581007645, + "grad_norm": 2.809699296951294, + "learning_rate": 1.5873096668169456e-06, + "loss": 0.5089, + "step": 825200 + }, + { + "epoch": 19.00617113383071, + "grad_norm": 3.992568254470825, + "learning_rate": 1.58003810299443e-06, + "loss": 0.5059, + "step": 825400 + }, + { + "epoch": 19.010776457584967, + "grad_norm": 2.535313129425049, + "learning_rate": 1.5727665391719143e-06, + "loss": 0.5081, + "step": 825600 + }, + { + "epoch": 19.01538178133923, + "grad_norm": 2.8412795066833496, + "learning_rate": 1.5654949753493987e-06, + "loss": 0.521, + "step": 825800 + }, + { + "epoch": 19.019987105093488, + "grad_norm": 3.4555296897888184, + "learning_rate": 1.558223411526883e-06, + "loss": 0.5154, + "step": 826000 + }, + { + "epoch": 19.024592428847747, + "grad_norm": 3.3591201305389404, + "learning_rate": 1.5509518477043674e-06, + "loss": 0.5027, + "step": 826200 + }, + { + "epoch": 19.02919775260201, + "grad_norm": 2.9588282108306885, + "learning_rate": 1.5436802838818516e-06, + "loss": 0.516, + "step": 826400 + }, + { + "epoch": 19.033803076356268, + "grad_norm": 3.250680446624756, + "learning_rate": 1.536408720059336e-06, + "loss": 0.5068, + "step": 826600 + }, + { + "epoch": 19.038408400110526, + "grad_norm": 2.998945713043213, + "learning_rate": 1.5291371562368202e-06, + "loss": 0.5137, + "step": 826800 + }, + { + "epoch": 19.04301372386479, + "grad_norm": 2.949064254760742, + "learning_rate": 1.5218655924143047e-06, + "loss": 0.4984, + "step": 827000 + }, + { + "epoch": 19.047619047619047, + "grad_norm": 3.1367082595825195, + "learning_rate": 1.5146303864109016e-06, + "loss": 0.5117, + "step": 827200 + }, + { + "epoch": 19.052224371373306, + "grad_norm": 2.956386089324951, + "learning_rate": 1.5073588225883858e-06, + "loss": 0.5142, + "step": 827400 + }, + { + "epoch": 19.05682969512757, + "grad_norm": 3.038003444671631, + "learning_rate": 1.5000872587658702e-06, + "loss": 0.4999, + "step": 827600 + }, + { + "epoch": 19.061435018881827, + "grad_norm": 3.545823335647583, + "learning_rate": 1.4928156949433545e-06, + "loss": 0.5041, + "step": 827800 + }, + { + "epoch": 19.066040342636086, + "grad_norm": 2.507877826690674, + "learning_rate": 1.4855804889399514e-06, + "loss": 0.5173, + "step": 828000 + }, + { + "epoch": 19.070645666390348, + "grad_norm": 2.498446464538574, + "learning_rate": 1.4783089251174358e-06, + "loss": 0.5251, + "step": 828200 + }, + { + "epoch": 19.075250990144607, + "grad_norm": 3.057725429534912, + "learning_rate": 1.47103736129492e-06, + "loss": 0.511, + "step": 828400 + }, + { + "epoch": 19.079856313898865, + "grad_norm": 2.1749329566955566, + "learning_rate": 1.4637657974724044e-06, + "loss": 0.49, + "step": 828600 + }, + { + "epoch": 19.084461637653128, + "grad_norm": 2.94429874420166, + "learning_rate": 1.4564942336498887e-06, + "loss": 0.5181, + "step": 828800 + }, + { + "epoch": 19.089066961407386, + "grad_norm": 2.4639828205108643, + "learning_rate": 1.449222669827373e-06, + "loss": 0.5149, + "step": 829000 + }, + { + "epoch": 19.093672285161645, + "grad_norm": 3.422637462615967, + "learning_rate": 1.4419511060048575e-06, + "loss": 0.5154, + "step": 829200 + }, + { + "epoch": 19.098277608915907, + "grad_norm": 2.8384313583374023, + "learning_rate": 1.434679542182342e-06, + "loss": 0.505, + "step": 829400 + }, + { + "epoch": 19.102882932670166, + "grad_norm": 2.945739984512329, + "learning_rate": 1.4274079783598262e-06, + "loss": 0.5124, + "step": 829600 + }, + { + "epoch": 19.10748825642443, + "grad_norm": 2.952714204788208, + "learning_rate": 1.4201364145373106e-06, + "loss": 0.5124, + "step": 829800 + }, + { + "epoch": 19.112093580178687, + "grad_norm": 4.0392680168151855, + "learning_rate": 1.4128648507147948e-06, + "loss": 0.5164, + "step": 830000 + }, + { + "epoch": 19.116698903932946, + "grad_norm": 3.5149545669555664, + "learning_rate": 1.4055932868922793e-06, + "loss": 0.511, + "step": 830200 + }, + { + "epoch": 19.121304227687208, + "grad_norm": 2.5348634719848633, + "learning_rate": 1.3983217230697635e-06, + "loss": 0.5078, + "step": 830400 + }, + { + "epoch": 19.125909551441467, + "grad_norm": 3.143413782119751, + "learning_rate": 1.3910865170663604e-06, + "loss": 0.5182, + "step": 830600 + }, + { + "epoch": 19.130514875195725, + "grad_norm": 2.6804118156433105, + "learning_rate": 1.3838149532438448e-06, + "loss": 0.5154, + "step": 830800 + }, + { + "epoch": 19.135120198949988, + "grad_norm": 3.680730104446411, + "learning_rate": 1.376543389421329e-06, + "loss": 0.5083, + "step": 831000 + }, + { + "epoch": 19.139725522704246, + "grad_norm": 3.507612943649292, + "learning_rate": 1.369308183417926e-06, + "loss": 0.5128, + "step": 831200 + }, + { + "epoch": 19.144330846458505, + "grad_norm": 2.8536086082458496, + "learning_rate": 1.3620366195954104e-06, + "loss": 0.4996, + "step": 831400 + }, + { + "epoch": 19.148936170212767, + "grad_norm": 3.133923053741455, + "learning_rate": 1.3547650557728946e-06, + "loss": 0.5172, + "step": 831600 + }, + { + "epoch": 19.153541493967026, + "grad_norm": 3.9035439491271973, + "learning_rate": 1.347493491950379e-06, + "loss": 0.5043, + "step": 831800 + }, + { + "epoch": 19.158146817721285, + "grad_norm": 2.622859477996826, + "learning_rate": 1.3402219281278632e-06, + "loss": 0.5147, + "step": 832000 + }, + { + "epoch": 19.162752141475547, + "grad_norm": 3.7109525203704834, + "learning_rate": 1.3329503643053477e-06, + "loss": 0.5115, + "step": 832200 + }, + { + "epoch": 19.167357465229806, + "grad_norm": 3.2106027603149414, + "learning_rate": 1.325678800482832e-06, + "loss": 0.5089, + "step": 832400 + }, + { + "epoch": 19.171962788984064, + "grad_norm": 2.890758514404297, + "learning_rate": 1.3184072366603163e-06, + "loss": 0.5142, + "step": 832600 + }, + { + "epoch": 19.176568112738327, + "grad_norm": 3.477639675140381, + "learning_rate": 1.3111356728378005e-06, + "loss": 0.5041, + "step": 832800 + }, + { + "epoch": 19.181173436492585, + "grad_norm": 3.0593950748443604, + "learning_rate": 1.303864109015285e-06, + "loss": 0.5304, + "step": 833000 + }, + { + "epoch": 19.185778760246844, + "grad_norm": 2.967453956604004, + "learning_rate": 1.2965925451927692e-06, + "loss": 0.5035, + "step": 833200 + }, + { + "epoch": 19.190384084001106, + "grad_norm": 3.582881450653076, + "learning_rate": 1.2893209813702536e-06, + "loss": 0.5136, + "step": 833400 + }, + { + "epoch": 19.194989407755365, + "grad_norm": 3.2862138748168945, + "learning_rate": 1.2820494175477379e-06, + "loss": 0.5052, + "step": 833600 + }, + { + "epoch": 19.199594731509624, + "grad_norm": 3.457026720046997, + "learning_rate": 1.2747778537252223e-06, + "loss": 0.5166, + "step": 833800 + }, + { + "epoch": 19.204200055263886, + "grad_norm": 3.373370885848999, + "learning_rate": 1.2675062899027065e-06, + "loss": 0.5075, + "step": 834000 + }, + { + "epoch": 19.208805379018145, + "grad_norm": 3.317833185195923, + "learning_rate": 1.260234726080191e-06, + "loss": 0.5078, + "step": 834200 + }, + { + "epoch": 19.213410702772403, + "grad_norm": 2.9720406532287598, + "learning_rate": 1.2529631622576752e-06, + "loss": 0.5074, + "step": 834400 + }, + { + "epoch": 19.218016026526666, + "grad_norm": 3.7691690921783447, + "learning_rate": 1.245727956254272e-06, + "loss": 0.5205, + "step": 834600 + }, + { + "epoch": 19.222621350280924, + "grad_norm": 3.1545891761779785, + "learning_rate": 1.2384563924317565e-06, + "loss": 0.5151, + "step": 834800 + }, + { + "epoch": 19.227226674035183, + "grad_norm": 2.8010830879211426, + "learning_rate": 1.2311848286092407e-06, + "loss": 0.515, + "step": 835000 + }, + { + "epoch": 19.231831997789445, + "grad_norm": 2.3824315071105957, + "learning_rate": 1.2239132647867251e-06, + "loss": 0.5145, + "step": 835200 + }, + { + "epoch": 19.236437321543704, + "grad_norm": 3.0972163677215576, + "learning_rate": 1.2166417009642094e-06, + "loss": 0.5138, + "step": 835400 + }, + { + "epoch": 19.241042645297963, + "grad_norm": 2.7618439197540283, + "learning_rate": 1.2093701371416938e-06, + "loss": 0.5125, + "step": 835600 + }, + { + "epoch": 19.245647969052225, + "grad_norm": 3.029303789138794, + "learning_rate": 1.202098573319178e-06, + "loss": 0.5098, + "step": 835800 + }, + { + "epoch": 19.250253292806484, + "grad_norm": 3.3938510417938232, + "learning_rate": 1.1948270094966622e-06, + "loss": 0.5101, + "step": 836000 + }, + { + "epoch": 19.254858616560742, + "grad_norm": 3.0954368114471436, + "learning_rate": 1.1875554456741467e-06, + "loss": 0.5129, + "step": 836200 + }, + { + "epoch": 19.259463940315005, + "grad_norm": 2.748145818710327, + "learning_rate": 1.1802838818516309e-06, + "loss": 0.517, + "step": 836400 + }, + { + "epoch": 19.264069264069263, + "grad_norm": 3.037444829940796, + "learning_rate": 1.1730123180291153e-06, + "loss": 0.5031, + "step": 836600 + }, + { + "epoch": 19.268674587823526, + "grad_norm": 2.792266845703125, + "learning_rate": 1.1657407542065995e-06, + "loss": 0.5045, + "step": 836800 + }, + { + "epoch": 19.273279911577784, + "grad_norm": 2.9860918521881104, + "learning_rate": 1.158469190384084e-06, + "loss": 0.5285, + "step": 837000 + }, + { + "epoch": 19.277885235332043, + "grad_norm": 3.5693347454071045, + "learning_rate": 1.1511976265615682e-06, + "loss": 0.5165, + "step": 837200 + }, + { + "epoch": 19.282490559086305, + "grad_norm": 2.5332469940185547, + "learning_rate": 1.1439260627390526e-06, + "loss": 0.5243, + "step": 837400 + }, + { + "epoch": 19.287095882840564, + "grad_norm": 3.3224828243255615, + "learning_rate": 1.1366544989165368e-06, + "loss": 0.5106, + "step": 837600 + }, + { + "epoch": 19.291701206594823, + "grad_norm": 3.539848804473877, + "learning_rate": 1.1293829350940213e-06, + "loss": 0.516, + "step": 837800 + }, + { + "epoch": 19.296306530349085, + "grad_norm": 4.043239116668701, + "learning_rate": 1.1221113712715057e-06, + "loss": 0.5165, + "step": 838000 + }, + { + "epoch": 19.300911854103344, + "grad_norm": 3.046630620956421, + "learning_rate": 1.1148398074489901e-06, + "loss": 0.5079, + "step": 838200 + }, + { + "epoch": 19.305517177857602, + "grad_norm": 2.7998993396759033, + "learning_rate": 1.1075682436264743e-06, + "loss": 0.5248, + "step": 838400 + }, + { + "epoch": 19.310122501611865, + "grad_norm": 3.3862979412078857, + "learning_rate": 1.1002966798039588e-06, + "loss": 0.5173, + "step": 838600 + }, + { + "epoch": 19.314727825366123, + "grad_norm": 3.7672178745269775, + "learning_rate": 1.093025115981443e-06, + "loss": 0.5157, + "step": 838800 + }, + { + "epoch": 19.319333149120382, + "grad_norm": 3.1289238929748535, + "learning_rate": 1.0857535521589274e-06, + "loss": 0.5193, + "step": 839000 + }, + { + "epoch": 19.323938472874644, + "grad_norm": 3.5175318717956543, + "learning_rate": 1.0784819883364116e-06, + "loss": 0.5246, + "step": 839200 + }, + { + "epoch": 19.328543796628903, + "grad_norm": 2.3767151832580566, + "learning_rate": 1.071210424513896e-06, + "loss": 0.5068, + "step": 839400 + }, + { + "epoch": 19.333149120383162, + "grad_norm": 2.1531851291656494, + "learning_rate": 1.0639388606913803e-06, + "loss": 0.5172, + "step": 839600 + }, + { + "epoch": 19.337754444137424, + "grad_norm": 3.3230369091033936, + "learning_rate": 1.0566672968688647e-06, + "loss": 0.5105, + "step": 839800 + }, + { + "epoch": 19.342359767891683, + "grad_norm": 3.470402956008911, + "learning_rate": 1.049395733046349e-06, + "loss": 0.5121, + "step": 840000 + }, + { + "epoch": 19.34696509164594, + "grad_norm": 3.0232300758361816, + "learning_rate": 1.0421241692238334e-06, + "loss": 0.5038, + "step": 840200 + }, + { + "epoch": 19.351570415400204, + "grad_norm": 2.391409397125244, + "learning_rate": 1.0348526054013176e-06, + "loss": 0.5104, + "step": 840400 + }, + { + "epoch": 19.356175739154462, + "grad_norm": 2.891075849533081, + "learning_rate": 1.027581041578802e-06, + "loss": 0.5088, + "step": 840600 + }, + { + "epoch": 19.36078106290872, + "grad_norm": 3.1132750511169434, + "learning_rate": 1.0203094777562862e-06, + "loss": 0.5115, + "step": 840800 + }, + { + "epoch": 19.365386386662983, + "grad_norm": 2.8549110889434814, + "learning_rate": 1.0130742717528832e-06, + "loss": 0.5142, + "step": 841000 + }, + { + "epoch": 19.369991710417242, + "grad_norm": 3.4312000274658203, + "learning_rate": 1.0058027079303676e-06, + "loss": 0.5001, + "step": 841200 + }, + { + "epoch": 19.3745970341715, + "grad_norm": 3.5737624168395996, + "learning_rate": 9.985675019269645e-07, + "loss": 0.5036, + "step": 841400 + }, + { + "epoch": 19.379202357925763, + "grad_norm": 2.611541748046875, + "learning_rate": 9.912959381044487e-07, + "loss": 0.5061, + "step": 841600 + }, + { + "epoch": 19.38380768168002, + "grad_norm": 3.1176815032958984, + "learning_rate": 9.840243742819331e-07, + "loss": 0.5011, + "step": 841800 + }, + { + "epoch": 19.38841300543428, + "grad_norm": 3.4241085052490234, + "learning_rate": 9.767528104594174e-07, + "loss": 0.5101, + "step": 842000 + }, + { + "epoch": 19.393018329188543, + "grad_norm": 3.769061803817749, + "learning_rate": 9.694812466369018e-07, + "loss": 0.5212, + "step": 842200 + }, + { + "epoch": 19.3976236529428, + "grad_norm": 3.25093674659729, + "learning_rate": 9.62209682814386e-07, + "loss": 0.5043, + "step": 842400 + }, + { + "epoch": 19.40222897669706, + "grad_norm": 3.0781145095825195, + "learning_rate": 9.549381189918704e-07, + "loss": 0.5031, + "step": 842600 + }, + { + "epoch": 19.406834300451322, + "grad_norm": 2.676129102706909, + "learning_rate": 9.476665551693547e-07, + "loss": 0.5187, + "step": 842800 + }, + { + "epoch": 19.41143962420558, + "grad_norm": 3.2870798110961914, + "learning_rate": 9.403949913468391e-07, + "loss": 0.5144, + "step": 843000 + }, + { + "epoch": 19.41604494795984, + "grad_norm": 2.994854211807251, + "learning_rate": 9.331234275243234e-07, + "loss": 0.5111, + "step": 843200 + }, + { + "epoch": 19.420650271714102, + "grad_norm": 4.02811336517334, + "learning_rate": 9.258882215209203e-07, + "loss": 0.5123, + "step": 843400 + }, + { + "epoch": 19.42525559546836, + "grad_norm": 2.6425375938415527, + "learning_rate": 9.186166576984047e-07, + "loss": 0.5134, + "step": 843600 + }, + { + "epoch": 19.429860919222623, + "grad_norm": 3.219266891479492, + "learning_rate": 9.11345093875889e-07, + "loss": 0.5197, + "step": 843800 + }, + { + "epoch": 19.43446624297688, + "grad_norm": 3.178544282913208, + "learning_rate": 9.040735300533733e-07, + "loss": 0.5238, + "step": 844000 + }, + { + "epoch": 19.43907156673114, + "grad_norm": 2.9460504055023193, + "learning_rate": 8.968019662308576e-07, + "loss": 0.5171, + "step": 844200 + }, + { + "epoch": 19.443676890485403, + "grad_norm": 2.947618246078491, + "learning_rate": 8.89530402408342e-07, + "loss": 0.5113, + "step": 844400 + }, + { + "epoch": 19.44828221423966, + "grad_norm": 3.1434247493743896, + "learning_rate": 8.822588385858263e-07, + "loss": 0.5121, + "step": 844600 + }, + { + "epoch": 19.45288753799392, + "grad_norm": 4.052412986755371, + "learning_rate": 8.749872747633106e-07, + "loss": 0.5132, + "step": 844800 + }, + { + "epoch": 19.457492861748182, + "grad_norm": 2.8195929527282715, + "learning_rate": 8.677157109407949e-07, + "loss": 0.5209, + "step": 845000 + }, + { + "epoch": 19.46209818550244, + "grad_norm": 2.5590524673461914, + "learning_rate": 8.604805049373918e-07, + "loss": 0.5102, + "step": 845200 + }, + { + "epoch": 19.4667035092567, + "grad_norm": 2.98577618598938, + "learning_rate": 8.532089411148762e-07, + "loss": 0.5053, + "step": 845400 + }, + { + "epoch": 19.471308833010962, + "grad_norm": 3.019172430038452, + "learning_rate": 8.459373772923605e-07, + "loss": 0.5076, + "step": 845600 + }, + { + "epoch": 19.47591415676522, + "grad_norm": 3.0066978931427, + "learning_rate": 8.386658134698449e-07, + "loss": 0.5134, + "step": 845800 + }, + { + "epoch": 19.48051948051948, + "grad_norm": 2.8860740661621094, + "learning_rate": 8.313942496473292e-07, + "loss": 0.5171, + "step": 846000 + }, + { + "epoch": 19.48512480427374, + "grad_norm": 3.4044198989868164, + "learning_rate": 8.241226858248136e-07, + "loss": 0.5019, + "step": 846200 + }, + { + "epoch": 19.489730128028, + "grad_norm": 2.7292261123657227, + "learning_rate": 8.168511220022979e-07, + "loss": 0.5093, + "step": 846400 + }, + { + "epoch": 19.49433545178226, + "grad_norm": 3.8156752586364746, + "learning_rate": 8.095795581797822e-07, + "loss": 0.5105, + "step": 846600 + }, + { + "epoch": 19.49894077553652, + "grad_norm": 3.2243752479553223, + "learning_rate": 8.023079943572666e-07, + "loss": 0.5104, + "step": 846800 + }, + { + "epoch": 19.50354609929078, + "grad_norm": 3.429675340652466, + "learning_rate": 7.950364305347509e-07, + "loss": 0.5064, + "step": 847000 + }, + { + "epoch": 19.50815142304504, + "grad_norm": 2.856170415878296, + "learning_rate": 7.877648667122352e-07, + "loss": 0.5156, + "step": 847200 + }, + { + "epoch": 19.5127567467993, + "grad_norm": 3.27316951751709, + "learning_rate": 7.804933028897195e-07, + "loss": 0.4983, + "step": 847400 + }, + { + "epoch": 19.51736207055356, + "grad_norm": 3.1368207931518555, + "learning_rate": 7.732217390672039e-07, + "loss": 0.5118, + "step": 847600 + }, + { + "epoch": 19.52196739430782, + "grad_norm": 2.677781581878662, + "learning_rate": 7.659501752446882e-07, + "loss": 0.5106, + "step": 847800 + }, + { + "epoch": 19.52657271806208, + "grad_norm": 2.9507906436920166, + "learning_rate": 7.586786114221725e-07, + "loss": 0.5203, + "step": 848000 + }, + { + "epoch": 19.53117804181634, + "grad_norm": 3.28304123878479, + "learning_rate": 7.514070475996568e-07, + "loss": 0.5062, + "step": 848200 + }, + { + "epoch": 19.535783365570598, + "grad_norm": 3.1453018188476562, + "learning_rate": 7.44135483777141e-07, + "loss": 0.5085, + "step": 848400 + }, + { + "epoch": 19.54038868932486, + "grad_norm": 2.7345168590545654, + "learning_rate": 7.368639199546254e-07, + "loss": 0.5128, + "step": 848600 + }, + { + "epoch": 19.54499401307912, + "grad_norm": 3.114023447036743, + "learning_rate": 7.295923561321097e-07, + "loss": 0.5089, + "step": 848800 + }, + { + "epoch": 19.549599336833378, + "grad_norm": 2.9676406383514404, + "learning_rate": 7.22320792309594e-07, + "loss": 0.5033, + "step": 849000 + }, + { + "epoch": 19.55420466058764, + "grad_norm": 3.178053855895996, + "learning_rate": 7.150492284870785e-07, + "loss": 0.5015, + "step": 849200 + }, + { + "epoch": 19.5588099843419, + "grad_norm": 2.661820411682129, + "learning_rate": 7.077776646645628e-07, + "loss": 0.5122, + "step": 849400 + }, + { + "epoch": 19.563415308096157, + "grad_norm": 3.9830830097198486, + "learning_rate": 7.005061008420471e-07, + "loss": 0.5087, + "step": 849600 + }, + { + "epoch": 19.56802063185042, + "grad_norm": 2.1384353637695312, + "learning_rate": 6.932345370195314e-07, + "loss": 0.5082, + "step": 849800 + }, + { + "epoch": 19.57262595560468, + "grad_norm": 3.7364590167999268, + "learning_rate": 6.859629731970158e-07, + "loss": 0.4953, + "step": 850000 + }, + { + "epoch": 19.57723127935894, + "grad_norm": 2.6958413124084473, + "learning_rate": 6.786914093745001e-07, + "loss": 0.5113, + "step": 850200 + }, + { + "epoch": 19.5818366031132, + "grad_norm": 3.8758256435394287, + "learning_rate": 6.714198455519844e-07, + "loss": 0.507, + "step": 850400 + }, + { + "epoch": 19.586441926867458, + "grad_norm": 2.9180338382720947, + "learning_rate": 6.641482817294687e-07, + "loss": 0.519, + "step": 850600 + }, + { + "epoch": 19.59104725062172, + "grad_norm": 2.6355738639831543, + "learning_rate": 6.568767179069531e-07, + "loss": 0.4985, + "step": 850800 + }, + { + "epoch": 19.59565257437598, + "grad_norm": 2.67168927192688, + "learning_rate": 6.4964151190355e-07, + "loss": 0.5045, + "step": 851000 + }, + { + "epoch": 19.600257898130238, + "grad_norm": 4.244370460510254, + "learning_rate": 6.423699480810343e-07, + "loss": 0.5072, + "step": 851200 + }, + { + "epoch": 19.6048632218845, + "grad_norm": 3.0310142040252686, + "learning_rate": 6.350983842585186e-07, + "loss": 0.5097, + "step": 851400 + }, + { + "epoch": 19.60946854563876, + "grad_norm": 3.2820241451263428, + "learning_rate": 6.278268204360029e-07, + "loss": 0.5038, + "step": 851600 + }, + { + "epoch": 19.614073869393017, + "grad_norm": 3.4763691425323486, + "learning_rate": 6.205552566134873e-07, + "loss": 0.5262, + "step": 851800 + }, + { + "epoch": 19.61867919314728, + "grad_norm": 3.8789632320404053, + "learning_rate": 6.132836927909716e-07, + "loss": 0.5077, + "step": 852000 + }, + { + "epoch": 19.62328451690154, + "grad_norm": 2.8242287635803223, + "learning_rate": 6.060121289684559e-07, + "loss": 0.5073, + "step": 852200 + }, + { + "epoch": 19.627889840655797, + "grad_norm": 3.055107593536377, + "learning_rate": 5.987405651459402e-07, + "loss": 0.5079, + "step": 852400 + }, + { + "epoch": 19.63249516441006, + "grad_norm": 3.6184749603271484, + "learning_rate": 5.914690013234246e-07, + "loss": 0.5184, + "step": 852600 + }, + { + "epoch": 19.637100488164318, + "grad_norm": 2.965026378631592, + "learning_rate": 5.841974375009089e-07, + "loss": 0.5054, + "step": 852800 + }, + { + "epoch": 19.641705811918577, + "grad_norm": 2.952613115310669, + "learning_rate": 5.769258736783932e-07, + "loss": 0.5095, + "step": 853000 + }, + { + "epoch": 19.64631113567284, + "grad_norm": 3.1526806354522705, + "learning_rate": 5.696543098558776e-07, + "loss": 0.501, + "step": 853200 + }, + { + "epoch": 19.650916459427098, + "grad_norm": 2.6503217220306396, + "learning_rate": 5.62382746033362e-07, + "loss": 0.5078, + "step": 853400 + }, + { + "epoch": 19.655521783181356, + "grad_norm": 3.481039524078369, + "learning_rate": 5.551111822108463e-07, + "loss": 0.5219, + "step": 853600 + }, + { + "epoch": 19.66012710693562, + "grad_norm": 2.944347858428955, + "learning_rate": 5.478396183883306e-07, + "loss": 0.5095, + "step": 853800 + }, + { + "epoch": 19.664732430689877, + "grad_norm": 3.2116689682006836, + "learning_rate": 5.40568054565815e-07, + "loss": 0.5008, + "step": 854000 + }, + { + "epoch": 19.669337754444136, + "grad_norm": 2.333085775375366, + "learning_rate": 5.332964907432993e-07, + "loss": 0.5028, + "step": 854200 + }, + { + "epoch": 19.6739430781984, + "grad_norm": 3.2551276683807373, + "learning_rate": 5.260249269207836e-07, + "loss": 0.5081, + "step": 854400 + }, + { + "epoch": 19.678548401952657, + "grad_norm": 2.8687589168548584, + "learning_rate": 5.187533630982679e-07, + "loss": 0.5078, + "step": 854600 + }, + { + "epoch": 19.683153725706916, + "grad_norm": 2.7332775592803955, + "learning_rate": 5.114817992757523e-07, + "loss": 0.5065, + "step": 854800 + }, + { + "epoch": 19.687759049461178, + "grad_norm": 3.2149274349212646, + "learning_rate": 5.042465932723492e-07, + "loss": 0.5121, + "step": 855000 + }, + { + "epoch": 19.692364373215437, + "grad_norm": 3.2051103115081787, + "learning_rate": 4.969750294498335e-07, + "loss": 0.5176, + "step": 855200 + }, + { + "epoch": 19.696969696969695, + "grad_norm": 4.147640228271484, + "learning_rate": 4.897034656273178e-07, + "loss": 0.4981, + "step": 855400 + }, + { + "epoch": 19.701575020723958, + "grad_norm": 2.6071434020996094, + "learning_rate": 4.824319018048021e-07, + "loss": 0.519, + "step": 855600 + }, + { + "epoch": 19.706180344478216, + "grad_norm": 2.8884124755859375, + "learning_rate": 4.7516033798228646e-07, + "loss": 0.5043, + "step": 855800 + }, + { + "epoch": 19.710785668232475, + "grad_norm": 3.735039234161377, + "learning_rate": 4.678887741597708e-07, + "loss": 0.5042, + "step": 856000 + }, + { + "epoch": 19.715390991986737, + "grad_norm": 2.957726001739502, + "learning_rate": 4.606172103372551e-07, + "loss": 0.5158, + "step": 856200 + }, + { + "epoch": 19.719996315740996, + "grad_norm": 2.8788440227508545, + "learning_rate": 4.533456465147395e-07, + "loss": 0.5084, + "step": 856400 + }, + { + "epoch": 19.724601639495255, + "grad_norm": 3.8605153560638428, + "learning_rate": 4.460740826922238e-07, + "loss": 0.5201, + "step": 856600 + }, + { + "epoch": 19.729206963249517, + "grad_norm": 3.0567197799682617, + "learning_rate": 4.3880251886970814e-07, + "loss": 0.4976, + "step": 856800 + }, + { + "epoch": 19.733812287003776, + "grad_norm": 3.3743460178375244, + "learning_rate": 4.3153095504719246e-07, + "loss": 0.509, + "step": 857000 + }, + { + "epoch": 19.738417610758034, + "grad_norm": 2.7658159732818604, + "learning_rate": 4.242593912246768e-07, + "loss": 0.5259, + "step": 857200 + }, + { + "epoch": 19.743022934512297, + "grad_norm": 3.7195098400115967, + "learning_rate": 4.170241852212737e-07, + "loss": 0.5088, + "step": 857400 + }, + { + "epoch": 19.747628258266555, + "grad_norm": 2.6873762607574463, + "learning_rate": 4.09752621398758e-07, + "loss": 0.5168, + "step": 857600 + }, + { + "epoch": 19.752233582020818, + "grad_norm": 2.6738898754119873, + "learning_rate": 4.0248105757624235e-07, + "loss": 0.5081, + "step": 857800 + }, + { + "epoch": 19.756838905775076, + "grad_norm": 2.91037917137146, + "learning_rate": 3.9520949375372667e-07, + "loss": 0.5066, + "step": 858000 + }, + { + "epoch": 19.761444229529335, + "grad_norm": 2.883091926574707, + "learning_rate": 3.87937929931211e-07, + "loss": 0.5125, + "step": 858200 + }, + { + "epoch": 19.766049553283597, + "grad_norm": 2.677788257598877, + "learning_rate": 3.806663661086953e-07, + "loss": 0.5051, + "step": 858400 + }, + { + "epoch": 19.770654877037856, + "grad_norm": 3.4007511138916016, + "learning_rate": 3.7339480228617965e-07, + "loss": 0.5061, + "step": 858600 + }, + { + "epoch": 19.775260200792115, + "grad_norm": 3.597402572631836, + "learning_rate": 3.6612323846366403e-07, + "loss": 0.5014, + "step": 858800 + }, + { + "epoch": 19.779865524546377, + "grad_norm": 3.0766663551330566, + "learning_rate": 3.5885167464114835e-07, + "loss": 0.5205, + "step": 859000 + }, + { + "epoch": 19.784470848300636, + "grad_norm": 3.5148487091064453, + "learning_rate": 3.516164686377452e-07, + "loss": 0.5238, + "step": 859200 + }, + { + "epoch": 19.789076172054894, + "grad_norm": 2.736527681350708, + "learning_rate": 3.4434490481522953e-07, + "loss": 0.5033, + "step": 859400 + }, + { + "epoch": 19.793681495809157, + "grad_norm": 3.351614236831665, + "learning_rate": 3.370733409927139e-07, + "loss": 0.5144, + "step": 859600 + }, + { + "epoch": 19.798286819563415, + "grad_norm": 2.9245457649230957, + "learning_rate": 3.2980177717019824e-07, + "loss": 0.5079, + "step": 859800 + }, + { + "epoch": 19.802892143317674, + "grad_norm": 2.770165205001831, + "learning_rate": 3.2253021334768256e-07, + "loss": 0.5054, + "step": 860000 + }, + { + "epoch": 19.807497467071936, + "grad_norm": 3.2325165271759033, + "learning_rate": 3.152586495251669e-07, + "loss": 0.5162, + "step": 860200 + }, + { + "epoch": 19.812102790826195, + "grad_norm": 2.7553768157958984, + "learning_rate": 3.079870857026512e-07, + "loss": 0.5005, + "step": 860400 + }, + { + "epoch": 19.816708114580454, + "grad_norm": 3.5754334926605225, + "learning_rate": 3.0071552188013554e-07, + "loss": 0.5289, + "step": 860600 + }, + { + "epoch": 19.821313438334716, + "grad_norm": 3.362692356109619, + "learning_rate": 2.9344395805761986e-07, + "loss": 0.5098, + "step": 860800 + }, + { + "epoch": 19.825918762088975, + "grad_norm": 2.9253594875335693, + "learning_rate": 2.861723942351042e-07, + "loss": 0.5011, + "step": 861000 + }, + { + "epoch": 19.830524085843233, + "grad_norm": 3.8785996437072754, + "learning_rate": 2.7890083041258857e-07, + "loss": 0.5125, + "step": 861200 + }, + { + "epoch": 19.835129409597496, + "grad_norm": 3.061610698699951, + "learning_rate": 2.716292665900729e-07, + "loss": 0.5266, + "step": 861400 + }, + { + "epoch": 19.839734733351754, + "grad_norm": 3.2867627143859863, + "learning_rate": 2.643577027675572e-07, + "loss": 0.5109, + "step": 861600 + }, + { + "epoch": 19.844340057106013, + "grad_norm": 3.554579496383667, + "learning_rate": 2.5708613894504154e-07, + "loss": 0.5218, + "step": 861800 + }, + { + "epoch": 19.848945380860275, + "grad_norm": 3.178513288497925, + "learning_rate": 2.4981457512252587e-07, + "loss": 0.5147, + "step": 862000 + }, + { + "epoch": 19.853550704614534, + "grad_norm": 3.356754779815674, + "learning_rate": 2.425430113000102e-07, + "loss": 0.5141, + "step": 862200 + }, + { + "epoch": 19.858156028368793, + "grad_norm": 2.666679859161377, + "learning_rate": 2.3527144747749452e-07, + "loss": 0.5183, + "step": 862400 + }, + { + "epoch": 19.862761352123055, + "grad_norm": 3.7374143600463867, + "learning_rate": 2.2799988365497884e-07, + "loss": 0.5166, + "step": 862600 + }, + { + "epoch": 19.867366675877314, + "grad_norm": 3.064572334289551, + "learning_rate": 2.2072831983246317e-07, + "loss": 0.5002, + "step": 862800 + }, + { + "epoch": 19.871971999631572, + "grad_norm": 3.1594460010528564, + "learning_rate": 2.134567560099475e-07, + "loss": 0.5128, + "step": 863000 + }, + { + "epoch": 19.876577323385835, + "grad_norm": 3.091594934463501, + "learning_rate": 2.0618519218743182e-07, + "loss": 0.5092, + "step": 863200 + }, + { + "epoch": 19.881182647140093, + "grad_norm": 3.6168246269226074, + "learning_rate": 1.9891362836491617e-07, + "loss": 0.5154, + "step": 863400 + }, + { + "epoch": 19.885787970894356, + "grad_norm": 3.3390212059020996, + "learning_rate": 1.916420645424005e-07, + "loss": 0.5118, + "step": 863600 + }, + { + "epoch": 19.890393294648614, + "grad_norm": 2.9384098052978516, + "learning_rate": 1.8437050071988482e-07, + "loss": 0.4973, + "step": 863800 + }, + { + "epoch": 19.894998618402873, + "grad_norm": 3.3351097106933594, + "learning_rate": 1.7713529471648173e-07, + "loss": 0.5152, + "step": 864000 + }, + { + "epoch": 19.899603942157135, + "grad_norm": 3.4556448459625244, + "learning_rate": 1.6986373089396606e-07, + "loss": 0.5134, + "step": 864200 + }, + { + "epoch": 19.904209265911394, + "grad_norm": 3.008500337600708, + "learning_rate": 1.6262852489056296e-07, + "loss": 0.5085, + "step": 864400 + }, + { + "epoch": 19.908814589665653, + "grad_norm": 2.8039371967315674, + "learning_rate": 1.5535696106804732e-07, + "loss": 0.513, + "step": 864600 + }, + { + "epoch": 19.913419913419915, + "grad_norm": 4.19392204284668, + "learning_rate": 1.4808539724553164e-07, + "loss": 0.5077, + "step": 864800 + }, + { + "epoch": 19.918025237174174, + "grad_norm": 3.3566884994506836, + "learning_rate": 1.4081383342301597e-07, + "loss": 0.5156, + "step": 865000 + }, + { + "epoch": 19.922630560928432, + "grad_norm": 3.012193202972412, + "learning_rate": 1.335422696005003e-07, + "loss": 0.5019, + "step": 865200 + }, + { + "epoch": 19.927235884682695, + "grad_norm": 3.2162373065948486, + "learning_rate": 1.2627070577798462e-07, + "loss": 0.5135, + "step": 865400 + }, + { + "epoch": 19.931841208436953, + "grad_norm": 3.0207014083862305, + "learning_rate": 1.1903549977458152e-07, + "loss": 0.506, + "step": 865600 + }, + { + "epoch": 19.936446532191212, + "grad_norm": 3.044567823410034, + "learning_rate": 1.1176393595206585e-07, + "loss": 0.5098, + "step": 865800 + }, + { + "epoch": 19.941051855945474, + "grad_norm": 2.884814977645874, + "learning_rate": 1.0449237212955019e-07, + "loss": 0.5142, + "step": 866000 + }, + { + "epoch": 19.945657179699733, + "grad_norm": 3.6238908767700195, + "learning_rate": 9.722080830703451e-08, + "loss": 0.5142, + "step": 866200 + }, + { + "epoch": 19.95026250345399, + "grad_norm": 2.9918384552001953, + "learning_rate": 8.994924448451885e-08, + "loss": 0.518, + "step": 866400 + }, + { + "epoch": 19.954867827208254, + "grad_norm": 3.4772942066192627, + "learning_rate": 8.267768066200316e-08, + "loss": 0.5172, + "step": 866600 + }, + { + "epoch": 19.959473150962513, + "grad_norm": 3.1812825202941895, + "learning_rate": 7.54061168394875e-08, + "loss": 0.5074, + "step": 866800 + }, + { + "epoch": 19.96407847471677, + "grad_norm": 3.3182106018066406, + "learning_rate": 6.813455301697183e-08, + "loss": 0.5054, + "step": 867000 + }, + { + "epoch": 19.968683798471034, + "grad_norm": 3.577573299407959, + "learning_rate": 6.086298919445617e-08, + "loss": 0.5108, + "step": 867200 + }, + { + "epoch": 19.973289122225292, + "grad_norm": 3.277371644973755, + "learning_rate": 5.359142537194049e-08, + "loss": 0.5053, + "step": 867400 + }, + { + "epoch": 19.97789444597955, + "grad_norm": 2.689061164855957, + "learning_rate": 4.631986154942482e-08, + "loss": 0.516, + "step": 867600 + }, + { + "epoch": 19.982499769733813, + "grad_norm": 2.8559155464172363, + "learning_rate": 3.904829772690915e-08, + "loss": 0.5057, + "step": 867800 + }, + { + "epoch": 19.987105093488072, + "grad_norm": 3.4477171897888184, + "learning_rate": 3.177673390439348e-08, + "loss": 0.5065, + "step": 868000 + }, + { + "epoch": 19.99171041724233, + "grad_norm": 3.181300163269043, + "learning_rate": 2.450517008187781e-08, + "loss": 0.5202, + "step": 868200 + }, + { + "epoch": 19.996315740996593, + "grad_norm": 2.9424209594726562, + "learning_rate": 1.723360625936214e-08, + "loss": 0.5106, + "step": 868400 + }, + { + "epoch": 20.0, + "eval_loss": 0.5036933422088623, + "eval_runtime": 162.2891, + "eval_samples_per_second": 174.756, + "eval_steps_per_second": 10.925, + "step": 868560 + } + ], + "logging_steps": 200, + "max_steps": 868560, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.661507840870656e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}